In [86]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

### Join the features extracted from the pictures with OpenFace to the gender/ethnicity label table provided by James, then run a trial training on the result.

In [95]:
# Load the OpenFace output: labels.csv is read as two columns ("drop", "id") and
# reps.csv as headerless feature columns. The two frames are glued side by side
# on the default row index, so both files are assumed to list images in the same order.
OF_labels = pd.read_csv(
    "./OF_featurelist/labels.csv",
    header=None,
    names=["drop", "id"],
)
OF_features = pd.read_csv("./OF_featurelist/reps.csv", header=None)
result = pd.concat([OF_labels, OF_features], axis=1)

# Turn each path in "id" into a number: keep the part after the last "/",
# cut its last four characters, then parse what is left as a numeric value.
result["id"] = (
    result["id"]
    .apply(lambda path: path.split("/")[-1][:-4])
    .apply(pd.to_numeric)
)

In [96]:
# Attach the gender/ethnicity table to the feature rows on "id". The right join
# keeps every feature row, including those without a matching label entry.
eth_df = pd.read_csv("../Images/GenderEthnicityResources/binned_id_gender_ethnicity.csv")
joined = pd.merge(left=eth_df, right=result, on="id", how="right")

In [97]:
# Report how many rows have no gender label, then discard incomplete rows.
# Note: the printed count looks at "gender" only, whereas dropna() removes every
# row that has a NaN in any column.
n_missing_gender = joined["gender"].isnull().sum()
print("dropping ", n_missing_gender, "NaNs")
joined = joined.dropna()

dropping  881 NaNs


### Do the same for the FERET dataset features and use them as the test set

In [98]:
# Same recipe for the FERET files: labels and feature rows are read separately
# and concatenated column-wise on the default row index.
OF_labels = pd.read_csv(
    "./FERET_Dataset/labels.csv",
    header=None,
    names=["drop", "id"],
)
OF_features = pd.read_csv("./FERET_Dataset/reps.csv", header=None)
result = pd.concat([OF_labels, OF_features], axis=1)

# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
feret_table = pd.read_pickle("./FERET_Dataset/gender_table")

# Reduce both path columns to the text after the last "/" so they can serve as the
# merge key: four trailing characters are cut from the OpenFace paths and five from
# the FERET table paths. Unlike the first dataset, the ids stay strings here.
result["id"] = result["id"].apply(lambda path: path.split("/")[-1][:-4])
feret_table["path"] = feret_table["path"].apply(lambda path: path.split("/")[-1][:-5])
feret_table = feret_table.rename(columns={"path": "id"})

In [99]:
# Right join on "id": every FERET feature row is kept and receives the columns
# of feret_table where an id matches.
joined2 = pd.merge(left=feret_table, right=result, on="id", how="right")

# Class balance of the FERET labels (displayed as the cell output).
joined2['race'].value_counts()

caucasian     5150
eastasian     1549
african        645
southasian     465
hispanic       447
other          117
Name: race, dtype: int64

### Combining the labels of the two tables, encoding the classes and shuffling

In [114]:
# Stack the FERET labels ("race") on top of the first dataset's labels ("ethnicity")
# so that both sets are encoded with one shared category -> integer-code mapping.
together = pd.concat([joined2["race"], joined["ethnicity"]], axis=0)
one = pd.DataFrame(together.value_counts()).reset_index()
together = together.astype('category').cat.codes
two = pd.DataFrame(together.value_counts()).reset_index()
# Class names and integer codes are matched through their shared counts column;
# this is only unambiguous while no two classes have the same number of samples.
print(pd.merge(one, two, how='right', on=0))

# Split the encoded labels back by position: the first len(joined2) entries are FERET.
n_feret = joined2.shape[0]
y2 = together.iloc[:n_feret]
X2 = joined2.loc[:, "drop":].drop(['drop'], axis=1)
y = together.iloc[n_feret:]
X = joined.loc[:, "drop":].drop(['drop'], axis=1)

# Fixed seed so the shuffles (and everything trained on them) are reproducible.
RANDOM_STATE = 0

# FERET is used purely as a test set: shuffle all of its rows and pick the labels
# by index label so rows and labels stay paired. (Asking train_test_split for a
# ~100% test share leaves the train side empty, which it rejects.)
X_test2 = X2.sample(frac=1, random_state=RANDOM_STATE)
y_test2 = y2.loc[X_test2.index]
# Empty train side, kept only so the names defined by this cell still exist.
X_train2 = X2.iloc[0:0]
y_train2 = y2.iloc[0:0]

# The first dataset is the training set. The frames are passed directly instead of
# routing X.index through .iloc: after dropna() the index labels of `joined` are not
# guaranteed to equal row positions, so label-as-position lookups can select the
# wrong rows or fail. test_size=1 holds out a single row, as the tiny fraction did.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1, random_state=RANDOM_STATE
)

# Sanity checks: nothing lost, features and labels still paired.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test2.index.equals(y_test2.index)

      index_x      0  index_y
0   caucasian  13567        1
1    hispanic   3474        3
2   eastasian   2070        2
3  southasian   1359        5
4     african    711        0
5       other    117        4


### Train random forest using randomized hyperparameter search

In [79]:
# Base estimator handed to the hyper-parameter search; the fixed random_state makes
# the fitted forests (and therefore the reported scores) reproducible across runs.
clf = RandomForestClassifier(n_estimators=40, random_state=0)
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (e.g. the ``cv_results_`` attribute of a fitted search object).
    n_top : int, default 3
        Highest rank to report; ranks 1..n_top are printed in ascending order.
        Every candidate sharing a rank is printed, so ties produce several entries.

    Returns
    -------
    None. The summary is written to standard output.
    """
    ranks = results['rank_test_score']
    for offset in range(n_top):
        rank = offset + 1
        # Positions of all candidates holding this rank (may be none, or several on ties).
        for position in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][position]
            std_score = results['std_test_score'][position]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][position]))
            print("")
            
# Search space for the randomized search: list entries are candidate values to pick
# from, sp_randint(low, high) draws integers from low up to (but excluding) high.
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
# random_state pins which parameter combinations get sampled, so the ranking
# printed below can be reproduced on a re-run.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=0)

# Time the full search (all candidates, all CV folds).
start = time()
random_search.fit(X_train, y_train)
elapsed = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed, n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 398.45 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.659 (std: 0.003)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 9, 'min_samples_split': 7}

Model with rank: 2
Mean validation score: 0.659 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 5, 'min_samples_split': 6}

Model with rank: 3
Mean validation score: 0.658 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}



In [88]:
# Evaluate the fitted search object on the FERET features.
predictions = random_search.predict(X_test2)

# Confusion matrix of true (y_test2) against predicted class codes.
feret_confusion = confusion_matrix(y_test2, predictions)
print(feret_confusion)

# Macro-averaged F1 over the classes.
feret_macro_f1 = f1_score(y_test2, predictions, average='macro')
print(feret_macro_f1)


[[   0  642    1    1    1    0]
 [   0 5141    2    6    1    0]
 [   0  969  543   12   25    0]
 [   0  416    2   22    7    0]
 [   0  104   10    3    0    0]
 [   0  423    0    6   36    0]]
0.234070511585


  'precision', 'predicted', average, warn_for)


In [4]:
from sklearn.metrics.cluster import v_measure_score

# Small check of v_measure_score on two labelings that group the four samples
# identically but use different label values.
labels_true = [0, 0, 1, 1]
labels_pred = [5, 5, 2, 2]
print("%.6f" % v_measure_score(labels_true, labels_pred))

1.000000
