In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Move the working directory up one level so the relative 'output/...' path
# used when loading the CSV resolves. NOTE(review): this is relative to the
# current directory, so running the cell a second time in the same kernel
# moves up another level — restart the kernel before re-running.
%cd ..

C:\Users\Govert\Documents\VG_covers


In [3]:
# Read the feature table, using the first CSV column as the row index,
# and preview the first five rows.
features_path = 'output/full_features.csv'
df = pd.read_csv(features_path, index_col=0)
df.head(5)

Unnamed: 0,id,face,median_h,median_s,median_v,std_h,std_s,std_v,binary,has_cars
0,1,0,15.0,0.888889,18.0,108.112625,0.412796,50.919083,1,False
1,100,0,63.333336,0.073298,154.0,55.796837,0.187915,80.551109,0,False
2,1000,0,40.0,0.452632,51.0,94.552986,0.260581,60.660095,0,False
3,1001,0,85.816071,0.829787,195.0,79.904427,0.315882,62.971611,0,False
4,10017,1,210.270279,0.3267,172.0,104.401772,0.184765,66.292381,0,False


In [4]:
# Columns fed to the models (everything except the 'id' and the target).
feature_list = [
    'face',
    'median_h', 'median_s', 'median_v',
    'std_h', 'std_s', 'std_v',
    'has_cars',
]

# Design matrix and target vector; 'binary' is the label being predicted.
X = df.loc[:, feature_list]
y = df.loc[:, 'binary']

In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 1)

# Shallow, class-weighted random forest. random_state pins the bootstrap
# samples and the per-split feature draws; without it every re-run trains a
# different forest, so the metrics and feature importances are not reproducible.
clf = RandomForestClassifier(max_features = 3, n_estimators = 300,
                             max_depth = 5, class_weight = 'balanced', min_samples_leaf = 5,
                             random_state = 1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# target_names are applied to the class labels in sorted order.
# Assumes 'binary' is coded 0 = under 16, 1 = above 16 — TODO confirm.
print(classification_report(y_test, y_pred, target_names=['Under 16','Above 16']))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    Under 16       0.84      0.67      0.75       287
    Above 16       0.63      0.82      0.71       195

    accuracy                           0.73       482
   macro avg       0.73      0.74      0.73       482
weighted avg       0.75      0.73      0.73       482

0.7282157676348547


In [6]:
# Pair each input column with the importance reported by the fitted forest
# and show the table with the most important feature first.
imp_df = pd.DataFrame({
    'feature': feature_list,
    'feature_importance': clf.feature_importances_,
})
display(imp_df.sort_values('feature_importance', ascending = False))

Unnamed: 0,feature,feature_importance
3,median_v,0.325429
6,std_v,0.169288
1,median_h,0.128211
5,std_s,0.115906
2,median_s,0.109551
4,std_h,0.095087
7,has_cars,0.045145
0,face,0.011384


In [7]:
# Learn the scaling statistics from the training split only, so nothing from
# the test rows leaks into preprocessing. fit() returns the fitted estimator,
# so the result is bound directly instead of being discarded into `_`
# (assigning to `_` shadows IPython's last-output variable).
scaler = StandardScaler().fit(X_train)

# Logistic regression with default settings, trained on the standardised features.
logreg = LogisticRegression()
logreg.fit(scaler.transform(X_train), y_train)

# Evaluate on the held-out rows, scaled with the training-set statistics.
y_pred = logreg.predict(scaler.transform(X_test))
print(classification_report(y_test, y_pred, target_names=['Under 16','Above 16']))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    Under 16       0.70      0.83      0.76       287
    Above 16       0.66      0.48      0.56       195

    accuracy                           0.69       482
   macro avg       0.68      0.66      0.66       482
weighted avg       0.69      0.69      0.68       482

0.6908713692946058


In [8]:
# Tabulate the fitted logistic-regression coefficients per feature, largest
# first. The model was trained on standardised inputs, so each coefficient is
# expressed per standard deviation of its feature.
coef_df = pd.DataFrame({
    'feature': feature_list,
    'feature_coef': logreg.coef_[0],
})
display(coef_df.sort_values('feature_coef', ascending = False))

Unnamed: 0,feature,feature_coef
6,std_v,0.46323
0,face,0.172296
4,std_h,-0.183157
1,median_h,-0.187821
5,std_s,-0.232825
2,median_s,-0.266978
7,has_cars,-0.381182
3,median_v,-0.63828
