In [141]:
import numpy as np
import pandas as pd
import os

# Load the two raw tables: per-face image features (oxford.csv) and the
# user profiles (Profile.csv), both located under DATA_PATH.
DATA_PATH = "../new_data/Train/"
image_csv = os.path.join(DATA_PATH, "Image", "oxford.csv")
profile_csv = os.path.join(DATA_PATH, "Profile", "Profile.csv")
image_data = pd.read_csv(image_csv)
profile_data = pd.read_csv(profile_csv)

### Prepare training data
1. Choose userIds whose labels are available in profile data
1. Filter out rows from oxford data with null values
1. Choose which face to train with for each userId (the first face listed for that user is used)

In [142]:
# Keep only the image rows whose userId also appears in the profile table.
has_profile = image_data["userId"].isin(profile_data["userid"])
filtered_images = image_data[has_profile]

In [143]:
# Row counts: all image rows, image rows with a profile, and profile rows.
(
    image_data["userId"].size,
    filtered_images["userId"].size,
    profile_data["userid"].size,
)

(7915, 7915, 9500)

In [144]:
# Discard every row that has a missing value in any column.
non_empty = filtered_images.dropna(axis=0, how="any")

In [145]:
# Per-column descriptive statistics (count, mean, std, min, quartiles, max);
# with the default include/exclude only numeric columns are summarised.
summary = non_empty.describe(include=None, exclude=None)

In [146]:
# Collect the constant columns: a column carries no information when every
# value is identical, i.e. when its minimum equals its maximum.
# Comparing min with max directly (rather than each of them with the mean)
# avoids exact float equality against a computed average, which can differ
# from the constant by a rounding error and let a constant column slip by.
irrelevant_columns = [
    c for c in summary.columns
    if summary.loc['min', c] == summary.loc['max', c]
]

In [147]:
# Remove the columns listed in irrelevant_columns.
# Rebinding the name (instead of inplace=True) avoids mutating the frame in
# place, and errors="ignore" lets this cell be re-run after the columns are
# already gone without raising KeyError.
non_empty = non_empty.drop(columns=irrelevant_columns, errors="ignore")

In [188]:
# Build the training rows: one feature Series (X) and one gender label (y)
# per profile user that has at least one face row in non_empty.
X = []
y = []
id_col = ['userId', 'faceID']

# Index the first face of every user once, instead of filtering the whole
# frame again for each profile row.
first_faces = non_empty.drop_duplicates(subset='userId', keep='first')
face_position = {
    user_id: pos for pos, user_id in enumerate(first_faces['userId'])
}

for user_id, gender in zip(profile_data['userid'], profile_data['gender']):
    pos = face_position.get(user_id)
    if pos is None:
        # this user has no face row left after filtering; skip them
        continue
    # deterministic choice: the user's first face in file order,
    # with the identifier columns removed so only features remain
    X.append(first_faces.iloc[pos].drop(labels=id_col))
    y.append(gender)

In [149]:
from sklearn.model_selection import train_test_split

In [150]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [151]:
# Turn each split into a DataFrame so the per-column scaling can address
# features by column.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

#### Standardize and Normalize data

In [191]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def preprocess(df, scaler_type, scalers=None):
    """Scale every feature column of ``df`` with its own scaler object.

    The columns named 'userId' and 'faceID' are copied through untouched.

    Parameters
    ----------
    df : DataFrame to scale; it is not modified.
    scaler_type : zero-argument callable returning an object that offers
        ``fit_transform``. Only used when ``scalers`` is None.
    scalers : optional mapping from column name to an already fitted object
        offering ``transform``. When given, nothing is fitted.

    Returns
    -------
    (scaled_df, scalers_used) : a deep copy of ``df`` with the scaled columns,
        and either the dict of newly fitted scalers or the ``scalers``
        argument itself.
    """
    fitting = scalers is None
    fitted = {} if fitting else scalers
    scaled = df.copy(deep=True)

    feature_cols = [col for col in df.columns if col not in ('userId', 'faceID')]
    for col in feature_cols:
        # scalers receive a 2-D float array holding this single column
        values = df[[col]].values.astype(float)
        if fitting:
            fitted[col] = scaler_type()
            scaled[col] = fitted[col].fit_transform(values)
        else:
            scaled[col] = scalers[col].transform(values)

    return scaled, fitted

In [192]:
def preprocess_splits(X_train, X_test):
    """Standardize, then min-max normalize, a train/test pair.

    Each stage fits its scalers on the training frame only and re-uses those
    fitted scalers to transform the test frame.

    Returns the pair (scaled_train, scaled_test).
    """
    train_out, test_out = X_train, X_test
    for scaler_cls in (StandardScaler, MinMaxScaler):
        # fit on train, then apply the very same scalers to test
        train_out, fitted = preprocess(train_out, scaler_cls)
        test_out, _ = preprocess(test_out, scaler_cls, fitted)
    return train_out, test_out
# Scale the hold-out split produced by train_test_split.
X_norm_train, X_norm_test = preprocess_splits(X_train=X_train, X_test=X_test)

### PCA

In [157]:
from sklearn.decomposition import PCA

In [168]:
# Project the scaled features onto 10 principal components; the projection
# is fitted on the training split and only applied to the test split.
N_PCA_COMPONENTS = 10
pca = PCA(n_components=N_PCA_COMPONENTS)
pca_train_data = pca.fit_transform(X_norm_train)
pca_test_data = pca.transform(X_norm_test)

### Linear methods

In [174]:
from sklearn.metrics import accuracy_score

def eval_model(model_cls, model_params, train_data, test_data):
    """Fit one model and report its accuracy on the train and test data.

    Parameters
    ----------
    model_cls : class (or factory) of the model to build.
    model_params : dict of keyword arguments passed to ``model_cls``.
    train_data : sequence unpacked into ``model.fit``; element 0 holds the
        features and element 1 the true labels.
    test_data : sequence whose element 0 holds the features and element 1
        the true labels.

    Returns
    -------
    dict with the fitted 'model', its 'train_acc' and its 'test_acc'.
    """
    model = model_cls(**model_params)
    model.fit(*train_data)

    def _accuracy(split):
        # accuracy of the fitted model on a (features, labels) pair
        return accuracy_score(split[1], model.predict(split[0]))

    return {
        'model': model,
        'train_acc': _accuracy(train_data),
        'test_acc': _accuracy(test_data)
    }

In [209]:
class ModeModel(object):
    """Constant-prediction baseline exposing a fit/predict interface.

    Every sample is assigned the same label ``ans`` (1 by default). Nothing
    is learned from the training data: ``fit`` is a no-op, so ``ans`` must be
    chosen by the caller if a label other than 1 is wanted.
    """

    def __init__(self, ans=1):
        # label returned for every sample
        self.ans = ans

    def fit(self, X, y=None):
        """Accept training data for interface compatibility; does nothing."""
        pass

    def predict(self, X):
        """Return a list with one copy of ``ans`` per element of ``X``."""
        return len(X) * [self.ans]

    def fit_predict(self, X, y=None):
        """Fit (a no-op) and return the predictions for ``X``."""
        # previously called a nonexistent ``transform`` and raised AttributeError
        self.fit(X, y)
        return self.predict(X)

In [211]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA, LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
# Candidate classifiers, paired position-by-position with the keyword
# arguments used to construct each of them.
methods = [ModeModel, QDA, LogisticRegression, LDA, SVC]
method_params = [{}, {}, {'solver': 'newton-cg'}, {}, {'gamma': 'auto'}]

train_data = (X_norm_train, y_train)
test_data = (X_norm_test, y_test)

# Fit each candidate on the scaled training split and print both accuracies.
report_line = 'Method: {} Train Acc {} Test Acc {}'
for model_cls, kwargs in zip(methods, method_params):
    scores = eval_model(model_cls, kwargs, train_data, test_data)
    print(report_line.format(model_cls.__name__, scores['train_acc'], scores['test_acc']))
    

Method: ModeModel Train Acc 0.5996592844974447 Test Acc 0.5704323570432357
Method: QuadraticDiscriminantAnalysis Train Acc 0.8266997057457023 Test Acc 0.8117154811715481
Method: LogisticRegression Train Acc 0.8280935418925197 Test Acc 0.8019525801952581
Method: LinearDiscriminantAnalysis Train Acc 0.8287130246244386 Test Acc 0.810320781032078
Method: SVC Train Acc 0.8039337153476847 Test Acc 0.7712691771269177


### K-fold validation

In [None]:
from sklearn.model_selection import KFold
# 10-fold cross-validation of every method in `methods`.
X = np.array(X)
y = np.array(y)
kf = KFold(n_splits=10)
# method name -> list of [train_acc, test_acc], one entry per fold
method_stats = {}
for train_index, test_index in kf.split(X):
    X_train, X_test = pd.DataFrame(X[train_index]), pd.DataFrame(X[test_index])
    y_train, y_test = y[train_index], y[test_index]
    # scalers are re-fitted on this fold's training part only
    X_train, X_test = preprocess_splits(X_train, X_test)

    train_data = (X_train, y_train)
    test_data = (X_test, y_test)
    for m, m_params in zip(methods, method_params):
        ret = eval_model(m, m_params, train_data, test_data)
        method_stats.setdefault(m.__name__, []).append([ret['train_acc'], ret['test_acc']])
for m_name, stats in method_stats.items():
    # stats has shape (n_folds, 2); aggregate over the folds (axis=0) so that
    # index 0 is the train accuracy and index 1 the test accuracy.
    # axis=1 would instead average train and test within each single fold.
    stats = np.array(stats)
    mean = np.mean(stats, axis=0)
    std = np.std(stats, axis=0)
    print('Method: {} Train Acc {}+={} Test Acc {}+={}'.format(m_name, mean[0], std[0], mean[1], std[1]))