In [1]:
import numpy as np
import pandas as pd
import os

# read data
DATA_PATH = "../new_data/Train/"
image_data = pd.read_csv(os.path.join(DATA_PATH, "Image", "oxford.csv"))
profile_data = pd.read_csv(os.path.join(DATA_PATH, "Profile", "Profile.csv"))

### Prepare training data
1. Choose userIds whose labels are available in profile data
1. Filter out rows from oxford data with null values
1. Choose which face to train with for the given userId

In [2]:
filtered_images = image_data[image_data.userId.isin(profile_data.userid)]

In [3]:
image_data.userId.size, filtered_images.userId.size, profile_data.userid.size

(7915, 7915, 9500)

In [4]:
non_empty = filtered_images.dropna()

In [5]:
summary = non_empty.describe()

In [6]:
# iterate and find columns where min, max and mean are all same
irrelevant_columns = []
for c in summary.columns:
    col_stats = summary[c]
    if col_stats['min'] == col_stats['mean'] or col_stats['max'] == col_stats['mean']:
        irrelevant_columns.append(c)

In [7]:
# non_empty.drop(labels=irrelevant_columns, axis=1, inplace=True)

In [55]:
X = []
y = []
id_col = ['userId', 'faceID']
for row in profile_data.iterrows():
    row = row[1]
    faces = non_empty[non_empty.userId == row.userid]
    if faces.size == 0:
        continue
    # randomly choose the first row
    X.append(faces.iloc[0].drop(labels=id_col))
    y.append(row.gender)

In [51]:
faces.iloc[0].index

Index(['userId', 'faceID', 'faceRectangle_width', 'faceRectangle_height',
       'faceRectangle_left', 'faceRectangle_top', 'pupilLeft_x', 'pupilLeft_y',
       'pupilRight_x', 'pupilRight_y', 'noseTip_x', 'noseTip_y', 'mouthLeft_x',
       'mouthLeft_y', 'mouthRight_x', 'mouthRight_y', 'eyebrowLeftOuter_x',
       'eyebrowLeftOuter_y', 'eyebrowLeftInner_x', 'eyebrowLeftInner_y',
       'eyeLeftOuter_x', 'eyeLeftOuter_y', 'eyeLeftTop_x', 'eyeLeftTop_y',
       'eyeLeftBottom_x', 'eyeLeftBottom_y', 'eyeLeftInner_x',
       'eyeLeftInner_y', 'eyebrowRightInner_x', 'eyebrowRightInner_y',
       'eyebrowRightOuter_x', 'eyebrowRightOuter_y', 'eyeRightInner_x',
       'eyeRightInner_y', 'eyeRightTop_x', 'eyeRightTop_y', 'eyeRightBottom_x',
       'eyeRightBottom_y', 'eyeRightOuter_x', 'eyeRightOuter_y',
       'noseRootLeft_x', 'noseRootLeft_y', 'noseRootRight_x',
       'noseRootRight_y', 'noseLeftAlarTop_x', 'noseLeftAlarTop_y',
       'noseRightAlarTop_x', 'noseRightAlarTop_y', 'noseLeftA

In [52]:
 pd.Series([None]*len(image_data.columns), index=image_data.columns).index

Index(['userId', 'faceID', 'faceRectangle_width', 'faceRectangle_height',
       'faceRectangle_left', 'faceRectangle_top', 'pupilLeft_x', 'pupilLeft_y',
       'pupilRight_x', 'pupilRight_y', 'noseTip_x', 'noseTip_y', 'mouthLeft_x',
       'mouthLeft_y', 'mouthRight_x', 'mouthRight_y', 'eyebrowLeftOuter_x',
       'eyebrowLeftOuter_y', 'eyebrowLeftInner_x', 'eyebrowLeftInner_y',
       'eyeLeftOuter_x', 'eyeLeftOuter_y', 'eyeLeftTop_x', 'eyeLeftTop_y',
       'eyeLeftBottom_x', 'eyeLeftBottom_y', 'eyeLeftInner_x',
       'eyeLeftInner_y', 'eyebrowRightInner_x', 'eyebrowRightInner_y',
       'eyebrowRightOuter_x', 'eyebrowRightOuter_y', 'eyeRightInner_x',
       'eyeRightInner_y', 'eyeRightTop_x', 'eyeRightTop_y', 'eyeRightBottom_x',
       'eyeRightBottom_y', 'eyeRightOuter_x', 'eyeRightOuter_y',
       'noseRootLeft_x', 'noseRootLeft_y', 'noseRootRight_x',
       'noseRootRight_y', 'noseLeftAlarTop_x', 'noseLeftAlarTop_y',
       'noseRightAlarTop_x', 'noseRightAlarTop_y', 'noseLeftA

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [11]:
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

#### Standardize and Normalize data

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def preprocess(df, scaler_type, scalers=None):
    used_scalers = {}
    ret_df = df.copy(deep=True)
    for c in df.columns:
        if c in ['userId', 'faceID']:
            continue
        if scalers is None:
            c_scaler = scaler_type()
            used_scalers[c] = c_scaler
            func = c_scaler.fit_transform
        else:
            c_scaler = scalers[c]
            func = c_scaler.transform
        ret_df[c] = func(df[[c]].values.astype(float))
    if scalers is None:
        return ret_df, used_scalers
    return ret_df, scalers

In [13]:
def preprocess_splits(X_train, X_test):
    # standardize train
    X_std_train, standard_scalers = preprocess(X_train, StandardScaler)
    # normalize train
    X_norm_train, norm_scalers = preprocess(X_std_train, MinMaxScaler)

    # standardize test
    X_std_test, _ = preprocess(X_test, StandardScaler, standard_scalers)
    # normalize test
    X_norm_test, _ = preprocess(X_std_test, MinMaxScaler, norm_scalers)
    
    return X_norm_train, X_norm_test
# X_norm_train, X_norm_test = preprocess_splits(X_train, X_test)

### PCA

In [14]:
from sklearn.decomposition import PCA

In [15]:
# pca = PCA(n_components=10)
# pca_train_data = pca.fit_transform(X_norm_train)
# pca_test_data = pca.transform(X_norm_test)

### Linear methods

In [16]:
from sklearn.metrics import accuracy_score

def eval_model(model_cls, model_params, train_data, test_data):
    model = model_cls(**model_params)

    # fit model
    model.fit(*train_data)

    # evaluate train perf
    train_pred = model.predict(train_data[0])
    train_acc = accuracy_score(train_data[1], train_pred)

    # test performance
    test_pred = model.predict(test_data[0])
    test_acc = accuracy_score(test_data[1], test_pred)
    
    return {
        'model': model,
        'train_acc': train_acc,
        'test_acc': test_acc
    }

In [17]:
class ModeModel(object):
    def __init__(self):
        self.ans=1
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return len(X) *[self.ans]
    def fit_predict(self, X, y=None):
        return self.transform(X)

In [18]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA, LinearDiscriminantAnalysis as LDA
# from sklearn.svm import SVC
# from sklearn.decomposition import KernelPCA
# from sklearn.neural_network import MLPClassifier

# methods = [ModeModel, QDA, LogisticRegression, LDA, SVC, ]
# method_params = [{}, {},{'solver': 'newton-cg'}, {}, {'gamma': 'auto'}]

# # pca = KernelPCA(n_components=20, kernel='rbf')
# # pca_train_data = pca.fit_transform(X_norm_train)
# # pca_test_data = pca.transform(X_norm_test)
# train_data = (X_norm_train, y_train)
# test_data = (X_norm_test, y_test)
# for m, m_params in zip(methods, method_params):
#     ret = eval_model(m, m_params, train_data, test_data)
#     print('Method: {} Train Acc {} Test Acc {}'.format(m.__name__, ret['train_acc'], ret['test_acc']))
    

In [19]:
class EnsembleClass(object):
    def __init__(self, methods, weights, threshold):
        self.methods = methods
        self.weights = weights
        self.threshold = threshold
    def fit(self, X, y=None):
        for m in self.methods:
            m.fit(X, y)
    def predict(self, X):
        ret = np.zeros(shape=(X.shape[0],))
        for i in range(len(self.methods)):
            m = self.methods[i]
            res = m.predict(X)
            ret = ret + self.weights[i] * res
        ret[ret  < self.threshold] = 0
        ret[ret >= self.threshold] = 1
        return ret

In [58]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
methods = [RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, ModeModel]
method_params = [{"n_estimators": 10}, {"n_estimators": 10}, {},
#                  {'methods': [
#                      RandomForestClassifier(n_estimators=10),
#                      ExtraTreesClassifier(n_estimators=10),
#                      GradientBoostingClassifier(n_estimators=200)],
#                   'weights': [1,1,2],
#                   'threshold': 3
#                  }]
# ,
                {'estimators': [
                     ('a', RandomForestClassifier(n_estimators=50)),
                     ('b', ExtraTreesClassifier(n_estimators=50)),
                     ('c', GradientBoostingClassifier(n_estimators=100))], 'weights': (1,1,2),'voting': 'hard', 'n_jobs': 4}
                ,{}]
#                  ,{'hidden_layer_sizes': (32, 16, 8, 4, 2), 'solver': 'adam', 'activation': 'relu'}]
                
# pca = PCA(n_components=63)
# pca_train_data = pca.fit_transform(X_norm_train)
# pca_test_data = pca.transform(X_norm_test)
# train_data = (X_norm_train, y_train)
# test_data = (X_norm_test, y_test)
# for m, m_params in zip(methods[-1:], method_params[-1:]):
#     ret = eval_model(m, m_params, train_data, test_data)
#     print('Method: {} Train Acc {} Test Acc {}'.format(m.__name__, ret['train_acc'], ret['test_acc']))


### K-fold validation

In [59]:
from sklearn.model_selection import KFold, StratifiedKFold
X = np.array(X)
y = np.array(y)
kf = StratifiedKFold(n_splits=10)
method_stats = {}
for train_index, test_index in kf.split(X, y):
    X_train, X_test = pd.DataFrame(X[train_index]), pd.DataFrame(X[test_index])
    y_train, y_test = y[train_index], y[test_index]
    X_train, X_test = preprocess_splits(X_train, X_test)

    train_data = (X_train, y_train)
    test_data = (X_test, y_test)
    for m, m_params in zip(methods[-1:], method_params[-1:]):
        ret = eval_model(m, m_params, train_data, test_data)
        m_name = m.__name__
        if m_name not in method_stats:
            method_stats[m_name] = []
        method_stats[m_name].append([ret['train_acc'], ret['test_acc']])
        
for m_name, stats in method_stats.items():
    stats = np.array(stats)
    mean = np.mean(stats, axis=1)
    std = np.std(stats, axis=1)
    print('Method: {} Train Acc {}+={} Test Acc {}+={}'.format(m_name, mean[0], std[0], mean[1], std[1]))

Method: ModeModel Train Acc 0.5967043692881822+=4.233869012926217e-05 Test Acc 0.596454724158046+=0.0003544456065140423


In [28]:
x =pd.DataFrame([[None]*len(image_data.columns)], columns=image_data.columns)

In [41]:
y = np.zeros(shape=(len(image_data),))
y[image_data.notnull().all(axis=1)] = 1

In [43]:
y.shape

(7915,)