# Modelling

1. Logistic Regression
2. Linear Discriminant Analysis
3. Support Vector Machines (kernel)
4. Naive Bayes
5. K-Nearest Neighbors
6. Decision Tree
7. AdaBoost
8. Random Forest
9. XGBoost

In [61]:
import pandas as pd
#from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

import plotly.offline as py
import plotly.graph_objs as go

In [62]:
# Load the pre-split modelling data. Features (X_*) and targets (y_*) live in
# separate CSV files for each split.

# --- training split ---
X_train = pd.read_csv('data/modelling/train/X_train.csv')
y_train = pd.read_csv('data/modelling/train/y_train.csv')

# Resampled copies of the training data (presumably class-rebalanced, judging
# by the file names -- TODO confirm against the cell that wrote them).
downsampled_train = pd.read_csv('data/modelling/train/downsampled_train.csv')
upsampled_train = pd.read_csv('data/modelling/train/upsampled_train.csv')

# --- test split ---
X_test = pd.read_csv('data/modelling/test/X_test.csv')
y_test = pd.read_csv('data/modelling/test/y_test.csv')

# --- validation split ---
X_val = pd.read_csv('data/modelling/validation/X_val.csv')
y_val = pd.read_csv('data/modelling/validation/y_val.csv')

In [63]:
def GetBasedModel(random_state=None):
    """Build the baseline tree-based classifiers that are compared.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Forwarded unchanged to every estimator. All five estimators are
        randomised, so pass an int to make the baseline scores repeatable
        between runs. The default ``None`` matches each estimator's own
        default, i.e. the behaviour of calling this function with no
        arguments is unchanged.

    Returns
    -------
    list of (str, estimator) tuples
        Short label / unfitted estimator pairs, in the order
        CART, AB, GBM, RF, ET.
    """
    return [
        ('CART', DecisionTreeClassifier(random_state=random_state)),
        ('AB', AdaBoostClassifier(random_state=random_state)),
        ('GBM', GradientBoostingClassifier(random_state=random_state)),
        ('RF', RandomForestClassifier(random_state=random_state)),
        ('ET', ExtraTreesClassifier(random_state=random_state)),
    ]

def BasedLine2(X_train, y_train, models, num_folds=10, scoring='accuracy',
               random_state=12):
    """Cross-validate each model and print its mean score and spread.

    Parameters
    ----------
    X_train : array-like
        Feature matrix handed to ``cross_val_score``.
    y_train : array-like
        Target labels; also used to stratify the folds.
    models : iterable of (str, estimator) tuples
        Label / estimator pairs, e.g. the list built by ``GetBasedModel``.
    num_folds : int, default=10
        Number of stratified folds.
    scoring : str, default='accuracy'
        Scoring metric passed to ``cross_val_score``.
    random_state : int or None, default=12
        Seed for shuffling the data before it is split into folds.

    Returns
    -------
    names : list of str
        Model labels, in input order.
    results : list
        One entry per model: the per-fold scores returned by
        ``cross_val_score``.
    """
    # The splitter does not depend on the model, so build it once. With an
    # int seed every split() call regenerates the same folds, which means
    # all models are still scored on identical partitions.
    kfold = StratifiedKFold(n_splits=num_folds, random_state=random_state,
                            shuffle=True)

    names = []
    results = []
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold,
                                     scoring=scoring)
        names.append(name)
        results.append(cv_results)
        # One line per model: "<label>: <mean> (<std>)".
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    return names, results

class PlotBoxR(object):
    """Render one Plotly box plot per model from per-model score collections."""

    def __Trace(self, nameOfFeature, value):
        """Return a ``go.Box`` trace of ``value`` labelled ``nameOfFeature``."""
        # Every box uses the same teal marker colour.
        teal_marker = dict(color='rgb(0, 128, 128)')
        return go.Box(y=value, name=nameOfFeature, marker=teal_marker)

    def PlotResult(self, names, results):
        """Draw the boxes inline, one per name.

        ``results[i]`` supplies the values for the box labelled ``names[i]``.
        Nothing is returned; the figure is handed to ``py.iplot``.
        """
        traces = [
            self.__Trace(label, results[position])
            for position, label in enumerate(names)
        ]
        py.iplot(traces)

In [56]:

# Put features and target side by side so that dropping incomplete rows below
# removes the same rows from both.
train = pd.concat([X_train, y_train], axis=1)

# Treatment of categorical variables: swap the string categories for integer
# codes. As before, the substitution is applied to every column of the frame.
category_codes = {
    # sex -> binary
    'male': 0,
    'female': 1,
    # race
    'European/Caucasian-American': 0,
    'Latino/Hispanic American': 1,
    'Asian/Pacific Islander/Asian-American': 2,
    'Other': 3,
    'Black/African American': 4,
    '?': 5,  # presumably the dataset's "unknown" marker -- TODO confirm
}

# One pass instead of eight in-place scans. The explicit infer_objects() turns
# the now all-integer object columns into real integer columns; relying on
# replace() to do that silently is deprecated since pandas 2.2, and without it
# the encoded columns would be discarded by the numeric filter below.
train = train.replace(category_codes).infer_objects()

# Keep numeric (and boolean) columns only, then drop every row that still has a
# missing value. select_dtypes is the public counterpart of the private
# DataFrame._get_numeric_data() used previously.
train_1 = train.select_dtypes(include=['number', 'bool']).dropna()

# Feature columns used for modelling. The unusual spellings ('sinsere_o',
# 'ambitous_o', 'intellicence_important', 'ambtition_important') presumably
# mirror the CSV headers -- do not "correct" them without checking the data.
feature_columns = [
    'gender', 'age', 'age_o', 'race', 'race_o',
    'samerace', 'importance_same_race', 'importance_same_religion',
    'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
    'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
    'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o',
    'shared_interests_o', 'attractive_important', 'sincere_important',
    'intellicence_important', 'funny_important', 'ambtition_important',
    'shared_interests_important', 'attractive', 'sincere', 'intelligence',
    'funny', 'ambition', 'attractive_partner', 'sincere_partner',
    'intelligence_partner', 'funny_partner', 'ambition_partner',
    'shared_interests_partner', 'sports', 'tvsports', 'exercise', 'dining',
    'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv',
    'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga',
    'interests_correlate', 'expected_happy_with_sd_people',
    'expected_num_interested_in_me', 'expected_num_matches', 'like',
    'guess_prob_liked', 'met',
]

X_train_1 = train_1[feature_columns]
y_train_1 = train_1['match']

In [57]:
# Cross-validate every baseline model on the cleaned training data, then show
# the per-fold score distribution of each model as a box plot.
models = GetBasedModel()
names, results = BasedLine2(X_train_1, y_train_1, models)
PlotBoxR().PlotResult(names, results)

CART: 0.788410 (0.038595)
AB: 0.834167 (0.033169)
GBM: 0.865108 (0.025550)
RF: 0.852731 (0.020921)
ET: 0.855201 (0.030274)


In [58]:
def ScoreDataFrame(names, results):
    """Summarise per-model scores as a two-column table.

    Parameters
    ----------
    names : list
        Model labels, one per entry of ``results``.
    results : list
        Per-model score collections; each entry must provide ``.mean()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model`` (the labels) and ``Score`` (each entry's mean,
        rounded to four decimal places).
    """
    # Round by formatting to fixed-point text and parsing it back, so the
    # stored float is exactly the four-decimal figure.
    mean_scores = [
        float(format(fold_scores.mean(), '.4f')) for fold_scores in results
    ]
    return pd.DataFrame({'Model': names, 'Score': mean_scores})

In [59]:
# Tabulate the mean cross-validation score of each baseline model.
basedLineScore = ScoreDataFrame(names, results)
basedLineScore  # bare expression: the notebook renders the table as cell output

Unnamed: 0,Model,Score
0,CART,0.7884
1,AB,0.8342
2,GBM,0.8651
3,RF,0.8527
4,ET,0.8552


## 6. Decision Tree 

## 7. AdaBoost

## 8. Gradient Boosting

## 9. Random Forest

## 10. Extra Trees