In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from matplotlib import pyplot
import matplotlib as mpl
import re
from collections import Counter
from nltk.tag import pos_tag
import nltk

%matplotlib inline

In [4]:
def load_data(): 
    
    df_train = pd.read_csv('train.csv')
    df_test = pd.read_csv('test.csv')
    df_test['Survived'] = 3
    
    df = pd.concat([df_train,df_test], axis=0)
    df = df.reset_index()

    return df

df = load_data()
#df_y_test = pd.read_csv('gender_submission.csv')

In [5]:
col_names = [ 'PassengerId', 'Pclass','Name','Sex','Age', 'SibSp', 'Parch', 'Ticket',  'Fare', 'Cabin', 'Embarked' ]
embarked_list = df.groupby('Embarked')['Embarked'].count().index.tolist()

In [7]:
len(df[(pd.isnull(df['Age']))&(df['Survived']==3)])

86

In [12]:

def clean_age_fare(df):
    
#    dfp = df.pivot_table('Fare', index=['Pclass', 'Sex'] , aggfunc='mean')
    
    dfp = df.pivot_table(['Fare', 'Age'], index=['Pclass', 'Sex'] , aggfunc='mean')
#    print (dfp)
    
#    sub_class = [ x for x in dfp.index.get_level_values('Pclass')]
#    sub_sex = [ x for x in dfp.index.get_level_values('Sex')]

#    print (dfp['Age'].loc[(dfp.index.get_level_values('Pclass')==1) & (dfp.index.get_level_values('Sex')=='male')].values)

    for i in range(len(df['Fare'])):
        
        if pd.isnull(df['Fare'].iloc[i]):
            df['Fare'].iloc[i] = float(dfp['Fare'].loc[(dfp.index.get_level_values('Pclass')==df['Pclass'].iloc[i]) & (dfp.index.get_level_values('Sex')==df['Sex'].iloc[i])].values)
        if pd.isnull(df['Age'].iloc[i]):
            df['Age'].iloc[i] = float(dfp['Age'].loc[(dfp.index.get_level_values('Pclass')==df['Pclass'].iloc[i]) & (dfp.index.get_level_values('Sex')==df['Sex'].iloc[i])].values)
            
    
    return df


In [17]:

def clean_data(df):
    
    ## Clean Data 
    ## 'Age', 'Fare', 'Cabin', 'Embarked' 
    
    ## df['Age'] fill NaN to mean
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    ## df['Cabin'] fill NaN to 'Unknown'
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    tmp_str = df['Cabin'].str.findall(r'[a-zA-Z]+').tolist()
    df['Cabin_deck']=[x[0] for x in tmp_str]
    tmp_str = df['Cabin'].str.findall(r'[0-9]+')
    df['Cabin_num'] = [x[0] if len(x) >0 else 0 for x in tmp_str]
    df['Cabin_count'] = [len(x) for x in tmp_str]
    
    ## df['Embarked'] fill NaN to 'C' <= Mode of class 1 & female 
    embarked_fill = df[(df['Sex']=='female')&(df['Pclass']==1)]['Embarked'].mode()[0]
    df['Embarked']=df['Embarked'].fillna(embarked_fill)
    
    ## df['Fare'] == 0 to NaN 
    df['Fare'] = df['Fare'].map(lambda x: np.nan if x==0 else x)
    
    df = clean_age_fare(df)
#    df_pivot_fare_fill = df.pivot_table('Fare', index=['Pclass', 'Sex'] , aggfunc='mean')
#    df['Fare'] = df[['Fare', 'Pclass','Sex']].apply \
#    (lambda x: df_pivot_fare_fill[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )
    
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
    
   
    
    def substrings_in_string(big_string, substrings):
        import string
        for substring in substrings:
    #        if string.find(big_string, substring) != -1:
            if big_string.find(substring) != -1:
                return substring
        print (big_string)
        return np.nan
    
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))

    def replace_titles(x):
        title=x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title =='Dr':
            if x['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title
    
    df['Title']=df.apply(replace_titles, axis=1)
    
    return df

df=clean_data(df)

In [73]:
def add_feature(df):
    df['Family_Size']=df['SibSp']+df['Parch']
    
    return df
df = add_feature(df)

In [18]:
def change_categ_to_numeric(df):
    
    df["Embarked"] = df["Embarked"].astype('category')
    df["Cabin_deck"] = df["Cabin_deck"].astype('category')
    df["Title"] = df["Title"].astype('category')
    df["Sex"] = df["Sex"].astype('category')

    
    df['nEmbark'] = df["Embarked"].cat.codes    
    df['nCabin_deck'] = df["Cabin_deck"].cat.codes
    df['nTitle'] = df["Title"].cat.codes
    df['nSex'] = df["Sex"].cat.codes
    
    return (df)
    
df =change_categ_to_numeric(df)

In [88]:
def get_ml_data(df):
    from sklearn.model_selection import train_test_split
    
    train_col_name = [ 'Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'nCabin_deck', 'nTitle', 'nSex', 'Family_Size', 'nEmbark'] #, 'Cabin_count' ]
    label_col_name = ['Survived']

    X = df[train_col_name][(df['Survived']<3)]
    y = (np.array(df[label_col_name][(df['Survived']<3)]))
    y = y.reshape(len(y),)
    X_submit = df[train_col_name][(df['Survived']==3)]
    
    
    
    X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size = 0.2, stratify=y,random_state=0)
    
    print ("Train data len : %.0f"%len(X_train),"Valid data len : %.0f"%len(X_valid))
    
    return X_train, y_train, X_valid, y_valid, X_submit


In [89]:

def decision_tree(df):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz
    
    X_train, y_train, X_valid, y_valid, X_submit = get_ml_data(df)

    depth = [2,3,4,5,6]
    
    print ("#### Decision Tree ####")
    
    for d in depth:
        clf = DecisionTreeClassifier(max_depth=d, random_state=0)
        clf.fit(X_train, y_train)
        print ("------ Max Depth : %.0f"%d)
        print ("Train Score : %.3f"%(100*clf.score(X_train, y_train))+"%")
        print ("Valid Score : %.3f"%(100*clf.score(X_valid, y_valid))+"%")
        print ("Feature Importance", clf.feature_importances_)

decision_tree(df)

Train data len : 712 Valid data len : 179
#### Decision Tree ####
------ Max Depth : 2
Train Score : 79.073%
Valid Score : 77.095%
Feature Importance [ 0.          0.          0.          0.17777106  0.          0.09502039
  0.          0.72720855  0.          0.        ]
------ Max Depth : 3
Train Score : 82.303%
Valid Score : 80.447%
Feature Importance [ 0.03462056  0.          0.          0.15382936  0.          0.08222331
  0.04854284  0.6292702   0.05151373  0.        ]
------ Max Depth : 4
Train Score : 85.253%
Valid Score : 82.123%
Feature Importance [ 0.05742275  0.02636171  0.00541733  0.13424867  0.06830038  0.07175724
  0.0423639   0.5491714   0.04495663  0.        ]
------ Max Depth : 5
Train Score : 86.236%
Valid Score : 81.006%
Feature Importance [ 0.0533755   0.08102838  0.00788896  0.12478659  0.06348644  0.07466794
  0.03937801  0.51046483  0.04492336  0.        ]
------ Max Depth : 6
Train Score : 87.781%
Valid Score : 77.654%
Feature Importance [ 0.08475644  0.092399

In [90]:
def SVM_Kernel_classification(df):
    from sklearn.svm import SVC
    from sklearn import preprocessing

    X_train, y_train, X_valid, y_valid, X_submit = get_ml_data(df)
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train) 
    X_valid_scaled = scaler.transform(X_valid)
    
    C = [0.1, 1, 10, 100,1000]
    G = [0.1, 1, 10]
    
    print ("#### SVM kernel Classifier ####")
    
    for c in C :
        g = 0.06
        clf = SVC(kernel='rbf', C=c, gamma=g).fit(X_train_scaled, y_train)

        print ("--- C Value is : %.3f"%c, 'Gamma value is : %.3f'%g)
        print ("Train Score : %.3f"%(100*clf.score(X_train_scaled, y_train))+"%")
        print ("Valid Score : %.3f"%(100*clf.score(X_valid_scaled, y_valid))+"%")

SVM_Kernel_classification(df)

Train data len : 712 Valid data len : 179
#### SVM kernel Classifier ####
--- C Value is : 0.100 Gamma value is : 0.060
Train Score : 83.708%
Valid Score : 81.006%
--- C Value is : 1.000 Gamma value is : 0.060
Train Score : 84.410%
Valid Score : 81.006%
--- C Value is : 10.000 Gamma value is : 0.060
Train Score : 86.376%
Valid Score : 79.888%
--- C Value is : 100.000 Gamma value is : 0.060
Train Score : 90.449%
Valid Score : 77.654%
--- C Value is : 1000.000 Gamma value is : 0.060
Train Score : 91.433%
Valid Score : 73.184%


In [75]:
def MLP_classification(df):
    from sklearn.neural_network import MLPClassifier
    from sklearn import preprocessing

    X_train, y_train, X_valid, y_valid, X_submit = get_ml_data(df)
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train) 
    X_valid_scaled = scaler.transform(X_valid)
    
    C = [0.1, 1, 10, 100]
    G = [0.1, 1, 10]
    layer = [[5], [10], [20,10], [ 30, 10],[100, 20], [100,50], [50,30,20],  [30,20,10], [ 30, 20, 10, 5]]
    
    print ("#### MLP Classifier ####")
    
    for l in layer : 
        clf = MLPClassifier(solver='lbfgs', activation = 'relu', random_state=0, hidden_layer_sizes=l).fit(X_train_scaled, y_train)
        print ("----- Layer : ", l)
        print ("Train Score : %.3f"%(100*clf.score(X_train_scaled, y_train))+"%")
        print ("Valid Score : %.3f"%(100*clf.score(X_valid_scaled, y_valid))+"%")

MLP_classification(df)

Train data len : 801 Valid data len : 90
#### MLP Classifier ####
----- Layer :  [5]
Train Score : 86.517%
Valid Score : 80.000%
----- Layer :  [10]
Train Score : 88.514%
Valid Score : 76.667%
----- Layer :  [20, 10]
Train Score : 92.509%
Valid Score : 83.333%
----- Layer :  [30, 10]
Train Score : 94.382%
Valid Score : 73.333%
----- Layer :  [100, 20]
Train Score : 93.883%
Valid Score : 75.556%
----- Layer :  [100, 50]
Train Score : 94.007%
Valid Score : 73.333%
----- Layer :  [50, 30, 20]
Train Score : 94.257%
Valid Score : 73.333%
----- Layer :  [30, 20, 10]
Train Score : 92.634%
Valid Score : 77.778%
----- Layer :  [30, 20, 10, 5]
Train Score : 90.762%
Valid Score : 72.222%


In [87]:
def RadndomForest_classification(df):
    from sklearn.ensemble import RandomForestClassifier

    X_train, y_train, X_valid, y_valid, X_submit = get_ml_data(df)
    
    est = [20, 50, 100]
    
    print ("#### Random Forest Classifier ####")
    
    for c in est :
        clf = RandomForestClassifier(n_estimators=c).fit(X_train, y_train)

        print ("--- Est Value is : %.3f"%c)
        print ("Train Score : %.3f"%(100*clf.score(X_train, y_train))+"%")
        print ("Valid Score : %.3f"%(100*clf.score(X_valid, y_valid))+"%")
        print ("Feature Importance : ", clf.feature_importances_)

    
#    print (X_train.head())
    X_train['Predict'] = (clf.predict(X_train))
    X_valid['Predict'] = (clf.predict(X_valid))
    
    X_train['Survived'] = y_train
    X_valid['Survived'] = y_valid
#    submit_predic = ([3]*len(X_submit))
    
#    predic = sum([train_predic,valid_predic,submit_predic],[]) 
    
#    df['Predict'] = predic
    
#    print (df.index[(df['Survived']!=df['Predict'])])
#    print (df.head())
    
    df_test = pd.read_csv('test.csv')
    df_test['Predict'] = clf.predict(X_submit)
    
    df_submit = df_test[['PassengerId','Predict']]
    
    print (df_submit.head())
    
    df_submit.to_csv('submition.csv')

    return X_train, X_valid
    
tr, val = RadndomForest_classification(df)


Train data len : 846 Valid data len : 45
#### Random Forest Classifier ####
--- Est Value is : 20.000
Train Score : 97.872%
Valid Score : 82.222%
Feature Importance :  [ 0.21040111  0.20004519  0.02419915  0.05767936  0.03527779  0.06658408
  0.12682175  0.19075063  0.06024896  0.02799198]
--- Est Value is : 50.000
Train Score : 98.700%
Valid Score : 75.556%
Feature Importance :  [ 0.21884333  0.20801535  0.01975119  0.07237376  0.02749271  0.06363446
  0.10215036  0.19857877  0.05486352  0.03429654]
--- Est Value is : 100.000
Train Score : 98.818%
Valid Score : 77.778%
Feature Importance :  [ 0.21506668  0.21152438  0.01887089  0.06287896  0.03350689  0.06767894
  0.10904229  0.19530512  0.05309948  0.03302639]
   PassengerId  Predict
0          892        0
1          893        0
2          894        0
3          895        1
4          896        0


In [77]:
tr.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,nCabin_deck,nTitle,nSex,Family_Size,nEmbark,Predict,Survived
890,32.0,7.75,0,3,0,8,2,1,0,1,0,0
473,23.0,13.7917,0,2,0,3,3,0,0,0,1,1
836,21.0,8.6625,0,3,0,8,2,1,0,2,0,0
329,16.0,57.9792,1,1,0,1,1,0,1,0,1,1
737,35.0,512.3292,0,1,0,1,2,1,0,0,1,1


In [81]:
#tr.index[(tr['Survived']!=tr['Predict'])]
tr[(tr['Survived']!=tr['Predict'])]

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,nCabin_deck,nTitle,nSex,Family_Size,nEmbark,Predict,Survived
400,39.0,7.925,0,3,0,8,2,1,0,2,0,1
579,32.0,7.925,0,3,0,8,2,1,0,2,0,1
107,25.962264,7.775,0,3,0,8,2,1,0,2,0,1
226,19.0,10.5,0,2,0,8,2,1,0,2,0,1
220,16.0,8.05,0,3,0,8,2,1,0,2,0,1
442,25.0,7.775,0,3,1,8,2,1,1,2,1,0
36,25.962264,7.2292,0,3,0,8,2,1,0,0,0,1
288,42.0,13.0,0,2,0,8,2,1,0,2,0,1
828,25.962264,7.75,0,3,0,8,2,1,0,1,0,1
264,22.185329,7.75,0,3,0,8,1,0,0,1,1,0


In [64]:
df.iloc[264]

index                         264
Age                       22.1853
Cabin                     Unknown
Embarked                        Q
Fare                         7.75
Name           Henry, Miss. Delia
Parch                           0
PassengerId                   265
Pclass                          3
Sex                        female
SibSp                           0
Survived                        0
Ticket                     382649
Cabin_deck                Unknown
Cabin_num                       0
Cabin_count                     0
Title                        Miss
nEmbark                         1
nCabin_deck                     8
nTitle                          1
nSex                            0
Predict                         1
Name: 264, dtype: object

In [82]:
val[(val['Survived']!=val['Predict'])]

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,nCabin_deck,nTitle,nSex,Family_Size,nEmbark,Predict,Survived
593,22.185329,7.75,2,3,0,8,1,0,2,1,1,0
328,31.0,20.525,1,3,1,8,3,0,2,2,0,1
114,17.0,14.4583,0,3,0,8,1,0,0,0,1,0
174,56.0,30.6958,0,1,0,0,2,1,0,0,1,0
137,37.0,53.1,0,1,1,2,2,1,1,2,1,0
283,19.0,8.05,0,3,0,8,2,1,0,2,0,1
569,32.0,7.8542,0,3,0,8,2,1,0,2,0,1
654,18.0,6.75,0,3,0,8,1,0,0,1,1,0
434,50.0,55.9,0,1,1,4,2,1,1,2,1,0
752,33.0,9.5,0,3,0,8,2,1,0,2,1,0
