Running Random Forest and SVM on Titanic Data Set

In [2]:
import pandas as pd
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import sklearn.metrics as metrics

In [3]:
columns = ["PassengerID", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
titanic=pd.read_csv("train.csv", names=columns, sep=",", skiprows=[0],encoding="utf-8-sig")
#df = pd.DataFrame(titanic, columns=columns)
y=titanic["Survived"]
#df.drop('Survived', axis=1)

In [4]:
titanic['AgeNan']=titanic['Age']
titanic['AgeNan']=np.where(titanic['AgeNan'].between(0,81),0,titanic['AgeNan'])
titanic['AgeNan']=titanic['AgeNan'].fillna(1)

In [5]:
titanic['Has_Cabin'] = titanic["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [6]:
titanic['Sex'] = titanic['Sex'].map({'male':0, 'female':1})
titanic['Embarked']=titanic['Embarked'].fillna('S')
titanic['Embarked']=titanic['Embarked'].map({'Q':0, 'S':1, 'C':2})
titanic['Age']=titanic['Age'].fillna(29.69)

In [7]:
titanic['Parent']=np.nan
titanic['Parent']=np.where(titanic['Parch'].between(0,1),1,titanic['Parent'])
titanic['Parent']=np.where(titanic['Parch'].between(2,6),0,titanic['Parent'])

In [8]:
titanic['Child']=np.nan
titanic['Child']=np.where(titanic['Age'].between(0,17),1,titanic['Child'])
titanic['Child']=np.where(titanic['Age'].between(18,100),0,titanic['Child'])
        

In [9]:
titanic['FamilySize']=np.nan
titanic['FamilySize']= titanic['SibSp'] + titanic['Parch'] + 1
titanic['IsAlone']=np.nan
titanic['IsAlone']= 0
titanic.loc[titanic['FamilySize'] == 1, 'IsAlone'] = 1

In [10]:
titanic['Name']=titanic['Name'].replace(['.+?(?=Mrs)','.+?(?=Mr.)','Miss.','Master.','Rev.','Capt','Col','Don','Jonkheer','Major','Sir','Countess','Mme','Lady','Mlle','Ms','Dr'],[0,1,2,3,4,5,5,5,5,2,2,0,0,0,3,3,5], regex=True)

In [11]:
pd.Categorical(titanic['Sex'],categories=[0,1])
pd.Categorical(titanic['Embarked'],categories=[0,1,2])
pd.Categorical(titanic['Name'],categories=[0,1,2,3,4,5])
pd.Categorical(titanic['AgeNan'],categories=[0,1])
titanic['Parch'] = pd.Categorical(titanic.Parch).codes
titanic['SibSp'] = pd.Categorical(titanic.SibSp).codes
titanic['Parent'] = pd.Categorical(titanic.Parent).codes
titanic['Child'] = pd.Categorical(titanic.Child).codes
titanic['Fare']=pd.Categorical(titanic.Fare).codes
titanic['Has_Cabin']=pd.Categorical(titanic.Has_Cabin).codes


In [12]:
titanic.head(100)

Unnamed: 0,PassengerID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeNan,Has_Cabin,Parent,Child,FamilySize,IsAlone
0,1,0,3,1,0,22.00,1,0,A/5 21171,18,,1,0.0,0,1,0,2,0
1,2,1,1,0,1,38.00,1,0,PC 17599,207,C85,2,0.0,1,1,0,2,0
2,3,1,3,2,1,26.00,0,0,STON/O2. 3101282,41,,1,0.0,0,1,0,1,1
3,4,1,1,0,1,35.00,1,0,113803,189,C123,1,0.0,1,1,0,2,0
4,5,0,3,1,0,35.00,0,0,373450,43,,1,0.0,0,1,0,1,1
5,6,0,3,1,0,29.69,0,0,330877,51,,0,1.0,0,1,0,1,1
6,7,0,1,1,0,54.00,0,0,17463,186,E46,1,0.0,1,1,0,1,1
7,8,0,3,3,0,2.00,3,1,349909,124,,1,0.0,0,1,1,5,0
8,9,1,3,0,1,27.00,0,2,347742,74,,1,0.0,0,0,0,3,0
9,10,1,2,0,1,14.00,1,0,237736,154,,2,0.0,0,1,1,2,0


In [13]:
feature_list = ['Pclass','Sex','Embarked','Fare','Parch','SibSp','Child','Has_Cabin','Name','FamilySize']
X = titanic[feature_list]
Y = titanic['Survived']

In [24]:
[dftr,dftst] = train_test_split(titanic,test_size=0.25)

In [25]:
Xtr = dftr[feature_list]
Xtst = dftst[feature_list]
Ytr = dftr['Survived']
Ytst = dftst['Survived']

In [26]:
from sklearn.ensemble import RandomForestClassifier
RFmodel = RandomForestClassifier(n_estimators= 500,max_features=6)

In [27]:
RFmodel.fit(Xtr,Ytr)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
RFmodel.feature_importances_

array([0.05703753, 0.26779867, 0.03073546, 0.31342533, 0.0277325 ,
       0.03612607, 0.02080622, 0.04259225, 0.13767515, 0.06607081])

In [29]:
RF_prediction = RFmodel.predict(Xtst)
metrics.accuracy_score(Ytst,RF_prediction)

0.8295964125560538

In [30]:
from sklearn.model_selection import cross_val_score

In [31]:
scores = cross_val_score(RFmodel, X, Y, cv=100,scoring='accuracy')

In [32]:
scores

array([0.9       , 0.9       , 0.8       , 0.5       , 0.7       ,
       0.8       , 0.9       , 0.9       , 0.7       , 0.7       ,
       0.7       , 0.7       , 0.9       , 0.8       , 0.9       ,
       0.7       , 0.9       , 0.8       , 0.7       , 0.5       ,
       0.8       , 0.7       , 0.8       , 0.8       , 0.9       ,
       0.6       , 0.8       , 0.6       , 0.8       , 0.7       ,
       1.        , 0.8       , 0.8       , 1.        , 0.9       ,
       1.        , 0.9       , 0.9       , 0.8       , 0.9       ,
       0.8       , 0.8       , 0.77777778, 0.88888889, 0.88888889,
       0.66666667, 0.77777778, 0.77777778, 1.        , 0.875     ,
       0.875     , 0.625     , 0.875     , 1.        , 1.        ,
       0.875     , 0.875     , 0.625     , 0.75      , 0.625     ,
       0.875     , 1.        , 0.75      , 0.875     , 0.875     ,
       0.625     , 1.        , 0.875     , 0.875     , 0.875     ,
       0.875     , 0.875     , 0.625     , 0.5       , 0.75   

In [33]:
scores.mean()

0.8190277777777779

In [78]:
#clf=svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
#    max_iter=-1, probability=False, random_state=None, shrinking=True,
#    tol=0.001, verbose=False)
#clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
#examples=X_train.shape[0]
#Divide the sample into two: ones with positive classification, one with null classification
#pos = np.array([X_train[i] for i in range(examples) if y_train[i] == 1])
#neg = np.array([X_train[i] for i in range(examples) if y_train[i] == 0])

In [42]:
columns = ["PassengerID", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
test=pd.read_csv("test.csv", names=columns, sep=",", skiprows=[0],encoding="utf-8-sig")

In [43]:
test['AgeNan']=test['Age']
test['AgeNan']=np.where(test['AgeNan'].between(0,81),0,test['AgeNan'])
test['AgeNan']=test['AgeNan'].fillna(1)

In [44]:
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [45]:
test['Sex'] = test['Sex'].map({'male':0, 'female':1})
test['Embarked']=test['Embarked'].fillna('S')
test['Embarked']=test['Embarked'].map({'Q':0, 'S':1, 'C':2})
test['Age']=test['Age'].fillna(29.69)

In [46]:
test['Parent']=np.nan
test['Parent']=np.where(test['Parch'].between(0,1),1,test['Parent'])
test['Parent']=np.where(test['Parch'].between(2,6),0,test['Parent'])

In [47]:
test['Child']=np.nan
test['Child']=np.where(test['Age'].between(0,17),1,test['Child'])
test['Child']=np.where(test['Age'].between(18,100),0,test['Child'])
        

In [48]:
test.head()

Unnamed: 0,PassengerID,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeNan,Has_Cabin,Parent,Child
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,0,0.0,0,1.0,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,1,0.0,0,1.0,0.0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,0,0.0,0,1.0,0.0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1,0.0,0,1.0,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1,0.0,0,1.0,0.0


In [49]:
test['FamilySize']=np.nan
test['FamilySize']= test['SibSp'] + test['Parch'] + 1
test['IsAlone']=np.nan
test['IsAlone']= 0
test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [50]:
test['Name']=test['Name'].replace(['.+?(?=Mrs)','.+?(?=Mr.)','Miss.','Master.','Rev.','Capt','Col','Don','Jonkheer','Major','Sir','Countess','Mme','Lady','Mlle','Ms','Dr'],[0,1,2,3,4,5,5,5,5,2,2,0,0,0,3,3,5], regex=True)

In [51]:
pd.Categorical(test['Sex'],categories=[0,1])
pd.Categorical(test['Embarked'],categories=[0,1,2])
pd.Categorical(test['Name'],categories=[0,1,2,3,4,5])
pd.Categorical(test['AgeNan'],categories=[0,1])
test['Parch'] = pd.Categorical(test.Parch).codes
test['SibSp'] = pd.Categorical(test.SibSp).codes
test['Parent'] = pd.Categorical(test.Parent).codes
test['Child'] = pd.Categorical(test.Child).codes
test['Fare']=pd.Categorical(test.Fare).codes
test['Has_Cabin']=pd.Categorical(test.Has_Cabin).codes


In [52]:
feature_list = ['Pclass','Sex','Embarked','Fare','Parch','SibSp','Child','Has_Cabin','Name','FamilySize']
X = test[feature_list]

In [60]:
RF_prediction = RFmodel.predict(X)

In [61]:
RF_prediction

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [66]:
test['Survived']=np.nan
test['Survived']=RF_prediction

In [67]:
test.head()

Unnamed: 0,PassengerID,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeNan,Has_Cabin,Parent,Child,FamilySize,IsAlone,Survived
0,892,3,1,0,34.5,0,0,330911,24,,0,0.0,0,1,0,1,1,0
1,893,3,0,1,47.0,1,0,363272,5,,1,0.0,0,1,0,2,0,1
2,894,2,1,0,62.0,0,0,240276,41,,0,0.0,0,1,0,1,1,0
3,895,3,1,0,27.0,0,0,315154,34,,1,0.0,0,1,0,1,1,0
4,896,3,0,1,22.0,1,1,3101298,46,,1,0.0,0,1,0,3,0,0


In [65]:
RF_prediction.shape

(418,)

In [72]:
submit_list=['PassengerID','Survived']

In [73]:
final=test[submit_list]

In [74]:
final

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [78]:
final.to_csv('out.csv',index=False)