# HW3 Class Competition

# Who survived the sinking of the Titanic?

The goal of this HW is to predict who survived the Titanic sinking in 1912.

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
df = pd.read_csv("Titanic_0.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [4]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Data set description

<ul>
<li><b>Survived</b>: binary attribute that indicates whether the passenger survived. This is the dependent variable that we will attempt to explain
<li><b>Pclass</b>: Ticket class (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
<li><b>Age</b>: Passenger age
<li><b>SibSp</b>: The amout of the passenger's siblings/spouses aboard the Titanic
<li><b>Parch</b>: The amout of the passenger's parents/children aboard the Titanic
<li><b>Fare</b>: The ticket fare
<li><b>Male</b>: binary attibute that indicates the gender (1=Male, 0=Female)
<li><b>Embarked_C</b>: binary attibute that indicates whether the passenger embarked in Cherbourg
<li><b>Embarked_Q</b>: binary attibute that indicates whether the passenger embarked in Queenstown
<li><b>Embarked_S</b>: binary attibute that indicates whether the passenger embarked in Southampton
</ul>

## Instruction

Cleaning the data set if necessary. 

Use everything you know to find a machine learning model to achieve the highest possible AUC score. Two testing sets have been reserved: TestA.csv and TestB.csv. Your homework will be evaluated using these two sets. 70% of the grade will be based on the AUC score on TestA.csv. 30% of the grade will be based on the ranking of the AUC score on TestB.csv among the groups. To be specific, your grade on TestA.csv will be equal to the final AUC score multiplied by 70, and your grade on TestB.csv will be equal to 2.5 * (number of groups - your ranking). You must submit the same model for both sets with clear explanation of your codes. You must include the codes to evaluate your model on TestA.csv and TestB.csv. Failure to do so will result in 20% loss of grades (10% for each test). 

TestB.csv is private, which means you will never see it. The ranking will be revealed only after the deadline. TestA.csv is semi-private. This means that you have at most one chance everyday for me to check your model performance on TestA.csv using your code, and I will let you know the AUC score and post your score on the discussion board. I will save your notebook file in the same folder with the data files. If your code does not work on my computer, you lose the opportunity on the same day. 

In [5]:
df = pd.read_csv("Titanic_0.csv")

## Data Preparation

In [6]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            141
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          549
Embarked         1
dtype: int64

### Age Column: add column "minor16" with 1 for  Age <16, 0 for Age >=16

In [7]:
# Fill Nan in Age column with the average
df.Age.fillna(value=df.Age.mean(), inplace=True)
df['minor16']=df['Age'].apply(lambda x: 1 if x <16 else 0)

### Fare Column: qcut into 5 bins

In [8]:
df['FareBin_5']=pd.qcut(df.Fare, 5)

### Add 'family_size' Column: add number of 'SibSp' and 'Parch' and 1 for self

In [9]:
df['family_size'] = df['SibSp']+ df['Parch'] + 1

## Ticket Column

In [10]:
deplicate_ticket = []
for tk in df.Ticket.unique():
    tem = df.loc[df.Ticket == tk, 'Fare']
    #print(tem.count())
    if tem.count() > 1:
        #print(df_data.loc[df_data.Ticket == tk,['Name','Ticket','Fare']])
        deplicate_ticket.append(df.loc[df.Ticket == tk,['Ticket','Fare','Cabin','family_size','Survived']])
deplicate_ticket = pd.concat(deplicate_ticket)

In [11]:
# the same ticket family or friends
df['Connected_Survival'] = 0.5 # default 
for _, df_grp in df.groupby('Ticket'):
    if (len(df_grp) > 1):
        for ind, row in df_grp.iterrows():
            smax = df_grp.drop(ind)['Survived'].max()
            smin = df_grp.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                df.loc[df['PassengerId'] == passID, 'Connected_Survival'] = 1
            elif (smin==0.0):
                df.loc[df['PassengerId'] == passID, 'Connected_Survival'] = 0
df.groupby('Connected_Survival')[['Survived']].mean().round(3)

Unnamed: 0_level_0,Survived
Connected_Survival,Unnamed: 1_level_1
0.0,0.206
0.5,0.331
1.0,0.715


In [12]:
# Replace Sex with 1 (Male), and 0 (female)
df['sex']=df['Sex'].apply(lambda x: 1.0 if x=='male' else 0.0)

In [13]:
# drop original Cabin column
df.drop(columns='Cabin', inplace=True)
df.drop(columns='Fare', inplace=True)
df.drop(columns='Age', inplace=True)
df.drop(columns='SibSp', inplace=True)
df.drop(columns='Parch', inplace=True)
df.drop(columns='Embarked', inplace=True)
df.drop(columns='Sex', inplace=True)
df.drop(columns='Ticket', inplace=True)
df.drop(columns='PassengerId', inplace=True)
df.drop(columns='family_size', inplace=True)
df.drop(columns='Name', inplace=True)

In [14]:
df = pd.get_dummies(df, columns=['FareBin_5'],dummy_na=True)

In [15]:
df.drop(columns=['FareBin_5_nan'], inplace=True)

In [16]:
df.head()

Unnamed: 0,Survived,Pclass,minor16,Connected_Survival,sex,"FareBin_5_(-0.001, 7.854]","FareBin_5_(7.854, 10.5]","FareBin_5_(10.5, 21.815]","FareBin_5_(21.815, 39.6]","FareBin_5_(39.6, 512.329]"
0,0,3,0,0.5,1.0,1,0,0,0,0
1,1,1,0,0.5,0.0,0,0,0,0,1
2,1,3,0,0.5,0.0,0,1,0,0,0
3,1,1,0,0.0,0.0,0,0,0,0,1
4,0,3,0,0.5,1.0,0,1,0,0,0


In [17]:
# Creat Y and X for Decision Tree Classifier 
Y = df.Survived
X = df.drop(columns='Survived')

# Models

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, random_state=42)

In [20]:
# scale the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_scaled=scaler.fit(X).transform(X)
test_X_scaled=scaler.fit(X).transform(X_test)

### To find Best params

In [21]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

pipe = Pipeline([('preprocessing', MinMaxScaler()), ('classifier', SVC())])

In [22]:
from sklearn.ensemble import RandomForestClassifier

param_grid = [
    {'classifier': [SVC()], 'preprocessing': [MinMaxScaler(), None], 
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [RandomForestClassifier(n_estimators=100)],
     'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, refit=True)
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': RandomForestClassifier(max_features=3), 'classifier__max_features': 3, 'preprocessing': None}

Best cross-validation score: 0.86
Test-set score: 0.83


In [24]:
grid.best_params_

{'classifier': RandomForestClassifier(max_features=3),
 'classifier__max_features': 3,
 'preprocessing': None}

In [25]:
# TestA

In [26]:
dfA = pd.read_csv("TestA.csv")

In [27]:
# Data Preparation
dfA.Age.fillna(value=dfA.Age.mean(), inplace=True)
dfA['minor16']=dfA['Age'].apply(lambda x: 1 if x <16 else 0)
dfA['FareBin_5']=pd.qcut(dfA.Fare, 5)
dfA['family_size'] = dfA['SibSp']+ dfA['Parch'] + 1


In [28]:
deplicate_ticket = []
for tk in dfA.Ticket.unique():
    tem = dfA.loc[dfA.Ticket == tk, 'Fare']
    #print(tem.count())
    if tem.count() > 1:
        #print(df_data.loc[df_data.Ticket == tk,['Name','Ticket','Fare']])
        deplicate_ticket.append(dfA.loc[dfA.Ticket == tk,['Ticket','Fare','Cabin','family_size','Survived']])
deplicate_ticket = pd.concat(deplicate_ticket)

In [29]:
# the same ticket family or friends
dfA['Connected_Survival'] = 0.5 # default 
for _, df_grp in dfA.groupby('Ticket'):
    if (len(df_grp) > 1):
        for ind, row in df_grp.iterrows():
            smax = df_grp.drop(ind)['Survived'].max()
            smin = df_grp.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                dfA.loc[dfA['PassengerId'] == passID, 'Connected_Survival'] = 1
            elif (smin==0.0):
                dfA.loc[dfA['PassengerId'] == passID, 'Connected_Survival'] = 0
dfA.groupby('Connected_Survival')[['Survived']].mean().round(3)

Unnamed: 0_level_0,Survived
Connected_Survival,Unnamed: 1_level_1
0.0,0.011
0.5,0.006
1.0,0.0


In [30]:
# Replace Sex with 1 (Male), and 0 (female)
dfA['sex']=dfA['Sex'].apply(lambda x: 1.0 if x=='male' else 0.0)

In [31]:
# drop original Cabin column
dfA.drop(columns='Cabin', inplace=True)
dfA.drop(columns='Fare', inplace=True)
dfA.drop(columns='Age', inplace=True)
dfA.drop(columns='SibSp', inplace=True)
dfA.drop(columns='Parch', inplace=True)
dfA.drop(columns='Embarked', inplace=True)
dfA.drop(columns='Sex', inplace=True)
dfA.drop(columns='Ticket', inplace=True)
dfA.drop(columns='PassengerId', inplace=True)
dfA.drop(columns='family_size', inplace=True)
dfA.drop(columns='Name', inplace=True)

In [32]:
dfA = pd.get_dummies(dfA, columns=['FareBin_5'],dummy_na=True)

In [33]:
dfA.drop(columns=['FareBin_5_nan'], inplace=True)

In [34]:
dfA.head()

Unnamed: 0,Survived,Pclass,minor16,Connected_Survival,sex,"FareBin_5_(-0.001, 7.75]","FareBin_5_(7.75, 8.05]","FareBin_5_(8.05, 14.454]","FareBin_5_(14.454, 29.125]","FareBin_5_(29.125, 263.0]"
0,0,3,0,0.5,1.0,1,0,0,0,0
1,0,3,0,0.5,1.0,0,0,1,0,0
2,0,1,0,0.5,1.0,0,0,0,0,1
3,0,3,1,0.0,1.0,0,0,0,1,0
4,0,3,0,0.5,1.0,0,1,0,0,0


In [35]:
y_testA = dfA['Survived']
X_testA = dfA.drop(columns='Survived')

In [36]:
svc = SVC(C=10, gamma=1)
svc.fit(X_testA, y_testA)
Y_pred = svc.predict(X_testA)
acc_svc = round(svc.score(X_testA, y_testA) * 100, 2)
acc_svc

99.55

In [37]:
# TestB

In [38]:
dfB = pd.read_csv("TestB.csv")

In [39]:
# Data Preparation
dfB.Age.fillna(value=dfB.Age.mean(), inplace=True)
dfB['minor16']=dfB['Age'].apply(lambda x: 1 if x <16 else 0)
dfB['FareBin_5']=pd.qcut(dfB.Fare, 5)
dfB['family_size'] = dfB['SibSp']+ dfB['Parch'] + 1


In [40]:
deplicate_ticket = []
for tk in dfB.Ticket.unique():
    tem = dfB.loc[dfB.Ticket == tk, 'Fare']
    #print(tem.count())
    if tem.count() > 1:
        #print(df_data.loc[df_data.Ticket == tk,['Name','Ticket','Fare']])
        deplicate_ticket.append(dfB.loc[dfB.Ticket == tk,['Ticket','Fare','Cabin','family_size','Survived']])
deplicate_ticket = pd.concat(deplicate_ticket)

In [41]:
# the same ticket family or friends
dfB['Connected_Survival'] = 0.5 # default 
for _, df_grp in dfB.groupby('Ticket'):
    if (len(df_grp) > 1):
        for ind, row in df_grp.iterrows():
            smax = df_grp.drop(ind)['Survived'].max()
            smin = df_grp.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                dfB.loc[dfB['PassengerId'] == passID, 'Connected_Survival'] = 1
            elif (smin==0.0):
                dfB.loc[dfB['PassengerId'] == passID, 'Connected_Survival'] = 0
dfB.groupby('Connected_Survival')[['Survived']].mean().round(3)

Unnamed: 0_level_0,Survived
Connected_Survival,Unnamed: 1_level_1
0.5,0.989
1.0,1.0


In [42]:
# Replace Sex with 1 (Male), and 0 (female)
dfB['sex']=dfB['Sex'].apply(lambda x: 1.0 if x=='male' else 0.0)

In [43]:
# drop original Cabin column
dfB.drop(columns='Cabin', inplace=True)
dfB.drop(columns='Fare', inplace=True)
dfB.drop(columns='Age', inplace=True)
dfB.drop(columns='SibSp', inplace=True)
dfB.drop(columns='Parch', inplace=True)
dfB.drop(columns='Embarked', inplace=True)
dfB.drop(columns='Sex', inplace=True)
dfB.drop(columns='Ticket', inplace=True)
dfB.drop(columns='PassengerId', inplace=True)
dfB.drop(columns='family_size', inplace=True)
dfB.drop(columns='Name', inplace=True)

In [44]:
dfB = pd.get_dummies(dfB, columns=['FareBin_5'],dummy_na=True)

In [45]:
dfB.drop(columns=['FareBin_5_nan'], inplace=True)

In [46]:
dfB.head()

Unnamed: 0,Survived,Pclass,minor16,Connected_Survival,sex,"FareBin_5_(-0.001, 10.5]","FareBin_5_(10.5, 18.75]","FareBin_5_(18.75, 29.4]","FareBin_5_(29.4, 76.729]","FareBin_5_(76.729, 512.329]"
0,1,1,0,0.5,0.0,0,0,0,1,0
1,1,3,0,0.5,0.0,1,0,0,0,0
2,1,1,0,0.5,0.0,0,0,0,1,0
3,1,3,0,1.0,0.0,0,1,0,0,0
4,1,3,1,0.5,0.0,0,1,0,0,0


In [47]:
y_testB = dfB['Survived']
X_testB = dfB.drop(columns='Survived')

In [48]:
svc = SVC(C=10, gamma=1)
svc.fit(X_testB, y_testB)
Y_pred = svc.predict(X_testB)
acc_svc = round(svc.score(X_testB, y_testB) * 100, 2)
acc_svc

99.27