## I'll just be using the Titanic data set for my classification assignment!

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('../data/titanic.csv')

In [2]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         714 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
cabin       204 non-null object
embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


### First things first, cleaning up the data a bit

In [4]:
df.groupby(['pclass','sex'])['age'].median()

pclass  sex   
1       female    35.0
        male      40.0
2       female    28.0
        male      30.0
3       female    21.5
        male      25.0
Name: age, dtype: float64

In [5]:
df.groupby(['pclass','sex'])['age'].mean()

pclass  sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: age, dtype: float64

#### Median and mean age within different sex/class groups are pretty different so I'll replace nulls with the median value

In [6]:
df['age'] = df.groupby(['pclass','sex'])['age'].apply(lambda x: x.fillna(x.median()))

In [7]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,0,3,"Moran, Mr. James",male,25.0,0,0,330877,8.4583,,Q
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Cabin is null for the majority of data points so I will choose to disregard it and take it out of the data set

In [8]:
df = df.drop('cabin',axis=1)

#### Embarked has 2 null values so I'll just replace them with the most commonly occurring value

In [9]:
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [10]:
df.embarked = df.embarked.fillna('S')

In [11]:
df.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

### I'm interested in creating a variable based on the title of the passenger

In [12]:
import re
names = df.name
title = []
for i in names:
    j = re.search(', (.*)\.', i)
    if j:
        title.append(j.group(0))

In [13]:
# My regex expression didn't do exactly what i wanted but I can just take out the characters I don't need
title

[', Mr.',
 ', Mrs.',
 ', Miss.',
 ', Mrs.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Master.',
 ', Mrs.',
 ', Mrs.',
 ', Miss.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mrs.',
 ', Master.',
 ', Mr.',
 ', Mrs.',
 ', Mrs.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Miss.',
 ', Mrs.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Don.',
 ', Mrs.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Miss.',
 ', Mrs.',
 ', Mrs.',
 ', Mr.',
 ', Miss.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Mrs.',
 ', Master.',
 ', Mr.',
 ', Mrs.',
 ', Mrs.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Miss.',
 ', Master.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Master.',
 ', Mr.',
 ', Master.',
 ', Mrs.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Mr.',
 ', Master.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Miss.',
 ', Mrs.',
 ', Mr.',
 ', Mr.',
 ', Miss.',
 ', Mr.',
 ', Mr.',
 

In [14]:
title1 = []
for i in title:
    j = i[2:]
    title1.append(j)
title2 = []
for i in title1:
    j = i[:-1]
    title2.append(j)

In [15]:
title2

['Mr',
 'Mrs',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Mrs',
 'Mrs',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Don',
 'Mrs',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Miss',
 'Mrs',
 'Mrs',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Master',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 'Master',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 

In [16]:
df['title'] = title2

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         891 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
embarked    891 non-null object
title       891 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [18]:
df[df.title=='Mrs. Martin (Elizabeth L']

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,title
513,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54.0,1,0,PC 17603,59.4,C,Mrs. Martin (Elizabeth L


In [19]:
df.title.value_counts()

Mr                          517
Miss                        182
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Mlle                          2
Col                           2
Major                         2
the Countess                  1
Sir                           1
Don                           1
Mme                           1
Ms                            1
Capt                          1
Mrs. Martin (Elizabeth L      1
Jonkheer                      1
Lady                          1
Name: title, dtype: int64

In [20]:
# I'm going to use my own judgment to group the rarer ones down further
df = df.replace({'title' : {'Master': 'Uppity','Dr':'Uppity','Rev':'Uppity','Mlle':'Uppity','Col':'Uppity'
                            ,'Major':'Uppity','Capt':'Uppity','Ms':'Miss','the Countess':'Uppity','Sir':'Uppity'
                            ,'Lady':'Uppity','Jonkheer':'Uppity','Mrs. Martin (Elizabeth L':'Mrs'
                            ,'Mme':'Uppity','Don':'Uppity'}})

In [21]:
df.title.value_counts()

Mr        517
Miss      183
Mrs       125
Uppity     66
Name: title, dtype: int64

In [22]:
df = df.replace({'sex':{'female':0,'male':1}
                ,'embarked':{'C':'Cherbourg','Q':'Queenstown','S':'Southampton'}})

In [23]:
df.groupby('pclass')['title'].value_counts()

pclass  title 
1       Mr        107
        Miss       46
        Mrs        42
        Uppity     21
2       Mr         91
        Mrs        41
        Miss       35
        Uppity     17
3       Mr        319
        Miss      102
        Mrs        42
        Uppity     28
Name: title, dtype: int64

In [24]:
df.groupby(['title','sex'])['survived'].value_counts()

title   sex  survived
Miss    0    1           128
             0            55
Mr      1    0           436
             1            81
Mrs     0    1            99
             0            26
Uppity  0    1             6
        1    0            32
             1            28
Name: survived, dtype: int64

#### Of the titles, it seems like uppity men survived at a much higher rate than normal men so I'll create a dummy variable that only looks at whether they were uppity people

In [25]:
u = []
t = df.title
for i in t:
    if i == 'Uppity':
        u.append(1)
    else:
        u.append(0)
df['uppity'] = u

### I'll create a family size variable

In [26]:
df['famsize'] = df['sibsp'] + df['parch']

### Women and children were prioritized to get off first (according to the movie) so I'll create a variable for children

In [27]:
c = []
a = df.age
for i in a:
    if i < 18:
        c.append(1)
    else:
        c.append(0)
df['child'] = c

In [36]:
df.groupby('child')['survived'].value_counts()
#it looks like being a child didn't really guarantee your survival....

child  survived
0      0           497
       1           281
1      1            61
       0            52
Name: survived, dtype: int64

### Let's explore the data  bit

In [41]:
df.groupby('embarked')['survived'].value_counts()

embarked     survived
Cherbourg    1            93
             0            75
Queenstown   0            47
             1            30
Southampton  0           427
             1           219
Name: survived, dtype: int64

In [67]:
df.groupby('fare')['survived'].value_counts()

fare      survived
0.0000    0           14
          1            1
4.0125    0            1
5.0000    0            1
6.2375    0            1
6.4375    0            1
6.4500    0            1
6.4958    0            2
6.7500    0            2
6.8583    0            1
6.9500    0            1
6.9750    0            1
          1            1
7.0458    0            1
7.0500    0            7
7.0542    0            2
7.1250    0            4
7.1417    1            1
7.2250    0            9
          1            3
7.2292    0           11
          1            4
7.2500    0           12
          1            1
7.3125    0            1
7.4958    0            2
          1            1
7.5208    0            1
7.5500    0            3
          1            1
                      ..
106.4250  0            1
          1            1
108.9000  0            1
          1            1
110.8833  1            3
          0            1
113.2750  1            2
          0            1
120.00

In [56]:
df.groupby('famsize')['survived'].value_counts()

famsize  survived
0        0           374
         1           163
1        1            89
         0            72
2        1            59
         0            43
3        1            21
         0             8
4        0            12
         1             3
5        0            19
         1             3
6        0             8
         1             4
7        0             6
10       0             7
Name: survived, dtype: int64

In [42]:
df.survived.value_counts()
#This means that my classifier should do better than a score of 0.62
#since that's the result if you guess that no one survived

0    549
1    342
Name: survived, dtype: int64

In [44]:
df.groupby('pclass')['survived'].value_counts()

pclass  survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: survived, dtype: int64

### I'm ready to start trying to classify!

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null int64
age         891 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
embarked    891 non-null object
title       891 non-null object
uppity      891 non-null int64
famsize     891 non-null int64
child       891 non-null int64
dtypes: float64(2), int64(8), object(4)
memory usage: 97.5+ KB


In [45]:
#dummy variable time
dfwd = pd.get_dummies(df,columns = ['embarked','title','pclass'])

In [46]:
dfwd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
survived                891 non-null int64
name                    891 non-null object
sex                     891 non-null int64
age                     891 non-null float64
sibsp                   891 non-null int64
parch                   891 non-null int64
ticket                  891 non-null object
fare                    891 non-null float64
uppity                  891 non-null int64
famsize                 891 non-null int64
child                   891 non-null int64
embarked_Cherbourg      891 non-null uint8
embarked_Queenstown     891 non-null uint8
embarked_Southampton    891 non-null uint8
title_Miss              891 non-null uint8
title_Mr                891 non-null uint8
title_Mrs               891 non-null uint8
title_Uppity            891 non-null uint8
pclass_1                891 non-null uint8
pclass_2                891 non-null uint8
pclass_3                891 n

#### Let's create a classifier with all available variables first to see how it does

In [123]:
#dropping dummy variables that either complete a set or overlap with other variables
#also dropping ticket and name because they're strings
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression())
grid = GridSearchCV(pipe,param_grid=params,cv=5)

In [124]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.1, 1, 5, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [125]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [126]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.90      0.90      0.90       140
          1       0.83      0.83      0.83        83

avg / total       0.87      0.87      0.87       223



In [127]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.874439461883408 
Train set accuracy score:  0.811377245508982


#### I take this classification report to mean that, for survivors, my model was better at not saying that someone who died had survived, but classified a lot of people who survived as people who had died

In [54]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred)
#this already does better than our baseline of 63%

0.8116591928251121

In [55]:
accuracy_score(grid.predict(X_train),y_train)
#this score is also very similar to what it is for our test case! Model doesn't seem to be overfitting

0.8278443113772455

### This time I want to use the variables that my exploration and gut say would have an effect

In [66]:
dfwd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
survived                891 non-null int64
name                    891 non-null object
sex                     891 non-null int64
age                     891 non-null float64
sibsp                   891 non-null int64
parch                   891 non-null int64
ticket                  891 non-null object
fare                    891 non-null float64
uppity                  891 non-null int64
famsize                 891 non-null int64
child                   891 non-null int64
embarked_Cherbourg      891 non-null uint8
embarked_Queenstown     891 non-null uint8
embarked_Southampton    891 non-null uint8
title_Miss              891 non-null uint8
title_Mr                891 non-null uint8
title_Mrs               891 non-null uint8
title_Uppity            891 non-null uint8
pclass_1                891 non-null uint8
pclass_2                891 non-null uint8
pclass_3                891 n

In [69]:
X = dfwd[['sex','uppity','famsize','child','embarked_Cherbourg','pclass_1']]
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression())
grid = GridSearchCV(pipe,param_grid=params,cv=5)

In [70]:
grid.fit(X_train,y_train)
grid.best_estimator_
#interesting that this best estimator has a different C value!

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [71]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))
#this did worse in every way

             precision    recall  f1-score   support

          0       0.75      0.91      0.83       137
          1       0.79      0.52      0.63        86

avg / total       0.77      0.76      0.75       223



In [73]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.7623318385650224 
Train set accuracy score:  0.8038922155688623


### Let's see how it does with a penalty of l1 instead of the default of l2

In [74]:
X = dfwd[['sex','uppity','famsize','child','embarked_Cherbourg','pclass_1']]
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l1'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)

In [75]:
grid.fit(X_train,y_train)
grid.best_estimator_
#another different best estimator C value

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [76]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))
#better than same metrics with l2

             precision    recall  f1-score   support

          0       0.82      0.87      0.84       143
          1       0.74      0.65      0.69        80

avg / total       0.79      0.79      0.79       223



In [78]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[125,  18],
       [ 28,  52]])

In [79]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.7937219730941704 
Train set accuracy score:  0.8233532934131736


### What about all of the variables with penalty of l1?

In [128]:
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l1'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)

In [129]:
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [130]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.83      0.87      0.85       142
          1       0.75      0.68      0.71        81

avg / total       0.80      0.80      0.80       223



In [131]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.8026905829596412 
Train set accuracy score:  0.8263473053892215


### Just playing around

In [94]:
X = dfwd[['sex','uppity','child','embarked_Cherbourg','pclass_1']]
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l1'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [95]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.84      0.88      0.86       136
          1       0.80      0.74      0.77        87

avg / total       0.82      0.83      0.82       223



In [96]:
X = dfwd[['sex','uppity','child','embarked_Cherbourg','pclass_1']]
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l2'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [97]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.81      0.83      0.82       138
          1       0.71      0.69      0.70        85

avg / total       0.77      0.78      0.78       223



### What if we add in polynomial features?

In [111]:
from sklearn.preprocessing import PolynomialFeatures
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]
         ,'polynomialfeatures__degree':[i for i in range (1,5)]}
pipe = make_pipeline(PolynomialFeatures(),LogisticRegression(penalty='l1'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [112]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.88      0.86      0.87       140
          1       0.77      0.81      0.79        83

avg / total       0.84      0.84      0.84       223



In [113]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.8385650224215246 
Train set accuracy score:  0.8173652694610778


In [120]:
from sklearn.preprocessing import PolynomialFeatures
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [0.1,1,5,10,100]
         ,'polynomialfeatures__degree':[i for i in range (1,5)]}
pipe = make_pipeline(PolynomialFeatures(),LogisticRegression(penalty='l2'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [121]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.84      0.91      0.88       135
          1       0.84      0.74      0.79        88

avg / total       0.84      0.84      0.84       223



In [122]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.8430493273542601 
Train set accuracy score:  0.8143712574850299


### Conclusion

#### In the Titanic data set, I think that the best score to use would be accuracy since there aren't any general risks associated with incorrectly classifying someone either positively or negatively so getting the highest number of test cases correctly would be best. Using this criteria, among the models that I had created, using all of the available variables with an l1 penalty resulted in very similar results to using all available variables with an l2 penalty, and since using the l1 penalty means feature selecting is occurring I take this to mean that all of the features help us in making the model as accurate as possible. Using a grid search with PolynomialFeatures found that degree = 1 was the best fit.

### Overall, the best model based on accuracy scores was actually the first model created, seen below: using all variables after dropping the ones that would cause collinearity issues. Using a grid search, a C value of the default of 1 was the best fit and the accuracy in the test set was actually higher than that of the training set so there wasn't overfitting causing an issue.

In [176]:
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)
params = {'logisticregression__C': [i for i in range(1,100)]}
pipe = make_pipeline(LogisticRegression(penalty='l2'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X_train,y_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [177]:
pred = grid.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.88      0.86      0.87       148
          1       0.74      0.77      0.76        75

avg / total       0.84      0.83      0.83       223



In [178]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(grid.predict(X_train),y_train))

Test set accuracy score:  0.8340807174887892 
Train set accuracy score:  0.8233532934131736
