## I'll just be using the Titanic data set for my classification assignment!

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv('../data/titanic.csv')

In [2]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         714 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
cabin       204 non-null object
embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


### First things first, cleaning up the data a bit

In [4]:
df.groupby(['pclass','sex'])['age'].median()

pclass  sex   
1       female    35.0
        male      40.0
2       female    28.0
        male      30.0
3       female    21.5
        male      25.0
Name: age, dtype: float64

In [5]:
df.groupby(['pclass','sex'])['age'].mean()

pclass  sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: age, dtype: float64

#### Median and mean age within different sex/class groups are pretty different so I'll replace nulls with the median value

In [6]:
df['age'] = df.groupby(['pclass','sex'])['age'].apply(lambda x: x.fillna(x.median()))

#### Cabin is null for the majority of data points so I will choose to disregard it and take it out of the data set

In [7]:
df = df.drop('cabin',axis=1)

#### Embarked has 2 null values so I'll just replace them with the most commonly occurring value

In [8]:
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [9]:
df.embarked = df.embarked.fillna('S')

In [10]:
df.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

### I'm interested in creating a variable based on the title of the passenger

In [11]:
import re
names = df.name
title = []
for i in names:
    j = re.search(', (.*)\.', i)
    if j:
        title.append(j.group(0))

In [12]:
title1 = []
for i in title:
    j = i[2:]
    title1.append(j)
title2 = []
for i in title1:
    j = i[:-1]
    title2.append(j)

In [13]:
df['title'] = title2

In [14]:
df[df.title=='Mrs. Martin (Elizabeth L']

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,title
513,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54.0,1,0,PC 17603,59.4,C,Mrs. Martin (Elizabeth L


In [15]:
df.title.value_counts()

Mr                          517
Miss                        182
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Col                           2
Mlle                          2
Major                         2
Ms                            1
Capt                          1
Lady                          1
Don                           1
Mme                           1
Sir                           1
Mrs. Martin (Elizabeth L      1
Jonkheer                      1
the Countess                  1
Name: title, dtype: int64

In [16]:
# I'm going to use my own judgment to group the rarer ones down further
df = df.replace({'title' : {'Master': 'Uppity','Dr':'Uppity','Rev':'Uppity','Mlle':'Uppity','Col':'Uppity'
                            ,'Major':'Uppity','Capt':'Uppity','Ms':'Miss','the Countess':'Uppity','Sir':'Uppity'
                            ,'Lady':'Uppity','Jonkheer':'Uppity','Mrs. Martin (Elizabeth L':'Mrs'
                            ,'Mme':'Uppity','Don':'Uppity'}})

In [17]:
df.title.value_counts()

Mr        517
Miss      183
Mrs       125
Uppity     66
Name: title, dtype: int64

In [18]:
df = df.replace({'sex':{'female':0,'male':1}
                ,'embarked':{'C':'Cherbourg','Q':'Queenstown','S':'Southampton'}})

In [19]:
df.groupby('pclass')['title'].value_counts()

pclass  title 
1       Mr        107
        Miss       46
        Mrs        42
        Uppity     21
2       Mr         91
        Mrs        41
        Miss       35
        Uppity     17
3       Mr        319
        Miss      102
        Mrs        42
        Uppity     28
Name: title, dtype: int64

In [20]:
df.groupby(['title','sex'])['survived'].value_counts()

title   sex  survived
Miss    0    1           128
             0            55
Mr      1    0           436
             1            81
Mrs     0    1            99
             0            26
Uppity  0    1             6
        1    0            32
             1            28
Name: survived, dtype: int64

#### Of the titles, it seems like uppity men survived at a much higher rate than normal men so I'll create a dummy variable that only looks at whether they were uppity people

In [21]:
u = []
t = df.title
for i in t:
    if i == 'Uppity':
        u.append(1)
    else:
        u.append(0)
df['uppity'] = u

### I'll create a family size variable

In [22]:
df['famsize'] = df['sibsp'] + df['parch']

### Women and children were prioritized to get off first (according to the movie) so I'll create a variable for children

In [23]:
c = []
a = df.age
for i in a:
    if i < 18:
        c.append(1)
    else:
        c.append(0)
df['child'] = c

In [24]:
df.groupby('child')['survived'].value_counts()
#it looks like being a child didn't really guarantee your survival....

child  survived
0      0           497
       1           281
1      1            61
       0            52
Name: survived, dtype: int64

### Let's explore the data  bit

In [25]:
df.groupby('embarked')['survived'].value_counts()

embarked     survived
Cherbourg    1            93
             0            75
Queenstown   0            47
             1            30
Southampton  0           427
             1           219
Name: survived, dtype: int64

In [26]:
df.groupby('famsize')['survived'].value_counts()

famsize  survived
0        0           374
         1           163
1        1            89
         0            72
2        1            59
         0            43
3        1            21
         0             8
4        0            12
         1             3
5        0            19
         1             3
6        0             8
         1             4
7        0             6
10       0             7
Name: survived, dtype: int64

In [27]:
df.survived.value_counts()
#This means that my classifier should do better than a score of 0.62
#since that's the result if you guess that no one survived

0    549
1    342
Name: survived, dtype: int64

In [28]:
df.groupby('pclass')['survived'].value_counts()

pclass  survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: survived, dtype: int64

### I'm ready to start trying to classify!

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null int64
age         891 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
embarked    891 non-null object
title       891 non-null object
uppity      891 non-null int64
famsize     891 non-null int64
child       891 non-null int64
dtypes: float64(2), int64(8), object(4)
memory usage: 97.5+ KB


In [30]:
#dummy variable time
dfwd = pd.get_dummies(df,columns = ['embarked','title','pclass'])

In [31]:
dfwd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
survived                891 non-null int64
name                    891 non-null object
sex                     891 non-null int64
age                     891 non-null float64
sibsp                   891 non-null int64
parch                   891 non-null int64
ticket                  891 non-null object
fare                    891 non-null float64
uppity                  891 non-null int64
famsize                 891 non-null int64
child                   891 non-null int64
embarked_Cherbourg      891 non-null uint8
embarked_Queenstown     891 non-null uint8
embarked_Southampton    891 non-null uint8
title_Miss              891 non-null uint8
title_Mr                891 non-null uint8
title_Mrs               891 non-null uint8
title_Uppity            891 non-null uint8
pclass_1                891 non-null uint8
pclass_2                891 non-null uint8
pclass_3                891 n

In [32]:
X = dfwd.drop(['pclass_3','title_Uppity','title_Miss','embarked_Southampton','ticket','name','survived'],axis=1)
y = dfwd.survived
X_train,X_test,y_train,y_test = train_test_split(X,y)

### First, a Dummy Classifier

In [33]:
from sklearn.dummy import DummyClassifier
dum = DummyClassifier()
dum.fit(X_train,y_train)
print(classification_report(y_test,dum.predict(X_test)))

             precision    recall  f1-score   support

          0       0.58      0.58      0.58       133
          1       0.38      0.38      0.38        90

avg / total       0.50      0.50      0.50       223



### Next, a KNeighborsClassifier

In [34]:
knc = KNeighborsClassifier()
knc.fit(X_train,y_train)
print(classification_report(y_test,knc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.73      0.86      0.79       133
          1       0.71      0.52      0.60        90

avg / total       0.72      0.72      0.71       223



#### Now with GridSearch for neighbors

In [35]:
kncparams = {'kneighborsclassifier__n_neighbors':[i for i in range (1,7)]}
kncpipe = make_pipeline(KNeighborsClassifier())
kncgrid = GridSearchCV(kncpipe,param_grid=kncparams,cv=5)
kncgrid.fit(X_train,y_train)
kncgrid.best_estimator_

Pipeline(memory=None,
     steps=[('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [36]:
print(classification_report(y_test,kncgrid.predict(X_test)))
#this is worse lol

             precision    recall  f1-score   support

          0       0.73      0.86      0.79       133
          1       0.71      0.52      0.60        90

avg / total       0.72      0.72      0.71       223



#### Adding a Scalar

In [37]:
kncpipes = make_pipeline(StandardScaler(), KNeighborsClassifier())
kncgrids = GridSearchCV(kncpipes,param_grid=kncparams,cv=5)
kncgrids.fit(X_train,y_train)
kncgrids.best_estimator_

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])

In [38]:
print(classification_report(y_test,kncgrids.predict(X_test)))
#scaling the data made it better!

             precision    recall  f1-score   support

          0       0.79      0.83      0.81       133
          1       0.73      0.68      0.70        90

avg / total       0.77      0.77      0.77       223



### Now a logistic regression

In [39]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
print(classification_report(y_test,lr.predict(X_test)))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85       133
          1       0.80      0.72      0.76        90

avg / total       0.82      0.82      0.81       223



In [40]:
lr.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>

In [41]:
params = {'logisticregression__C': [0.1,1,5,10,100]
         ,'logisticregression__penalty': ['l1','l2']}
lrpipe = make_pipeline(LogisticRegression())
lrgrid = GridSearchCV(lrpipe,param_grid=params,cv=5)

In [42]:
lrgrid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.1, 1, 5, 10, 100], 'logisticregression__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
lrgrid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [44]:
pred = lrgrid.predict(X_test)
print(classification_report(y_test,pred))
#using l1 as the penalty made it worse?

             precision    recall  f1-score   support

          0       0.82      0.88      0.85       133
          1       0.80      0.71      0.75        90

avg / total       0.81      0.81      0.81       223



In [45]:
print('Test set accuracy score: ',accuracy_score(y_test,pred), '\nTrain set accuracy score: ',accuracy_score(lrgrid.predict(X_train),y_train))

Test set accuracy score:  0.8116591928251121 
Train set accuracy score:  0.8278443113772455


#### With Polynomial Features

In [46]:
lrparams = {'polynomialfeatures__degree':[i for i in range (1,4)]
           ,'logisticregression__penalty':['l1','l2']
           ,'logisticregression__C':[.1,1,5,10]}
lrpipep = make_pipeline(PolynomialFeatures(),LogisticRegression())
lrgridp = GridSearchCV(lrpipep,param_grid = lrparams,cv=5)
lrgridp.fit(X_train,y_train)
lrgridp.best_estimator_

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [47]:
print(classification_report(y_test,lrgridp.predict(X_test)))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85       133
          1       0.80      0.71      0.75        90

avg / total       0.81      0.81      0.81       223



### This time I want to use the variables that my exploration and gut say would have an effect

In [48]:
dfwd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
survived                891 non-null int64
name                    891 non-null object
sex                     891 non-null int64
age                     891 non-null float64
sibsp                   891 non-null int64
parch                   891 non-null int64
ticket                  891 non-null object
fare                    891 non-null float64
uppity                  891 non-null int64
famsize                 891 non-null int64
child                   891 non-null int64
embarked_Cherbourg      891 non-null uint8
embarked_Queenstown     891 non-null uint8
embarked_Southampton    891 non-null uint8
title_Miss              891 non-null uint8
title_Mr                891 non-null uint8
title_Mrs               891 non-null uint8
title_Uppity            891 non-null uint8
pclass_1                891 non-null uint8
pclass_2                891 non-null uint8
pclass_3                891 n

In [49]:
X1 = dfwd[['sex','uppity','famsize','child','embarked_Cherbourg','pclass_1']]
y1 = dfwd.survived
X1_train,X1_test,y1_train,y1_test = train_test_split(X1,y1)

### KNeighbors

In [50]:
kncparams = {'kneighborsclassifier__n_neighbors':[i for i in range (1,7)]}
kncpipe = make_pipeline(KNeighborsClassifier())
kncgrid1 = GridSearchCV(kncpipe,param_grid=kncparams,cv=5)
kncgrid1.fit(X1_train,y1_train)
kncgrid1.best_estimator_

Pipeline(memory=None,
     steps=[('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [51]:
print(classification_report(y1_test,kncgrid1.predict(X1_test)))

             precision    recall  f1-score   support

          0       0.79      0.89      0.84       132
          1       0.81      0.65      0.72        91

avg / total       0.80      0.79      0.79       223



In [52]:
kncgrids.fit(X1_train,y1_train)
kncgrids.best_estimator_

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [53]:
print(classification_report(y1_test,kncgrids.predict(X1_test)))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84       132
          1       0.81      0.67      0.73        91

avg / total       0.80      0.80      0.80       223



### Logistic

In [54]:
lrparams = {'logisticregression__C': [0.1,1,5,10,100]
         ,'logisticregression__penalty': ['l1','l2']}
lrpipe = make_pipeline(LogisticRegression())
lrgrid1 = GridSearchCV(lrpipe,param_grid=params,cv=5)

In [55]:
lrgrid1.fit(X1_train,y1_train)
lrgrid1.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [56]:
print(classification_report(y1_test,lrgrid1.predict(X1_test)))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84       132
          1       0.80      0.67      0.73        91

avg / total       0.80      0.80      0.79       223



In [57]:
lrgridp.fit(X1_train,y1_train)
lrgridp.best_estimator_

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [58]:
print(classification_report(y1_test,lrgridp.predict(X1_test)))

             precision    recall  f1-score   support

          0       0.82      0.90      0.86       132
          1       0.83      0.70      0.76        91

avg / total       0.82      0.82      0.82       223



In [67]:
print('Test set accuracy score: ',accuracy_score(y1_test,lrgridp.predict(X1_test)), '\nTrain set accuracy score: ',accuracy_score(lrgridp.predict(X1_train),y1_train))

Test set accuracy score:  0.820627802690583 
Train set accuracy score:  0.8323353293413174


### Just playing around

In [59]:
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l1'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X1_train,y1_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [60]:
pred = grid.predict(X1_test)
print(classification_report(y1_test,pred))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84       132
          1       0.80      0.67      0.73        91

avg / total       0.80      0.80      0.79       223



In [61]:
params = {'logisticregression__C': [0.1,1,5,10,100]}
pipe = make_pipeline(LogisticRegression(penalty='l2'))
grid = GridSearchCV(pipe,param_grid=params,cv=5)
grid.fit(X1_train,y1_train)
grid.best_estimator_

Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [62]:
pred = grid.predict(X1_test)
print(classification_report(y1_test,pred))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84       132
          1       0.80      0.67      0.73        91

avg / total       0.80      0.80      0.79       223



### Conclusion

#### In the Titanic data set, I think that the best score to use would be accuracy since there aren't any general risks associated with incorrectly classifying someone either positively or negatively so getting the highest number of test cases correctly would be best. Using this criteria, among the models that I had created, using all of the available variables with an l1 penalty resulted in very similar results to using all available variables with an l2 penalty, and since using the l1 penalty means feature selecting is occurring I take this to mean that all of the features help us in making the model as accurate as possible. Using a grid search with PolynomialFeatures found that degree = 1 was the best fit.

### Overall, the best model based on accuracy scores was actually the logistic regression model with polynomial features (degree=3) using the X with variables that I had selected myself. The accuracy in the training set was also not significantly higher than in the test set so it doesn't seem that there was an issue with overfitting.