In [51]:
%matplotlib inline

In [52]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
matplotlib.style.use('ggplot')

In [56]:
import math

def scrub(filePath):
    """Load a passenger CSV and clean its ``Cabin`` and ``Age`` columns.

    Parameters
    ----------
    filePath : str
        Path handed to ``pandas.read_csv``. The file must provide the
        ``Cabin``, ``Age`` and ``Pclass`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with two columns rewritten:

        * ``Cabin`` becomes a categorical holding only the first character
          of each cabin string; a missing cabin is coded ``'n'`` (the first
          letter of ``'nan'``).
        * every missing ``Age`` is replaced by the median ``Age`` of the rows
          that share the same ``Pclass`` in this file. An age stays missing
          only when its class has no known ages (or ``Pclass`` itself is
          missing).
    """
    data = pd.read_csv(filePath)

    # First character of the stringified cabin. The trailing fillna is a
    # guard for pandas versions where a missing value may survive astype(str)
    # instead of turning into the text 'nan'; it is a no-op otherwise.
    deck = data['Cabin'].astype(str).str[0].fillna('n')
    data['Cabin'] = pd.Categorical(deck)

    # Per-class median broadcast back onto every row, then used only where
    # Age is missing. Working column-wise (instead of a row-wise apply) keeps
    # each column's dtype intact -- including the categorical Cabin above --
    # and works for whatever Pclass values the file contains.
    class_median = data.groupby('Pclass')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(class_median)

    return data

    
# Load the training file through scrub(), which reduces Cabin to a one-letter
# category and fills missing ages with the per-class median.
titanic_train = scrub(filePath='train.csv')

# Peek at the last five rows as a sanity check.
titanic_train.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,n,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,24.0,1,2,W./C. 6607,23.45,n,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,n,Q


In [57]:
from sklearn import linear_model
from sklearn import preprocessing

In [58]:
# Shared encoder used to turn string categories into integer codes.
label_encoder = preprocessing.LabelEncoder()
encoded_sex = label_encoder.fit_transform(titanic_train["Sex"])

# Baseline: logistic regression on the encoded Sex column alone.
log_model = linear_model.LogisticRegression()
log_model.fit(X=pd.DataFrame(encoded_sex), y=titanic_train["Survived"])

# Fitted intercept first, then the coefficient matrix, one per line.
print(log_model.intercept_, log_model.coef_, sep='\n')

[ 1.00027876]
[[-2.43010712]]


In [59]:
# Class-membership probabilities from the sex-only model, one row per
# passenger; the two probability columns are labelled on construction.
preds = pd.DataFrame(
    log_model.predict_proba(X=pd.DataFrame(encoded_sex)),
    columns=["Death_prob", "Survival_prob"],
)

# Table of predicted survival probability vs Sex.
# .loc replaces the .ix indexer, which pandas deprecated and later removed.
pd.crosstab(titanic_train["Sex"], preds.loc[:, "Survival_prob"])

Survival_prob,0.193125428972,0.731113382332
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0,314
male,577,0


In [60]:
# Integer-encode the remaining categorical inputs with the shared encoder
# (each fit_transform call re-fits it on the column it is given).
encoded_class = label_encoder.fit_transform(titanic_train["Pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_train["Cabin"])

# Stack the features as rows, then transpose so every passenger is a row and
# the columns are, in order: class, cabin, sex, age.
train_features = pd.DataFrame(
    [encoded_class, encoded_cabin, encoded_sex, titanic_train["Age"]]
).transpose()

# Replace the sex-only model with one fitted on all four features.
log_model = linear_model.LogisticRegression()
log_model.fit(X=train_features, y=titanic_train["Survived"])

# Fitted intercept first, then the coefficient matrix, one per line.
print(log_model.intercept_, log_model.coef_, sep='\n')

[ 3.41739189]
[[-0.96536638 -0.05974971 -2.42308968 -0.02864526]]


In [61]:
# Hard class predictions for the rows the model was trained on.
preds = log_model.predict(X=train_features)

# Cross-tabulate: rows are the predicted label, columns the actual Survived.
confusion = pd.crosstab(index=preds, columns=titanic_train["Survived"])
confusion

Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,469,99
1,80,243


In [62]:
# Training accuracy: the two agreeing cells (predicted 0 / actual 0 and
# predicted 1 / actual 1) over the total number of rows.
(confusion.loc[0, 0] + confusion.loc[1, 1]) / float(confusion.values.sum())


0.7991021324354658

In [63]:
# Same training accuracy, this time computed by the estimator itself.
log_model.score(y=titanic_train["Survived"], X=train_features)

0.7991021324354658

In [64]:
from sklearn import metrics

In [65]:
# Confusion matrix of actual Survived labels against the training predictions.
metrics.confusion_matrix(y_true=titanic_train["Survived"], y_pred=preds)

array([[469,  80],
       [ 99, 243]])

In [66]:
# Text report built from the actual labels and the training predictions;
# printed so the multi-line string keeps its layout.
print(
    metrics.classification_report(
        y_pred=preds,
        y_true=titanic_train["Survived"],
    )
)

             precision    recall  f1-score   support

          0       0.83      0.85      0.84       549
          1       0.75      0.71      0.73       342

avg / total       0.80      0.80      0.80       891



In [67]:
# Load the test file through scrub(). scrub() already fills every missing Age
# with the median age of the passenger's class, so no separate Age imputation
# step is needed here. (The leftover fragment of a deleted imputation call and
# the assignment from the undefined name `new_age_var` made this cell fail.)
titanic_test = scrub('test.csv')

In [16]:
# Encode the test columns with encoders fitted on the TRAINING columns, so
# every label maps to the same integer code the model was trained on.
# Re-fitting on the test data would renumber the codes whenever the test file
# is missing (or adds) a label. transform() raises ValueError for a label that
# never occurred in training instead of silently mis-encoding it.
# Note: as before, this overwrites the training-time encoded_* arrays.
encoded_sex = (
    preprocessing.LabelEncoder()
    .fit(titanic_train['Sex'])
    .transform(titanic_test['Sex'])
)
encoded_class = (
    preprocessing.LabelEncoder()
    .fit(titanic_train['Pclass'])
    .transform(titanic_test['Pclass'])
)
encoded_cabin = (
    preprocessing.LabelEncoder()
    .fit(titanic_train['Cabin'])
    .transform(titanic_test['Cabin'])
)

# Same feature order as the training matrix: class, cabin, sex, age.
test_features = pd.DataFrame([ encoded_class
                             , encoded_cabin
                             , encoded_sex
                             , titanic_test['Age']]).T

In [19]:
# Predict a Survived label for every row of the test feature matrix.
test_preds = log_model.predict(X=test_features)

# Two-column frame: the passenger ids from the test file plus the predictions.
submission = titanic_test[["PassengerId"]].assign(Survived=test_preds)

# Write the submission file without the row index.
submission.to_csv("submission.csv", index=False)
