# Titanic Dataset Logistic Regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the two Titanic CSV files directly from GitHub (needs network access).
# df_train is the frame the 'Survived' label is taken from further down;
# df_test is only ever passed to clf.predict at the end of the notebook.
df_train = pd.read_csv('https://raw.githubusercontent.com/ggodreau/titanic/master/train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/ggodreau/titanic/master/test.csv')

## Groom Train Dataset

In [3]:
# Separate the label from the features WITHOUT mutating df_train.
# Using .pop() here removed the column in place, so re-running this cell
# raised KeyError and X was merely an alias of the mutated frame.
y = df_train['Survived']
X = df_train.drop(columns='Survived')

In [4]:
def clean_X(df, fill_values=None):
    """Return the numeric feature columns of ``df`` with missing values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'Pclass', 'Age', 'SibSp',
        'Parch' and 'Fare'. It is not modified.
    fill_values : dict, optional
        Mapping of column name -> value used to replace nulls in that column.
        Entries given here override the defaults; any of 'Age' / 'Fare' not
        supplied falls back to the median of that column in ``df`` itself.
        Pass the medians of the training frame when cleaning unseen data so
        both are imputed with the same values.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'.

    Side effects
    ------------
    Prints the per-column null count of the returned frame.
    """
    # default imputation: the median of each column in the frame we were given
    fills = {
        'Age': df['Age'].median(),
        'Fare': df['Fare'].median()
    }
    if fill_values is not None:
        # caller-supplied values take precedence over the frame's own medians
        fills.update(fill_values)
    # fill nulls (Age and Fare by default), then keep just the numeric columns;
    # fillna without inplace returns a new frame, so df is left untouched
    out = df.fillna(fills)[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
    # check there are no nulls
    print(f'Number of nulls:\n{out.isnull().sum()}')
    return out

In [5]:
# Clean the features, then hold out 33% of the rows for evaluation.
# random_state=42 fixes the shuffle so the same split is produced on every run.
# The "Number of nulls" printout below this cell comes from clean_X.
X_train, X_test, y_train, y_test = train_test_split(
    clean_X(X), y, test_size=0.33, random_state=42)

Number of nulls:
Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


In [6]:
# Logistic-regression classifier using the L-BFGS solver; every other
# hyperparameter is left at its default (see the repr printed after fitting).
clf = LogisticRegression(solver='lbfgs')

In [7]:
# Fit the classifier on the training split only; the hold-out rows stay unseen.
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Mean accuracy of the fitted classifier on the held-out split.
clf.score(X_test, y_test)

0.7254237288135593

### See the Predicted vs. Actual in a DataFrame

In [9]:
# Build a side-by-side view of features, actual labels and predictions.
# reset_index(drop=True) returns a new frame with a fresh 0..n-1 index, so no
# temporary 'index' column has to be created and dropped again (which failed
# whenever the index was named or an 'index' column already existed).
res = X_test.reset_index(drop=True)
# plain arrays are assigned positionally, matching the fresh 0..n-1 index
res['Act'] = y_test.values
res['Pred'] = clf.predict(X_test)
res.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Act,Pred
0,3,28.0,1,1,15.2458,1,0
1,2,31.0,0,0,10.5,0,0
2,3,20.0,0,0,7.925,0,0
3,2,6.0,0,1,33.0,1,1
4,3,14.0,1,0,11.2417,1,0
5,1,26.0,0,0,78.85,1,1
6,3,28.0,0,0,7.75,1,0
7,3,16.0,2,0,18.0,0,0
8,3,16.0,0,0,7.75,1,0
9,1,19.0,0,2,26.2833,1,1


#### Make predictions against the unlabeled test file, `df_test` (we don't know the true answers)

In [10]:
# Predict survival for the separate test file loaded at the top of the notebook.
# NOTE(review): clean_X fills nulls with the medians of the frame it is given,
# so Age/Fare here are imputed from df_test's own medians rather than the
# medians seen during training - confirm this is intended.
clf.predict(clean_X(df_test))

Number of nulls:
Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,