In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import KFold
from sklearn.linear_model import logistic
from sklearn.metrics import accuracy_score



%matplotlib inline

## Load Training Data

In [40]:
train_data = pd.read_csv('data/train.csv')
pd.set_option('display.max_rows', 20)
display(train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Simple Analysis on the Training Data 

In [41]:
analysis = pd.DataFrame(columns=['col_name','null_num','type_num'])
for col in train_data:
    row = pd.Series({'col_name':col, 
                     'null_num':train_data[col].isnull().sum(), 
                     'type_num':train_data[col].unique().size})
    analysis = analysis.append(row, ignore_index=True)
    
display(analysis)

Unnamed: 0,col_name,null_num,type_num
0,PassengerId,0,891
1,Survived,0,2
2,Pclass,0,3
3,Name,0,891
4,Sex,0,2
5,Age,177,89
6,SibSp,0,7
7,Parch,0,7
8,Ticket,0,681
9,Fare,0,248


## Select and Convert Training Data

In [42]:
y_train = train_data['Survived']

'''
 drop 'Name', 'Ticket', 'Cabin'(too many missing), 'Embarked'(not related)
'''

x_train = train_data[['Pclass', 'SibSp', 'Parch']]


# Convert Sex feature from category into data 
x_train.insert(1, 'Sex', (train_data['Sex'].map({'female':0, 'male':1}).astype(int)))


# replace missing age value with median age
median_age = train_data['Age'].median()
print("Age Median:", median_age)
x_train.insert(2, 'Age',  train_data['Age'].fillna(median_age))


# replace missing fare value with median fare
median_fare = train_data['Fare'].median()
print("Fare Median:", median_fare)
x_train.insert(5, 'Fare',  train_data['Fare'].fillna(median_fare))

    
display(x_train)

Age Median: 28.0
Fare Median: 14.4542


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.2500
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.9250
3,1,0,35.0,1,0,53.1000
4,3,1,35.0,0,0,8.0500
5,3,1,28.0,0,0,8.4583
6,1,1,54.0,0,0,51.8625
7,3,1,2.0,3,1,21.0750
8,3,0,27.0,0,2,11.1333
9,2,0,14.0,1,0,30.0708


## 3-Fold Cross Validation

In [50]:
indices_3Fold = KFold(n_splits=3, shuffle=True, random_state=int(time.time())).split(train_data)
for train_indices, test_indices in indices_3Fold:
    test_x = x_train.iloc[test_indices]
    test_y = y_train.iloc[test_indices]
    train_x = x_train.iloc[train_indices]
    train_y = y_train.iloc[train_indices]
    logistic = logis.LogisticRegression()
    theta = logistic.fit(train_x, train_y)
    pred_y = logistic.predict(test_x)
    score = accuracy_score(test_y, pred_y, normalize=True)
    print("---------------------------------------------------------------------------")
    print("train size: %d, test size: %d" % (len(train_indices), len(test_indices)))
    print(theta)
    print("\nscore:", score)
print("---------------------------------------------------------------------------")

---------------------------------------------------------------------------
train size: 594, test size: 297
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

score: 0.8013468013468014
---------------------------------------------------------------------------
train size: 594, test size: 297
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

score: 0.7744107744107744
---------------------------------------------------------------------------
train size: 594, test size: 297
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          inter

## Load Test Data

In [44]:
test_data = pd.read_csv('data/test.csv')
pd.set_option('display.max_rows', 20)
display(test_data)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


## Simple Analysis on the Test Data 

In [45]:
analysis = pd.DataFrame(columns=['col_name','null_num','type_num'])
for col in test_data:
    row = pd.Series({'col_name':col, 
                     'null_num':test_data[col].isnull().sum(), 
                     'type_num':test_data[col].unique().size})
    analysis = analysis.append(row, ignore_index=True)
    
display(analysis)

Unnamed: 0,col_name,null_num,type_num
0,PassengerId,0,418
1,Pclass,0,3
2,Name,0,418
3,Sex,0,2
4,Age,86,80
5,SibSp,0,7
6,Parch,0,8
7,Ticket,0,363
8,Fare,1,170
9,Cabin,327,77


## Select and Convert Test Data

In [46]:
x_test = test_data[['Pclass', 'SibSp', 'Parch']]


# Convert Sex feature from category into data 
x_test.insert(1, 'Sex', (test_data['Sex'].map({'female':0, 'male':1}).astype(int)))


# replace missing age value with median age
median_age = test_data['Age'].median()
print("Age Median:", median_age)
x_test.insert(2, 'Age',  test_data['Age'].fillna(median_age))


# replace missing fare value with median fare
median_fare = test_data['Fare'].median()
print("Fare Median:", median_fare)
x_test.insert(5, 'Fare',  test_data['Fare'].fillna(median_fare))

Age Median: 27.0
Fare Median: 14.4542


## Predict and Export Submission File

In [53]:
logistic = logis.LogisticRegression()
theta = logistic.fit(x_train, y_train)
pred_y = logistic.predict(x_test)

print(theta)

# Export to CSV
submit_df = pd.DataFrame(data={"PassengerId": test_data['PassengerId'], "Survived": pred_y} )
submit_df.to_csv("submission/submission_" + time.strftime('%m%d_%H%M%S') + ".csv", sep=",", index=False)
display(submit_df)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
