In [411]:
#******************************************************************************#
# Kaggle Titanic Competition                                                   #
# AXA_GerhardPachl - gerhard.pachl@axa-winterthur.ch                           #
# Version: 1                                                                   #
# first try - looking on the data, taking some variables(features)             #
# which seem to have an influence on survival or not like                      #
# Pclass, Sex, Age, Embarked                                                   #
#******************************************************************************#

In [1753]:
# initializing needed packages
import os
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier

In [1754]:
os.chdir('C:/Users/C924016/MyPythons/CSV/')
# importing the titanic train and test data set as a panda data frame
train_df = pd.read_csv('train.csv',header=0)
test_df  = pd.read_csv('test.csv',header=0)

In [1755]:
# first look on characteristics of the train and test data
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [1756]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [1757]:
# variable age has missing data - getting number of missing values 
test_df['Age'].isnull().values.ravel().sum()

86

In [1758]:
train_df['Age'].isnull().values.ravel().sum()

177

In [1759]:
# train['Age'].hist()
# P.show()

In [1760]:
# evaluate the other non numerical variables
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [1761]:
# take a look on Embarked - get unique values of variable Embarked --> boarding area
tr_boarding_area = list(enumerate(np.unique(train_df['Embarked']))) 
ts_boarding_area = list(enumerate(np.unique(test_df['Embarked'])))

In [1762]:
tr_boarding_area

[(0, nan), (1, 'C'), (2, 'Q'), (3, 'S')]

In [1763]:
ts_boarding_area

[(0, 'C'), (1, 'Q'), (2, 'S')]

In [1764]:
# get number of categories
train_df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [1765]:
test_df.Embarked.value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [1766]:
# getting number of missing values from age
train_df['Embarked'].isnull().values.ravel().sum()

2

In [1767]:
# getting number of missing values from age
test_df['Embarked'].isnull().values.ravel().sum()

0

In [1768]:
# change categorial values tonumeric and missing value of train data set 
# S == 1, C == 2, Q == 3, NAN like S (majority) == 1

In [1769]:
# change nan of train data to value 1
train_df.loc[ (train_df.Embarked.isnull()), 'Embarked'] = 'S'

In [1770]:
train_df['Embarked'].isnull().values.ravel().sum()

0

In [1771]:
# # S == 1, C == 2, Q == 3
train_df['Embarked'] = train_df['Embarked'].map( {'S': 1, 'C': 2, 'Q': 3} )
test_df['Embarked'] = test_df['Embarked'].map( {'S': 1, 'C': 2, 'Q': 3} )

In [1772]:
train_df.Embarked.value_counts()

1    646
2    168
3     77
Name: Embarked, dtype: int64

In [1773]:
test_df.Embarked.value_counts()

1    270
2    102
3     46
Name: Embarked, dtype: int64

In [1774]:
test_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
dtype: object

In [1775]:
# gender: replace categorial with numeric values

In [1776]:
train_df['Sex'] = train_df['Sex'].map( {'female': 1, 'male': 2} ).astype(int)

In [1777]:
test_df['Sex'] = test_df['Sex'].map( {'female': 1, 'male': 2} ).astype(int)

In [1778]:
train_df.Sex.value_counts()

2    577
1    314
Name: Sex, dtype: int64

In [1779]:
test_df.Sex.value_counts()

2    266
1    152
Name: Sex, dtype: int64

In [1780]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,1.647587,29.699118,0.523008,0.381594,32.204208,1.361392
std,257.353842,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,0.635673
min,1.0,0.0,1.0,1.0,0.42,0.0,0.0,0.0,1.0
25%,223.5,0.0,2.0,1.0,20.125,0.0,0.0,7.9104,1.0
50%,446.0,0.0,3.0,2.0,28.0,0.0,0.0,14.4542,1.0
75%,668.5,1.0,3.0,2.0,38.0,1.0,0.0,31.0,2.0
max,891.0,1.0,3.0,2.0,80.0,8.0,6.0,512.3292,3.0


In [1781]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,1.636364,30.27259,0.447368,0.392344,35.627188,1.464115
std,120.810458,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576,0.685516
min,892.0,1.0,1.0,0.17,0.0,0.0,0.0,1.0
25%,996.25,1.0,1.0,21.0,0.0,0.0,7.8958,1.0
50%,1100.5,3.0,2.0,27.0,0.0,0.0,14.4542,1.0
75%,1204.75,3.0,2.0,39.0,1.0,0.0,31.5,2.0
max,1309.0,3.0,2.0,76.0,8.0,9.0,512.3292,3.0


In [1782]:
# replace missing values with mean of age column

In [1783]:
age_mean_tr = train_df['Age'].dropna().median()
print ('train mean age' , age_mean_tr)
age_mean_ts = test_df['Age'].dropna().median()
print ('test mean age' , age_mean_ts)

('train mean age', 28.0)
('test mean age', 27.0)


In [1784]:
fare_mean_ts = test_df['Fare'].dropna().median()

In [1785]:
train_df.loc[ (train_df.Age.isnull()), 'Age'] = age_mean_tr
test_df.loc[ (test_df.Age.isnull()), 'Age'] = age_mean_ts

In [1786]:
test_df.loc[ (test_df.Fare.isnull()), 'Fare'] = fare_mean_ts

In [1787]:
train_df['Fare'] = train_df['Fare'].astype(int)

In [1788]:
test_df['Fare'] = test_df['Fare'].astype(int)

In [1789]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,1.647587,29.361582,0.523008,0.381594,31.785634,1.361392
std,257.353842,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.70373,0.635673
min,1.0,0.0,1.0,1.0,0.42,0.0,0.0,0.0,1.0
25%,223.5,0.0,2.0,1.0,22.0,0.0,0.0,7.0,1.0
50%,446.0,0.0,3.0,2.0,28.0,0.0,0.0,14.0,1.0
75%,668.5,1.0,3.0,2.0,35.0,1.0,0.0,31.0,2.0
max,891.0,1.0,3.0,2.0,80.0,8.0,6.0,512.0,3.0


In [1790]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,1.636364,29.599282,0.447368,0.392344,35.133971,1.464115
std,120.810458,0.841838,0.481622,12.70377,0.89676,0.981429,55.855855,0.685516
min,892.0,1.0,1.0,0.17,0.0,0.0,0.0,1.0
25%,996.25,1.0,1.0,23.0,0.0,0.0,7.0,1.0
50%,1100.5,3.0,2.0,27.0,0.0,0.0,14.0,1.0
75%,1204.75,3.0,2.0,35.75,1.0,0.0,31.0,2.0
max,1309.0,3.0,2.0,76.0,8.0,9.0,512.0,3.0


In [1791]:
# delete all variables, we do not use in the model liek PassngerID, SibSp, ParchFare 
# also deleting fare, because we already use Pclass - which should be stong correlated to Fare

In [1792]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare             int32
Cabin           object
Embarked         int64
dtype: object

In [1793]:
test_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare             int32
Cabin           object
Embarked         int64
dtype: object

In [1794]:
# save Passenger IDs for later reuse 
Pids = test_df['PassengerId'].values

In [1795]:
train_df['Age'] = train_df['Age'].astype(int)

In [1796]:
train_df.dtypes

PassengerId     int64
Survived        int64
Pclass          int64
Name           object
Sex             int32
Age             int32
SibSp           int64
Parch           int64
Ticket         object
Fare            int32
Cabin          object
Embarked        int64
dtype: object

In [1797]:
test_df['Age'] = test_df['Age'].astype(int)

In [1798]:
test_df.dtypes

PassengerId     int64
Pclass          int64
Name           object
Sex             int32
Age             int32
SibSp           int64
Parch           int64
Ticket         object
Fare            int32
Cabin          object
Embarked        int64
dtype: object

In [1799]:
test_df.dtypes

PassengerId     int64
Pclass          int64
Name           object
Sex             int32
Age             int32
SibSp           int64
Parch           int64
Ticket         object
Fare            int32
Cabin          object
Embarked        int64
dtype: object

In [1800]:
# Delete some variables, which as first sight seems to be not so influential
# columns = ['PassengerId', 'Name','Ticket','Cabin','Fare','SibSp','Parch']
columns = ['PassengerId','Name','Ticket','Cabin']

In [1801]:
train_df = train_df.drop(columns,axis=1)

In [1802]:
train_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,2,22,1,0,7,1
1,1,1,1,38,1,0,71,2
2,1,3,1,26,0,0,7,1
3,1,1,1,35,1,0,53,1
4,0,3,2,35,0,0,8,1


In [1803]:
test_df = test_df.drop(columns,axis=1)

In [1804]:
test_df.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,2,34,0,0,7,3
1,3,1,47,1,0,7,1
2,2,2,62,0,0,9,3
3,3,2,27,0,0,8,1
4,3,1,22,1,1,12,1


In [1805]:
# now we have 4 variables and numeric values for the first random forest classification and transform them into an array
train_ar = train_df.values
test_ar = test_df.values

In [1806]:
len(train_ar)

891

In [1807]:
len(test_ar)

418

In [1808]:
surv_list = train_df['Survived'].values

In [1809]:
# use RandomForestClassifier for prediction and make instance RanFor
# number of trees = 100
rfc = RandomForestClassifier(n_estimators=100, min_samples_leaf=1)

In [1810]:
# use train_data for decision trees 
rfc = rfc.fit(train_ar[0::,1::], train_ar[0::,0] )

In [1811]:
train_del = np.delete(train_ar,1,axis=1)

In [1812]:
rfc.score(train_del, surv_list, sample_weight=None)

0.81369248035914699

In [1749]:
# use result decison trees of training set for the test set and prediction
Result = rfc.predict(test_ar).astype(int)

In [1750]:
Result

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [1751]:
Titanic_Submission = open("Titanic_Submission.csv", "wb")

In [1752]:
open_file_object = csv.writer(Titanic_Submission)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(Pids, Result))
Titanic_Submission.close()