The first step to begin in a project is charge the libraries.

In [1]:
# This library works with DataFrames. A DataFrame treats the data as a table, like a sheet in Excel.
import pandas as pd
# This library has models that we will use. 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset
dataTrain = pd.read_csv("./input/train.csv")
dataTest = pd.read_csv("./input/test.csv")
# Combine both datasets
dataset = [dataTrain, dataTest]

In [3]:
# Show a summary of the data in Dataset
dataTrain.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Graham, Mr. George Edward",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


We are going to intepret this data. We have the columns of the table.
count = size of not null (not missing) in the fields
mean = average
std = standard desviation
min, 25%, 50%, 75%, max = minimum, 25%, 50%, 75%, maximum

In [4]:
# Show some random table rows with all columns
dataTrain.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
425,426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
367,368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
375,376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C


We choose the target of the problem and in Titanic's problem is find who survived and who not.

In [5]:
# Select single column
titanicSurvived = dataTrain.Survived
# Print this column with its column type
print(titanicSurvived.head())

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [6]:
# Select multiple columns
titanicSexAndName = ['Sex', 'Name']
twoColumns = dataTrain[titanicSexAndName]
# Print these columns with its columns types
twoColumns.head()

Unnamed: 0,Sex,Name
0,male,"Braund, Mr. Owen Harris"
1,female,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,female,"Heikkinen, Miss. Laina"
3,female,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,male,"Allen, Mr. William Henry"


We can see that Cabin column has a lot of missing values, so It is a irrelevant column, we are going to drop it

In [7]:
# Drop Cabin and Ticket columns
dataTrain = dataTrain.drop(['Ticket', 'Cabin'], axis=1)
dataTest = dataTest.drop(['Ticket', 'Cabin'], axis=1)
# We combine again both datasets
dataset = [dataTrain, dataTest]

We check that we dropped it successful

In [8]:
print(dataTrain.columns)

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Fare', u'Embarked'],
      dtype='object')


We are going to clean some rows like in column 'Name' because we have a lot of different titles

In [9]:
for each in dataset:
    each['Title'] = each.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
# We see how we have a lot of different titles, we will change by others
pd.crosstab(dataTrain['Title'], dataTrain['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


We see how there are too much different titles

In [10]:
# We replace some titles in few titles
for each in dataset:
    each['Title'] = each['Title'].replace(['Lady','Capt', 'Countess', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    each['Title'] = each['Title'].replace('Mlle', 'Miss')
    each['Title'] = each['Title'].replace('Ms', 'Miss')
    each['Title'] = each['Title'].replace('Mme', 'Mrs')
# We calculate mean to see the titles
dataTrain[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Other,0.347826


We are going to change categorical information to cardinal information

In [11]:
title = {"Master":1, "Miss":2, "Mr":3, "Mrs":4, "Other":5}
for each in dataset:
    each['Title'] = each['Title'].map(title)
    each['Title'] = each['Title'].fillna(0)
dataTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,4
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,4
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,3


We can drop Name, Age, Fare and PassengerId columns because they are not relevant. Further we will change all categorical data in numerical data

In [12]:
dataTrain = dataTrain.drop(['Name', 'PassengerId', 'Age', 'Fare'], axis=1)
dataTest = dataTest.drop(['Name', 'Age', 'Fare'], axis=1)
dataset = [dataTrain, dataTest]

In [13]:
for each in dataset:
    each['Sex'] = each['Sex'].map({'female':0, 'male':1}).astype(int)
dataTrain.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title
0,0,3,1,1,0,S,3
1,1,1,0,1,0,C,4
2,1,3,0,0,0,S,2
3,1,1,0,1,0,S,4
4,0,3,1,0,0,S,3


How Embarked column has two missing values, we are going to fill with the most common 

In [14]:
freqPort = dataTrain.Embarked.dropna().mode()[0]
freqPort

'S'

In [15]:
dataTest.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked,Title
count,418.0,418.0,418.0,418.0,418.0,418,418.0
unique,,,,,,3,
top,,,,,,S,
freq,,,,,,270,
mean,1100.5,2.26555,0.636364,0.447368,0.392344,,2.911483
std,120.810458,0.841838,0.481622,0.89676,0.981429,,0.783102
min,892.0,1.0,0.0,0.0,0.0,,1.0
25%,996.25,1.0,0.0,0.0,0.0,,3.0
50%,1100.5,3.0,1.0,0.0,0.0,,3.0
75%,1204.75,3.0,1.0,1.0,0.0,,3.0


In [16]:
for each in dataset:
    each['Embarked'] = each['Embarked'].fillna('S')
    each['Embarked'] = each['Embarked'].map({'S':0, 'Q':1, 'C':2}).astype(int)
dataTrain.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title
0,0,3,1,1,0,0,3
1,1,1,0,1,0,2,4
2,1,3,0,0,0,0,2
3,1,1,0,1,0,0,4
4,0,3,1,0,0,0,3


In [17]:
# Parameters
x_dataTrain = dataTrain.drop('Survived', axis=1)
y_dataTrain = dataTrain["Survived"]
x_dataTest = dataTest.drop('PassengerId', axis=1).copy()

We have the criteria to create our model now. But we cannot fit the model because we have missing values, we need to fix it. 

We are going to choose some models, in this project the target is who survived in the accident

In [18]:
models = {
    'logReg': LogisticRegression(),
    'svc': SVC(),
    'knn': KNeighborsClassifier(n_neighbors = 3),
    'gauss': GaussianNB(),
    'sgd': SGDClassifier(),
    'tree': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(n_estimators=100)
}

for id, cls in models.items():
    cls.fit(x_dataTrain, y_dataTrain)
    Y_predict = cls.predict(x_dataTest)
    acc_log = round(cls.score(x_dataTrain, y_dataTrain) * 100, 2)
    print(id, acc_log)

('knn', 84.29)
('tree', 85.52)
('svc', 83.61)
('gauss', 80.02)
('rf', 85.52)
('logReg', 79.57)
('sgd', 77.67)




In [20]:
# Stack submission
submission = pd.DataFrame({
        "PassengerId": dataTest["PassengerId"],
        "Survived": Y_predict
    })
submission.to_csv('./submission.csv', index=False)