# Implementing machine learning for the Titanic case

*For the data analysis, see the Titanic notebook*
##### First trials
* Load Input Data
  * Clean & complete data
  * Split Training Data 
* Train 1 ML 
* Validate using Input Data

##### Implementing several ML algorithms
The first three steps are kept
* Train several ML
* Validate and compare
* Select Best
* Predict
* Submit

In [34]:
import pandas as pd
import numpy as np
import math
import StringIO
import datetime
import pydot_ng
import shutil 
from sklearn import tree
from IPython.display import Image  
from sklearn.externals.six import StringIO
from sklearn import ensemble
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import neighbors
from sklearn.cross_validation import KFold

# Load Input Data

In [2]:
def LoadData(path):
    """Read the CSV file located at ``path`` and return it as a pandas DataFrame."""
    loadedFrame = pd.read_csv(path)
    return loadedFrame
# Load the labelled training file and preview its first three rows.
wholeInputData = LoadData('train.csv')
wholeInputData.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Clean & complete data

### Number of family members
We add the number of family members. It depends on two other variables, but wouldn't it be complicated for the ML algorithm to find this feature by itself?

In [3]:
def AddFamillyNbr(data):
    """Add a ``FamillyNbr`` column to ``data``: the row-wise sum of ``Parch`` and ``SibSp``.

    The DataFrame is modified in place and nothing is returned.
    """
    relativesCount = data['Parch'] + data['SibSp']
    data['FamillyNbr'] = relativesCount
# Adds the FamillyNbr column to the training data (in-place modification).
AddFamillyNbr(wholeInputData)

### Missing Age values
We fill the missing values of the age with the median age.

In [4]:
def FillWithMedian(data, columnName):
    """Replace the missing values of ``data[columnName]`` with the column median.

    The column is overwritten in ``data``. The median that was used is
    returned so the caller can apply the same value elsewhere.
    """
    column = data[columnName]
    fillValue = column.median()
    data[columnName] = column.fillna(fillValue)
    return fillValue
    
# Keep the training median: it is reused later to fill the missing ages of the test set.
medianAge = FillWithMedian(wholeInputData,'Age')

### Are there other empty values?

In [5]:
def CheckForNull(ser):
    """Tell whether ``ser`` contains at least one null entry."""
    nullFlags = ser.isnull()
    return nullFlags.any()
# Report, column by column, whether the training data still contains null values
# (Age was already filled with its median above).
print('Does Age has NA : %s' % CheckForNull(wholeInputData.Age))
print('Does Fare has NA : %s' % CheckForNull(wholeInputData.Fare))
print('Does Sex has NA : %s' % CheckForNull(wholeInputData.Sex))
print('Does Clas has NA : %s' % CheckForNull(wholeInputData.Pclass))
print('Does FamillyNbr has NA : %s' % CheckForNull(wholeInputData.FamillyNbr))
print('Does Ticket has NA : %s' % CheckForNull(wholeInputData.Ticket))
print('Does Cabin has NA : %s' % CheckForNull(wholeInputData.Cabin))

Does Age has NA : False
Does Fare has NA : False
Does Sex has NA : False
Does Clas has NA : False
Does FamillyNbr has NA : False
Does Ticket has NA : False
Does Cabin has NA : True


#### Cabin
Cabin is not useful as is: it is partially empty and has unique values.
Let's look at the data to see whether the first letter could be inferred.

In [6]:
# Show class and cabin side by side for the first ten passengers.
wholeInputData[['Pclass','Cabin']].head(10)

Unnamed: 0,Pclass,Cabin
0,3,
1,1,C85
2,3,
3,1,C123
4,3,
5,3,
6,1,E46
7,3,
8,3,
9,2,


The Cabin data won't be usable.

### Normalize data

In [7]:
def getNormalizer(data):
    """Fit a ``preprocessing.StandardScaler`` on ``data`` and return the fitted scaler."""
    scaler = preprocessing.StandardScaler(copy=True)
    return scaler.fit(data)
# Scaler fitted on the Age and Fare columns of the whole training file
# (i.e. before the train/validation split below).
normalizer = getNormalizer(wholeInputData[['Age','Fare']])

### Split Training Data

In [8]:
def SplitData(input, sizeValidate):
    """Split ``input`` into a validation sample and the remaining rows.

    ``sizeValidate`` rows are drawn with ``DataFrame.sample``; every row whose
    index label was drawn is then dropped from ``input`` to form the rest.

    Returns the tuple ``(validateSample, rest)``.
    """
    # A fixed random_state keeps the split repeatable from one run to the next.
    validateSample = input.sample(n=sizeValidate, random_state=10)
    drawnLabels = validateSample.index
    return validateSample, input.drop(drawnLabels)
    
# Hold out 50 rows for validation; the remaining rows are used for training.
validateData, trainData=SplitData(wholeInputData,50)
len(validateData.index)

50

In [9]:
# Number of rows left for training after the validation sample was removed.
len(trainData.index)

841

#### Save them, just in case

In [10]:
# Persist both halves of the split as CSV files.
validateData.to_csv('split_validate.csv')
trainData.to_csv('split_train.csv')

## Train a decision tree
The decision tree is the easiest model to understand, which is why I chose it first.

*This requires no missing value. No other preparation is needed*
#### Tidy Dataset

In [11]:
# Preview the training rows before tidying.
trainData.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamillyNbr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1


In [12]:
def TidyData(data):
    """Return a model-ready copy of ``data`` holding only numeric features.

    The ``Cabin``, ``Name``, ``Ticket``, ``Embarked``, ``Sex`` and
    ``PassengerId`` columns are dropped, as is the ``Survived`` label when it
    is present. A ``Genre`` column is added: 1 when ``Sex`` is ``'male'``,
    0 otherwise. ``data`` itself is not modified.
    """
    unusedColumns = ['Cabin','Name','Ticket','Embarked','Sex','PassengerId']
    result = data.drop(unusedColumns,axis=1)
    if 'Survived' in data.columns:
        result = result.drop('Survived',axis=1)
    # A list is built explicitly: on Python 3 ``map`` returns a lazy iterator
    # without a length, which cannot be assigned as a DataFrame column.
    result['Genre'] = [1 if sex=='male' else 0 for sex in data['Sex']]
    return result
# Feature matrices for validation and training, plus the matching Survived labels.
tidyValidateData = TidyData(validateData)
tidyTrainData= TidyData(trainData)
survivedTrain = trainData['Survived']
survivedValidate = validateData['Survived']

In [13]:
# Preview the tidied training features.
tidyTrainData.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamillyNbr,Genre
0,3,22.0,1,0,7.25,1,1
2,3,26.0,0,0,7.925,0,0
3,1,35.0,1,0,53.1,1,0


In [14]:
# Preview the training labels (same index as the features above).
survivedTrain.head(3)

0    0
2    1
3    1
Name: Survived, dtype: int64

In [15]:
# First model: a decision tree with all parameters left at their defaults.
classifierTree = tree.DecisionTreeClassifier()
classifierTree = classifierTree.fit(tidyTrainData,survivedTrain)

### Display the tree

In [16]:
def ExportGraphAsPng(treeToExport,featuresName,targetName,fileName):
    """Render the tree ``treeToExport`` as a PNG image written to ``fileName``.

    The tree is exported in Graphviz dot format into an in-memory buffer,
    which is then parsed with pydot_ng and saved as PNG. ``featuresName`` and
    ``targetName`` are forwarded to ``export_graphviz`` as ``feature_names``
    and ``class_names``.
    """
    dotBuffer = StringIO()
    exportOptions = dict(out_file=dotBuffer,
                         feature_names=featuresName,
                         class_names=targetName,
                         filled=True, rounded=True,
                         special_characters=True)
    tree.export_graphviz(treeToExport, **exportOptions)
    dotSource = dotBuffer.getvalue()
    pydot_ng.graph_from_dot_data(dotSource).write_png(fileName)
# Class names passed to the tree export (forwarded as class_names).
targetLabels = ['Dead','Survived']
ExportGraphAsPng(classifierTree, tidyTrainData.columns,targetLabels,'FirstTree.png')


<img src="FirstTree.png" alt="First Tree"/>

We can see that the tree overfits.

Let's put a lower limit on the number of samples per leaf (several values were tried: 5, 10, 20).

In [17]:
# Same tree, but every leaf must hold at least 25 training samples.
limitedTree = tree.DecisionTreeClassifier(min_samples_leaf =25)
limitedTree = limitedTree.fit(tidyTrainData,survivedTrain)
ExportGraphAsPng(limitedTree, tidyTrainData.columns,targetLabels,'SecondTree.png')

<img src="SecondTree.png" alt="Second Tree"/>

Pruning would be very useful:
* For females in class 1 and 2, all survived
* For males, almost all those older than 13 died (the ones classified as surviving are matched by an almost equal number of dead)

Let's try with a max depth of 3.

In [18]:
# Third variant: the tree depth is capped at 3 levels.
lengthTree = tree.DecisionTreeClassifier(max_depth =3)
lengthTree = lengthTree.fit(tidyTrainData,survivedTrain)
ExportGraphAsPng(lengthTree, tidyTrainData.columns,targetLabels,'ThirdTree.png')

<img src="ThirdTree.png" alt="Third Tree"/>

### Evaluate Results

In [19]:
def EvaluateModel(tree, validationSet,expectedResults):
    """Compare a model's predictions with the expected labels.

    ``tree`` is any object exposing ``predict`` (not only decision trees).
    Returns the element-wise result of ``predictions == expectedResults``.
    """
    return tree.predict(validationSet) == expectedResults
def Accuracy(results):
    """Return the percentage (0 to 100) of entries of ``results`` equal to ``True``.

    The denominator is ``results.count()``, i.e. the number of non-null entries.
    """
    hits = (results == True).sum()
    total = results.count()
    ratio = hits / float(total)
    return ratio * 100.0
# Score the three trees on the held-out validation sample.
resltClassifierTree = EvaluateModel(classifierTree, tidyValidateData, survivedValidate)
resltLimitedTree = EvaluateModel(limitedTree, tidyValidateData, survivedValidate)
# NOTE: despite its spelling, resltMengthTree holds the result of lengthTree (max_depth=3).
resltMengthTree = EvaluateModel(lengthTree, tidyValidateData, survivedValidate)
print ('Accuracy for Classifier Tree : %s' % Accuracy(resltClassifierTree))
print ('Accuracy for Classifier Tree with a minimal class size : %s' % Accuracy(resltLimitedTree))
print ('Accuracy for Classifier Tree with a fixed length : %s' % Accuracy(resltMengthTree))

Accuracy for Classifier Tree : 78.0
Accuracy for Classifier Tree with a minimal class size : 84.0
Accuracy for Classifier Tree with a fixed length : 82.0


The minimal class size apparently avoids overfitting and gives better results,
with an accuracy of 84%.

# Train several ML
Possible algorithms, based on https://azure.microsoft.com/en-us/documentation/articles/machine-learning-algorithm-choice/
* decision forest / random forest
* logistic regression  
* decision jungle (NA for Python)
* boosted decision tree	
* neural network
* support vector machine


## Random forest tree

In [20]:
# Recommended starting value for max_features: sqrt(number of features) = sqrt(7) ~ 3
# http://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees
rfcTrees = ensemble.RandomForestClassifier(n_estimators=10,max_features=3)
rfcTrees = rfcTrees.fit(tidyTrainData,survivedTrain)

In [21]:
# Score the random forest on the validation sample.
resultrfcTrees = EvaluateModel(rfcTrees, tidyValidateData, survivedValidate)
print ('Accuracy for Random Forest Trees : %s' % Accuracy(resultrfcTrees))

Accuracy for Random Forest Trees : 80.0


## Prepare Data for specific machine learning

In [22]:
def applyNormalization(data, scaler):
    """Return a copy of ``data`` prepared for scale-sensitive models.

    * ``Age`` and ``Fare`` are replaced by ``scaler.transform`` of those two
      columns (``scaler`` is only transformed with here, never fitted).
    * ``Pclass`` is one-hot encoded into ``C1``, ``C2`` and ``C3`` (1.0 when
      the passenger is in that class, 0.0 otherwise) and then dropped.

    ``data`` itself is left untouched.
    """
    cData = data.copy()
    cData[['Age','Fare']] = scaler.transform(data[['Age','Fare']])
    # Each indicator is built explicitly instead of using pd.get_dummies:
    # get_dummies only emits columns for the classes present in ``data``, so a
    # subset missing one class would not line up with the three fixed names.
    for classNumber in (1, 2, 3):
        cData['C%d' % classNumber] = (data['Pclass'] == classNumber).astype(float)
    cData = cData.drop('Pclass',axis=1)
    return cData
# Normalized / one-hot encoded versions of the training and validation features.
ntTrainData = applyNormalization(tidyTrainData, normalizer)
ntValidateData = applyNormalization(tidyValidateData,normalizer)
# The other columns are not one-hot encoded because their values are ordered.
ntTrainData.head(5)

Unnamed: 0,Age,SibSp,Parch,Fare,FamillyNbr,Genre,C1,C2,C3
0,-0.565736,1,0,-0.502445,1,1,0.0,0.0,1.0
2,-0.258337,0,0,-0.488854,0,0,0.0,0.0,1.0
3,0.433312,1,0,0.42073,1,0,1.0,0.0,0.0
4,0.433312,0,0,-0.486337,0,1,0.0,0.0,1.0
5,-0.104637,0,0,-0.478116,0,1,0.0,0.0,1.0


## Logistic Regression

In [23]:
# Logistic regression fitted on the normalized features.
logisticRegression = linear_model.LogisticRegression()
logisticRegression = logisticRegression.fit(ntTrainData,survivedTrain)
resultLogistic = EvaluateModel(logisticRegression, ntValidateData, survivedValidate)
# The label names the model evaluated in this cell; re-run the cell to refresh its recorded output.
print ('Accuracy for Logistic Regression : %s' % Accuracy(resultLogistic))

Accuracy for Random Forest Trees : 84.0


### Display model coefficient

In [24]:
# Pair every feature name with its fitted logistic-regression coefficient.
pd.DataFrame(zip(ntTrainData.columns, np.transpose(logisticRegression.coef_)))

Unnamed: 0,0,1
0,Age,[-0.509330649161]
1,SibSp,[-0.210784780927]
2,Parch,[0.0596357705783]
3,Fare,[0.184189847766]
4,FamillyNbr,[-0.151149010349]
5,Genre,[-2.57502423511]
6,C1,[1.35924795916]
7,C2,[0.467726080305]
8,C3,[-0.631026992638]


## Perceptron

In [25]:
# Perceptron fitted on the normalized features.
perceptron = linear_model.Perceptron(penalty='l1') # l1 is the best penalty found
perceptron = perceptron.fit(ntTrainData,survivedTrain)
resultperceptron = EvaluateModel(perceptron, ntValidateData, survivedValidate)
print ('Accuracy for Perceptron : %s' % Accuracy(resultperceptron))

Accuracy for Perceptron : 82.0


## Boosted decision tree

In [26]:
# First attempt (learning_rate=1).
# NOTE(review): these were labelled "default values"; confirm against the
# GradientBoostingClassifier documentation whether they really are the defaults.
boostTree = GradientBoostingClassifier(n_estimators=100, learning_rate=1,max_depth=1, random_state=0)
boostTree= boostTree.fit(tidyTrainData,survivedTrain)
resultBoostTree = EvaluateModel(boostTree, tidyValidateData, survivedValidate)
print ('Accuracy for Boosted Tree : %s' % Accuracy(resultBoostTree))
# Second attempt with a lower learning rate (0.5). boostTree is rebound here,
# so any later use of boostTree refers to this tuned model.
boostTree = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,max_depth=1, random_state=0)
boostTree= boostTree.fit(tidyTrainData,survivedTrain)
resultBoostTree = EvaluateModel(boostTree, tidyValidateData, survivedValidate)
print ('Accuracy for Boosted Tree tuned : %s' % Accuracy(resultBoostTree))

Accuracy for Boosted Tree : 84.0
Accuracy for Boosted Tree tuned : 86.0


## Neural network
Not yet available in the scikit-learn version used here; the code below is kept for later use.

```python
# MLPClassifier lives in sklearn.neural_network (not imported at the top of the notebook).
from sklearn.neural_network import MLPClassifier
# 'lbfgs' is the solver name ('lbgfs' was a typo and would be rejected).
MLP = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
MLP = MLP.fit(ntTrainData,survivedTrain)
# Evaluate the network itself (not boostTree) and keep its result under its own name.
resultMLP = EvaluateModel(MLP, ntValidateData, survivedValidate)
print ('Accuracy for Neural Network : %s' % Accuracy(resultMLP))
```

## Support vector machine
Several kernels were tried (poly was too computationally intensive); linear gave the best results.

In [27]:
# Linear-kernel SVM, fitted on the tidy (non-normalized) features.
SVMClass = svm.SVC(kernel='linear')
SVMClass= SVMClass.fit(tidyTrainData,survivedTrain)
resultSVM = EvaluateModel(SVMClass, tidyValidateData, survivedValidate)
print ('Accuracy for SVM : %s' % Accuracy(resultSVM))

Accuracy for SVM : 86.0


## K nearest neighbors

In [28]:
# 7-nearest-neighbours classifier, fitted on the tidy (non-normalized) features.
knei = neighbors.KNeighborsClassifier(n_neighbors=7,algorithm='auto')
knei= knei.fit(tidyTrainData,survivedTrain)
resultknei = EvaluateModel(knei, tidyValidateData, survivedValidate)
# The label names the model evaluated in this cell; re-run the cell to refresh its recorded output.
print ('Accuracy for K nearest neighbors : %s' % Accuracy(resultknei))

Accuracy for SVM : 80.0


# Best algorithms
Two algorithms performed equally well: SVM and the tuned boosted tree, both with 86%.
We can compute predictions on the test data and then submit them to Kaggle.
However, some of the test data is incomplete, which could impact the predictions.

In [29]:
# Load the unlabelled test file and add the same FamillyNbr feature as for training.
testData = LoadData('test.csv')
AddFamillyNbr(testData)

# Report which of the columns used by the models contain null values in the test file.
print('Does Age has NA : %s' % CheckForNull(testData.Age))
print('Does Fare has NA : %s' % CheckForNull(testData.Fare))
print('Does Sex has NA : %s' % CheckForNull(testData.Sex))
print('Does Clas has NA : %s' % CheckForNull(testData.Pclass))
print('Does FamillyNbr has NA : %s' % CheckForNull(testData.FamillyNbr))
# Missing values are filled with the medians of the training file, not of the test file.
testData['Age']=testData['Age'].fillna(medianAge)
medianFare = wholeInputData.Fare.median()
testData['Fare']=testData['Fare'].fillna(medianFare)

# Same feature preparation as for the training data.
tidyTestData = TidyData(testData)
normalizedTestData = applyNormalization(tidyTestData,normalizer)

Does Age has NA : True
Does Fare has NA : True
Does Sex has NA : False
Does Clas has NA : False
Does FamillyNbr has NA : False


In [30]:
# Both models are given tidyTestData, the test-set counterpart of the
# non-normalized features passed to them in their earlier fit cells.
testResultSVM = SVMClass.predict(tidyTestData)
testResultBoost = boostTree.predict(tidyTestData)
# One frame per model: PassengerId plus the predicted Survived value.
SVMPd = pd.DataFrame(testData.PassengerId)
SVMPd['Survived'] = testResultSVM
boostPd = pd.DataFrame(testData.PassengerId)
boostPd['Survived'] = testResultBoost
SVMPd.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0


In [31]:
# Preview the boosted-tree predictions.
boostPd.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0


In [32]:
# Write one two-column CSV (PassengerId, Survived) per model, with a header row and without the index.
SVMPd.to_csv(path_or_buf='SVMPredict.csv', columns=['PassengerId','Survived'], index=False,header=True)
boostPd.to_csv(path_or_buf='boostPredict.csv', columns=['PassengerId','Survived'], index=False,header=True)

<img src='Second submission.png' alt='Second submission'/>
The boosted tree does no better than the manual approach

<img src='ThirdSubmission.png' alt='Third Submission'/>
Third submission: SVM is not really better either

## Second best
Using the second best simple approach : trees

In [33]:
# NOTE: the name testResultSVM is reused here, but it now holds the predictions of limitedTree.
testResultSVM = limitedTree.predict(tidyTestData)
treeTest = pd.DataFrame(testData.PassengerId)
treeTest['Survived'] = testResultSVM
treeTest.to_csv(path_or_buf='treePredict.csv', columns=['PassengerId','Survived'], index=False,header=True)

<img src='TreeSubmission.png' alt='Tree Submission'/>
This is not an improvement either

# Cross validation
Using cross validation to improve the estimation of the score
### Tree

In [78]:
def crossEstimate(data,result, model,folds):
    """Estimate the accuracy of a model with k-fold cross validation.

    ``model`` is a zero-argument callable returning a new estimator; a fresh
    one is created and fitted for every fold. ``data`` and ``result`` are
    indexed positionally (``iloc``) with the indices produced by ``KFold``.

    Returns a numpy array holding one accuracy percentage per fold.
    """
    splitter = KFold(len(data), n_folds=folds)
    scores = np.zeros(folds)
    for foldPosition, (train_idx, test_idx) in enumerate(splitter):
        # Fit on the training part of the fold...
        fitted = model().fit(data.iloc[train_idx], result.iloc[train_idx])
        # ...and score on the part that was left out.
        outcome = EvaluateModel(fitted, data.iloc[test_idx], result.iloc[test_idx])
        scores[foldPosition] = Accuracy(outcome)
    return scores
# Cross validation runs on the whole training file (no separate validation sample).
tidyWholeData = TidyData(wholeInputData)
# The estimator class itself serves as the zero-argument factory.
scores = np.array(crossEstimate(tidyWholeData,wholeInputData.Survived,tree.DecisionTreeClassifier,5))
print ('Estimated score for tree %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score 77.558219823 (+- 2.48)


### Random Forest

In [82]:
# The lambda builds a new random forest (same parameters as before) for every fold.
scores = np.array(crossEstimate(tidyWholeData,wholeInputData.Survived,lambda:ensemble.RandomForestClassifier(n_estimators=10,max_features=3),5))
print ('Estimated score for random forest %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for random forest 82.3827757203 (+- 3.21)


### Logistic Regression

In [81]:
# Normalized / one-hot encoded version of the whole training file.
normalizedWholeData=applyNormalization(tidyWholeData,normalizer)
mdl = lambda:linear_model.LogisticRegression()
scores = np.array(crossEstimate(normalizedWholeData,wholeInputData.Survived,mdl,5))
print ('Estimated score for logistic regression %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for logistic regression 79.0119892034 (+- 3.31)


### Perceptron

In [89]:
# 5-fold cross validation of the l1-penalised perceptron on the normalized data.
mdl = lambda:linear_model.Perceptron(penalty='l1')
scores = np.array(crossEstimate(normalizedWholeData,wholeInputData.Survived,mdl,5))
print ('Estimated score for perceptron %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for perceptron 76.5488669889 (+- 3.27)


### Boosted Decision Tree

In [91]:
# 5-fold cross validation of the tuned boosted tree (learning_rate=0.5) on the tidy data.
mdl = lambda:GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,max_depth=1, random_state=0)
scores = np.array(crossEstimate(tidyWholeData,wholeInputData.Survived,mdl,5))
print ('Estimated score for boosted tree %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for boosted tree 81.4838993158 (+- 2.25)


### Support Vector

In [93]:
# 5-fold cross validation of the linear SVM.
# NOTE(review): this runs on the normalized data, whereas the SVM scored on the
# validation sample was fitted on tidyTrainData - the two scores are not directly comparable.
mdl = lambda:svm.SVC(kernel='linear')
scores = np.array(crossEstimate(normalizedWholeData,wholeInputData.Survived,mdl,5))
print ('Estimated score for Support Vector %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for Support Vector 78.7847592744 (+- 2.83)


### K nearest neighbors

In [94]:
# 5-fold cross validation of the 7-nearest-neighbours classifier on the normalized data.
mdl = lambda:neighbors.KNeighborsClassifier(n_neighbors=7,algorithm='auto')
scores = np.array(crossEstimate(normalizedWholeData,wholeInputData.Survived,mdl,5))
# The label names the model evaluated in this cell; re-run the cell to refresh its recorded output.
print ('Estimated score for K nearest neighbors %s (+- %s)'% (scores.mean(),round(scores.std(),2)))

Estimated score for Support Vector 78.0057749043 (+- 2.36)
