### Jupyter notebook

A Python development environment


Installation guide: https://jupyter.org/install


### Essential ML libraries:

* scikit-learn
* pandas
* matplotlib
* numpy

Data for ML projects is usually presented in tabular form and manipulated via pandas DataFrames.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree

from sklearn.model_selection import KFold 

%matplotlib inline

In [None]:
# Directory holding the Kaggle Titanic CSV files.
# Set the TITANIC_DATA_DIR environment variable to point at your own copy of the data;
# when it is unset the original local checkout path is used, so existing setups keep working.
import os

root = os.environ.get('TITANIC_DATA_DIR', '/Users/schwalmdaniel/github/kaggle/titanic')
#root = 'd:/dev/python/kaggle/titanic'

# Load the training set into a DataFrame.
train=pd.read_csv(root + "/train.csv")

### Exploratory Data Analysis

In [None]:
# first let's see what is the shape of the data: DataFrame.shape is (rows, cols)

train.shape

In [None]:
# Show the data type of every column. The types were inferred from the data when it was loaded.
# NOTE(review): 'fat' data types presumably means wide defaults such as int64/float64/object -
# confirm against the displayed output.

train.dtypes

In [None]:
# finally let's have a look at the data itself (first 10 rows)

train.head(10)

In [None]:
# we want to predict whether a given person survived the disaster or not,
# so first count how many rows fall into each value of the target column

train['Survived'].value_counts() 

**Accuracy**: Percentage of correct predictions made by the model


The **Null Accuracy** (the accuracy obtained by always predicting the most frequent class) for this prediction is 61%.

This means that by always predicting 'not survived' we already achieve 61% accuracy without creating any model.
We have to be better than this.

In [None]:
# Missing values seriously impact models, so count them per column first.

train.isna().sum()

In [None]:
# Cabin has a lot of missing data, so leave the column out for now.

train = train.drop(columns=['Cabin'])

In [None]:
# check the Age feature: summary statistics of the column
train.Age.describe()

In [None]:
# Fill the missing Age values with random integers drawn from the interval
# [mean - std, mean + std) of the observed ages.

mean = train["Age"].mean()
std = train["Age"].std()

# A dedicated, seeded generator makes the imputed ages identical on every
# Restart & Run All; the unseeded global np.random state did not.
age_rng = np.random.RandomState(42)

# Draw all replacements in a single call. The bounds are truncated to integers
# explicitly rather than relying on randint to accept floats; the upper bound is exclusive.
age_is_missing = train["Age"].isnull()
train.loc[age_is_missing, "Age"] = age_rng.randint(
    int(mean - std),
    int(mean + std),
    size=int(age_is_missing.sum()),
)

# Re-check the per-column missing counts after the fill.
train.isnull().sum()

In [None]:
# check the Embarked field. Note how the describe() output differs between object and numeric columns

train.Embarked.describe()

In [None]:
# Convert the column to plain strings, treating missing values as empty strings.
# The result is built from the column and assigned back instead of calling
# fillna(inplace=True) on train['Embarked']: the in-place form acts on an
# intermediate object, pandas warns about it, and with copy-on-write enabled it
# leaves `train` untouched (NaN would then turn into the string 'nan' below).

embarked = train['Embarked'].fillna('').astype(str)

# fill the missing data (empty or whitespace-only entries) with 'S', the most frequent value in this case

train['Embarked'] = embarked.apply(lambda x: 'S' if not x or not x.strip() else x)

# Re-check the per-column missing counts after the fill.
train.isnull().sum()

In [None]:
# The missing data has been dealt with at this point.

# Machines only understand numeric data, so the remaining text columns are converted to numbers.


# Manual encoding: 'male' becomes 1, every other value becomes 0.
train['Sex'] = (train['Sex'] == 'male').astype('int64')


# One Hot Encoding for the categorical variables: one indicator column per distinct value.
train = pd.get_dummies(
    train,
    columns=['Embarked', 'Parch'],
    prefix_sep='__',
)


In [None]:
# PassengerId carries no information for training, so it is dropped.
# Name and Ticket are dropped for now as well, although they may carry meaningful information.

unused_columns = ['PassengerId', 'Name', 'Ticket']
train = train.drop(columns=unused_columns)

In [None]:
# Now that the text columns have been encoded or dropped we are ready for training;
# look at the first 10 rows of the prepared data

train.head(10)

In [None]:
# The dataset consists of a target variable (i.e. the label) and a number of features.
# By convention 'X' refers to the features and 'y' to the target variable.
# Separate the prepared frame accordingly.

y = train['Survived']
X = train.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the rows as a validation set before training,
# so the model's accuracy can be checked on data it has not seen.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [None]:
# Report the (rows, cols) shape of the training and the held-out feature frames.
print ('Training shape: %s, test shape: %s' % (X_train.shape, X_test.shape))

In [None]:
# let us pick four frequently used models in their naive form, i.e. without any fine tuning
# run it multiple times and see what happens

lr = LogisticRegression()
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

for model in [lr, knn, d_tree, forest]:
    model.fit(X_train, y_train)
    # score() predicts on X_test itself and returns the accuracy, so a separate
    # predict() call whose result is thrown away is not needed here.
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    

In [None]:
# Render the top levels (max_depth=4) of the fitted decision tree as a PNG image.
# matplotlib-based alternative, kept for reference:
#plt.figure(figsize=(20,8))
#tree.plot_tree(d_tree, fontsize=9, label='none', class_names=True, max_depth=5)

# StringIO comes from the standard library: sklearn.externals.six was deprecated
# and then removed from scikit-learn, which made this cell fail on import.
from io import StringIO
import pydotplus
from IPython.display import Image

# export_graphviz writes the DOT description of the tree into the in-memory buffer
dot_data = StringIO()
tree.export_graphviz(d_tree, out_file=dot_data, feature_names=X_train.columns,max_depth=4)

# pydotplus turns the DOT text into a graph object that can be rendered to PNG bytes
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# the accuracy was not bad but there was a variance in the accuracy scores
# to get a glimpse on the average accuracy let us do a K-Fold cross validation
# and report the mean accuracy over the folds for every model

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=37)

for model in [lr, knn, d_tree, forest]:

    # accuracy of this model on each held-out fold
    fold_scores = []

    for train_index, test_index in kf.split(X):
        # KFold.split yields positional row numbers, so the rows are selected with
        # .iloc; .loc would treat them as index labels and only gives the same
        # rows while X keeps its default 0..n-1 index.
        # Note: this reuses (and overwrites) the X_train/X_test/y_train/y_test names.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        # score() predicts on the fold itself, no separate predict() call is needed
        fold_scores.append(model.score(X_test, y_test))

    print ('%s mean accuracy score: %f' % (model.__class__.__name__, np.mean(fold_scores)))
    

In [None]:
# let's pick the random forest classifier as our best model
# and visualize our results
# run this multiple times to see the randomness of the model

model_features = list(X.columns)

# one feature-importance frame per fold; concatenated once after the loop
# instead of growing a DataFrame inside the loop
fold_importances = []

# accuracy of the model on each held-out fold
fold_scores = []

model = forest

for fold_, (train_index, test_index) in enumerate(kf.split(X)):
    # KFold.split yields positional row numbers, hence .iloc rather than .loc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    # predictions and y_test of the last fold stay available after the loop
    predictions = model.predict(X_test)

    conf_mat = confusion_matrix(y_test, predictions)

    fold_scores.append(model.score(X_test, y_test))

    fold_importances.append(pd.DataFrame({
        "feature": model_features,
        "importance": model.feature_importances_,
        "fold": fold_ + 1,
    }))

print ('%s mean accuracy score: %f' % (model.__class__.__name__, np.mean(fold_scores)))

feature_importance_df = pd.concat(fold_importances, axis=0)

# features ordered by their mean importance over the folds, capped at the top 1000
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

# one row per (feature, fold) is passed in, so barplot aggregates the folds per feature
plt.figure(figsize=(8,8))
sns.barplot(x="importance",y="feature",data=best_features.sort_values(by="importance", ascending=False))
plt.title('Most Important Features (avg over folds)')
plt.tight_layout()



In [None]:
# Plot the pairwise correlation matrix of the prepared columns as an annotated heatmap.

plt.figure(figsize=(14, 10))
sns.heatmap(data=train.corr(), annot=True)

In [None]:
# Plot the confusion matrix for the last prediction
# (y_test and predictions as left behind by the most recently run fold).

conf_mat = confusion_matrix(y_test, predictions)
ax = sns.heatmap(conf_mat, annot=True, fmt=".0f")
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()