In [22]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Figures inline and set visualization style
%matplotlib inline
sns.set()

In [26]:
# Import data - Let's just use the training set
dataset = pd.read_csv('train.csv')
#df_test = pd.read_csv('test.csv')

In [27]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
dataset.shape #training set has target label (survived / not survived)

(891, 12)

In [29]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [30]:
# Store target variable in another variable
y_var = dataset.Survived
X_vars = dataset.drop(['Survived'], axis=1)
X_vars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [31]:
# check distribution, skewed or symmetrical --> imputation
#pd.DataFrame.hist(dataset, figsize = [15,15]);

In [35]:
# Since the age data is skewed, let's use median instead of mean for imputation

# Impute missing numerical variables
dataset['Age'] = dataset.Age.fillna(dataset.Age.median())
dataset['Fare'] = dataset.Fare.fillna(dataset.Fare.median())

# Check out info of data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [37]:
# pd.get_dummies is used to transform categorical data into binary data
# Here, we want to transform "male & female" into 2 separate columns
data_dummy1 = pd.get_dummies(dataset, columns=['Sex'])

# We only need Sex_female or Sex_male; we don't need both
data_dummy1 = pd.get_dummies(dataset, columns=['Sex'], drop_first=True)
data_dummy1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1


In [38]:
# Choose features which you think are relevant to the decision making/prediction
# Exclude name, ticket number, cabin, embarkation port
data_5_vars = data_dummy1[['Sex_male', 'Fare', 'Age','Pclass', 'SibSp']]
data_5_vars.head()

Unnamed: 0,Sex_male,Fare,Age,Pclass,SibSp
0,1,7.25,22.0,3,1
1,0,71.2833,38.0,1,1
2,0,7.925,26.0,3,0
3,0,53.1,35.0,1,1
4,1,8.05,35.0,3,0


In [39]:
data_5_vars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Sex_male    891 non-null uint8
Fare        891 non-null float64
Age         891 non-null float64
Pclass      891 non-null int64
SibSp       891 non-null int64
dtypes: float64(2), int64(2), uint8(1)
memory usage: 28.8 KB


In [43]:
# Assign feature matrix X, and target label y
X = data_5_vars.values
y = y_var.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [44]:
X.shape

(891, 5)

In [45]:
X_train.shape

(712, 5)

In [46]:
X_test.shape

(179, 5)

In [47]:
# Instantiate model and fit to data
clf = tree.DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)
#tree.DecisionTreeClassifier?

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [48]:
# Make predictions and store in 'Survived' column of df_test
Y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred))

             precision    recall  f1-score   support

          0       0.74      0.95      0.83       110
          1       0.86      0.46      0.60        69

avg / total       0.79      0.77      0.74       179



In [49]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, Y_pred)

array([[105,   5],
       [ 37,  32]], dtype=int64)

In [52]:
# plot the decision tree
#!pip install graphviz
#import graphviz 
#dot_data = tree.export_graphviz(clf, out_file=None) 
#graph = graphviz.Source(dot_data) 
#graph.render("Titanic") 
#graph

In [53]:
prediction_output = np.hstack((X_test, Y_pred.reshape(len(X_test),1)))
prediction_output.shape

(179, 6)

In [58]:
prediction_output = pd.DataFrame(prediction_output, columns=['sex_male','Fare','Age','Pclass','SibSp','Survived'])
prediction_output.head()

Unnamed: 0,sex_male,Fare,Age,Pclass,SibSp,Survived
0,1.0,24.15,24.0,3.0,2.0,0.0
1,1.0,16.1,44.0,3.0,0.0,0.0
2,1.0,7.225,22.0,3.0,0.0,0.0
3,1.0,14.1083,41.0,3.0,2.0,0.0
4,0.0,15.5,28.0,3.0,1.0,0.0


In [None]:
# create another column to show the actual
# find a way to plot confusion matrix
