In [1]:
import pandas as pd

# Load the data sets to variables based on purpose
training_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

# Removed meaningless data Name, Ticket, and Cabin since it has several missing values
training_data = training_data.drop(['Name','PassengerId','Ticket','Cabin'], axis=1)

# Combine Siblings, parents, children, spouse, into one family group.
training_data['Fml'] = training_data['SibSp'] + training_data['Parch']

# Dropping columns that were merged
training_data = training_data.drop(['SibSp','Parch'], axis=1)

print(training_data.head(5))

   Survived  Pclass     Sex   Age     Fare Embarked  Fml
0         0       3    male  22.0   7.2500        S    1
1         1       1  female  38.0  71.2833        C    1
2         1       3  female  26.0   7.9250        S    0
3         1       1  female  35.0  53.1000        S    1
4         0       3    male  35.0   8.0500        S    0


In [2]:
print(training_data.isnull().sum())

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
Embarked      2
Fml           0
dtype: int64


In [3]:
print(training_data.Embarked.value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [4]:
# From embarked, only 2 missing, so fill the missing with most found data
training_data['Embarked'] = training_data['Embarked'].fillna('S')

# Fill missing age with median age
median_age = training_data['Age'].median()
training_data['Age'] = training_data['Age'].fillna(median_age) 

print(training_data.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Fml         0
dtype: int64


In [5]:
# Convert Sex, and Embarked to numerical values
training_data['Sex'] = training_data['Sex'].map({'male':0, 'female':1})
training_data['Embarked'] = training_data['Embarked'].map({'S':0, 'C':1, 'Q':2})

print(training_data.head(5))

   Survived  Pclass  Sex   Age     Fare  Embarked  Fml
0         0       3    0  22.0   7.2500         0    1
1         1       1    1  38.0  71.2833         1    1
2         1       3    1  26.0   7.9250         0    0
3         1       1    1  35.0  53.1000         0    1
4         0       3    0  35.0   8.0500         0    0


In [6]:

# Normalize the data to fit between 0-1 floating point values only
for column in training_data.columns:
    training_data[column] = training_data[column]  / training_data[column].abs().max()
    
print(training_data.head(5))


   Survived    Pclass  Sex     Age      Fare  Embarked  Fml
0       0.0  1.000000  0.0  0.2750  0.014151       0.0  0.1
1       1.0  0.333333  1.0  0.4750  0.139136       0.5  0.1
2       1.0  1.000000  1.0  0.3250  0.015469       0.0  0.0
3       1.0  0.333333  1.0  0.4375  0.103644       0.0  0.1
4       0.0  1.000000  0.0  0.4375  0.015713       0.0  0.0


In [7]:
# Set the X and y value for features and groundtruth
X = training_data.drop(columns='Survived')
y = training_data.Survived

print(X.head(5))

     Pclass  Sex     Age      Fare  Embarked  Fml
0  1.000000  0.0  0.2750  0.014151       0.0  0.1
1  0.333333  1.0  0.4750  0.139136       0.5  0.1
2  1.000000  1.0  0.3250  0.015469       0.0  0.0
3  0.333333  1.0  0.4375  0.103644       0.0  0.1
4  1.000000  0.0  0.4375  0.015713       0.0  0.0


In [8]:
# Create and train decision tree model
from sklearn import tree
# DT = tree.DecisionTreeClassifier()
DT = tree.DecisionTreeClassifier(criterion="gini", max_depth=3)
DT_model = DT.fit(X, y)

In [9]:
#Graph DT model 
import graphviz

dot_data = tree.export_graphviz(DT_model, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("DT")


'DT.pdf'

In [10]:
# Check for missing values in test data
print(test_data.isnull().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [11]:
print(X.columns)

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Fml'], dtype='object')


In [12]:
# We can disregard cabin, but we must handle missing values for Age and Fare
median_age = test_data['Age'].median()
test_data['Age'] = test_data['Age'].fillna(median_age) 

# Fare values are distributed much like age, so use same method
median_fare = test_data['Fare'].median()
test_data['Fare'] = test_data['Fare'].fillna(median_fare)

# Sort by passanger ID
test_data.sort_values(by=['PassengerId'])

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,27.0,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,27.0,0,0,359309,8.0500,,S


In [13]:
print(test_data.isnull().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [14]:
# Normalize test data as we did with the training data
test_data['Fml'] = test_data['SibSp'] + test_data['Parch']
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})
test_data['Embarked'] = test_data['Embarked'].map({'S':0, 'C':1, 'Q':2})

features = list(X.columns)
X_test_data = test_data[features]

for column in X_test_data.columns:
     X_test_data[column] = X_test_data[column] / X_test_data[column].abs().max()

print(X_test_data.head(5))

     Pclass  Sex       Age      Fare  Embarked  Fml
0  1.000000  0.0  0.453947  0.015282       1.0  0.0
1  1.000000  1.0  0.618421  0.013663       0.0  0.1
2  0.666667  0.0  0.815789  0.018909       1.0  0.0
3  1.000000  0.0  0.355263  0.016908       0.0  0.0
4  1.000000  1.0  0.289474  0.023984       0.0  0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_data[column] = X_test_data[column] / X_test_data[column].abs().max()


In [15]:
predictions = DT_model.predict(X_test_data)

predictions

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [16]:
from sklearn.metrics import accuracy_score


accuracy_score(gender_submission.Survived, predictions)

0.9688995215311005

In [17]:
# Beginning five fold cross validation
from sklearn.model_selection import KFold, cross_val_score

five_fold = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_val_score(DT, X, y, scoring='accuracy', cv=five_fold, n_jobs=-1)
sum(scores) / len(scores)

0.8114556525014123

In [18]:
# Create random forest model
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(criterion='gini', min_samples_split=2, n_estimators=100)
scores = cross_val_score(RF, X, y, scoring='accuracy', cv=five_fold, n_jobs=-1)
sum(scores) / len(scores)

0.8215868432615656

In [19]:
scores

array([0.7877095 , 0.78651685, 0.82022472, 0.86516854, 0.84831461])

In [20]:
sum(scores) / len(scores)

0.8215868432615656