In [None]:
# train-test split evaluation of models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# load the training and testing datadata
train_data = pd.read_csv("train.csv", usecols=["PassengerId","Pclass", "Ticket","Survived","Sex", "Cabin", "Age","SibSp","Parch","Fare","Embarked"])
test_data = pd.read_csv("test.csv", usecols=["PassengerId","Pclass", "Ticket", "Sex", "Cabin", "Age","SibSp","Parch","Fare","Embarked"])

In [None]:
# get a good look of the data
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [None]:
# change the sex column into number values
# train data
all_sexes = pd.read_csv("train.csv", usecols=["Sex"])
sexList = []
for i in range(len(train_data)):
    testing_row = all_sexes.loc[i]
    for sex in testing_row:
        if sex == "female":
            count = 0
        else:
            count = 1
    sexList.append(count)
train_data["Sex"] = sexList

# test data
all_sexes = pd.read_csv("test.csv", usecols=["Sex"])
sexList = []
for i in range(len(test_data)):
    testing_row = all_sexes.loc[i]
    for sex in testing_row:
        if sex == "female":
            count = 0
        else:
            count = 1
    sexList.append(count)
test_data["Sex"] = sexList

In [None]:
# change the embarked column into number values
# train data
all_embark = pd.read_csv("train.csv", usecols=["Embarked"])
embarkList = []
for i in range(len(train_data)):
    testing_row = all_embark.loc[i]
    for dest in testing_row:
        if dest == "S":
            count = 1
        elif dest == "C":
            count = 2
        else:
            count = 1
    embarkList.append(count)
train_data["Embarked"] = embarkList

# test data
all_embark = pd.read_csv("test.csv", usecols=["Embarked"])
embarkList = []
for i in range(len(test_data)):
    testing_row = all_embark.loc[i]
    for dest in testing_row:
        if dest == "S":
            count = 1
        elif dest == "C":
            count = 2
        else:
            count = 1
    embarkList.append(count)
test_data["Embarked"] = embarkList

In [None]:
# change the name column into numbers by converting the titles in passenger names to numbers
# train data
all_names = pd.read_csv("train.csv", usecols=["Name"])
nameList = []
for i in range(len(train_data)):
    testing_row = all_names.loc[i]
    for name in testing_row:
        if name.find("Master") >= 0:
            n = 4
        elif name.find("Don") >= 0:
            n = 1
        elif name.find("Dr") >= 0:
            n = 3
        elif name.find("Rev") >= 0:
            n = 2
        elif name.find("Lady") >= 0:
            n = 5
        elif name.find("Countess") >= 0:
            n = 6
        elif name.find("Capt") >= 0:
            n = 7
        elif name.find("Col") >= 0:
            n = 8
        elif name.find("Major") >= 0:
            n = 9
        elif name.find("Sir") >= 0:
            n = 10
        elif name.find("Jonkheer") >= 0:
            n = 11
        elif name.find("Mlle") >= 0:
            n = 12
        elif name.find("Ms") >= 0:
            n = 12
        elif name.find("Mme") >= 0:
            n = 14
        elif name.find("Miss") >= 0:
            n = 12
        elif name.find("Mrs") >= 0:
            n = 14
        elif name.find("Mr") >= 0:
            n = 15
        else:
            n = 0
    nameList.append(n)
train_data["Name"] = nameList

# test data
all_names = pd.read_csv("test.csv", usecols=["Name"])
nameList = []
for i in range(len(test_data)):
    testing_row = all_names.loc[i]
    for name in testing_row:
        if name.find("Master") >= 0:
            n = 4
        elif name.find("Don") >= 0:
            n = 1
        elif name.find("Dr") >= 0:
            n = 3
        elif name.find("Rev") >= 0:
            n = 2
        elif name.find("Lady") >= 0:
            n = 5
        elif name.find("Countess") >= 0:
            n = 6
        elif name.find("Capt") >= 0:
            n = 7
        elif name.find("Col") >= 0:
            n = 8
        elif name.find("Major") >= 0:
            n = 9
        elif name.find("Sir") >= 0:
            n = 10
        elif name.find("Jonkheer") >= 0:
            n = 11
        elif name.find("Mlle") >= 0:
            n = 12
        elif name.find("Ms") >= 0:
            n = 12
        elif name.find("Mme") >= 0:
            n = 14
        elif name.find("Miss") >= 0:
            n = 12
        elif name.find("Mrs") >= 0:
            n = 14
        elif name.find("Mr") >= 0:
            n = 15
        else:
            n = 0
    nameList.append(n)
test_data["Name"] = nameList

In [None]:
# find the percentage of null values for each column with nulls
# age
# train
numerator = train_data["Age"].isnull().sum()
denominator = train_data["Age"].isnull().count()
agepercent = ((numerator / denominator) * 100)
print(agepercent)
# test
numerator = test_data["Age"].isnull().sum()
denominator = test_data["Age"].isnull().count()
agepercent = ((numerator / denominator) * 100)
print(agepercent)

# cabin
# train
numerator = train_data["Cabin"].isnull().sum()
denominator = train_data["Cabin"].isnull().count()
cabinpercent = ((numerator / denominator) * 100)
print(cabinpercent)
# test
numerator = test_data["Cabin"].isnull().sum()
denominator = test_data["Cabin"].isnull().count()
cabinpercent = ((numerator / denominator) * 100)
print(cabinpercent)

# embarked
# train
numerator = train_data["Embarked"].isnull().sum()
denominator = train_data["Embarked"].isnull().count()
embarkedpercent = ((numerator / denominator) * 100)
print(embarkedpercent)
# test
numerator = test_data["Embarked"].isnull().sum()
denominator = test_data["Embarked"].isnull().count()
embarkedpercent = ((numerator / denominator) * 100)
print(embarkedpercent)

# fare
# train
numerator = train_data["Fare"].isnull().sum()
denominator = train_data["Fare"].isnull().count()
farepercent = ((numerator / denominator) * 100)
print(farepercent)
# test
numerator = test_data["Fare"].isnull().sum()
denominator = test_data["Fare"].isnull().count()
farepercent = ((numerator / denominator) * 100)
print(farepercent)

19.865319865319865
20.574162679425836
77.10437710437711
78.22966507177034
0.0
0.0
0.0
0.23923444976076555


In [None]:
# fill in the missing values
# age
train_data["Age"].fillna(train_data["Age"].mean(), inplace = True)
test_data["Age"].fillna(test_data["Age"].mean(), inplace = True)
# fare
train_data["Fare"].fillna(train_data["Fare"].mean(), inplace = True)
test_data["Fare"].fillna(test_data["Fare"].mean(), inplace = True)

In [None]:
# change the age column into smaller numbers by converting into age ranges
# train data
all_ages = pd.read_csv("train.csv", usecols=["Age"])
ageList = []
for i in range(len(train_data)):
    testing_row = all_ages.loc[i]
    for age in testing_row:
        if age < 13:
            count = 1
        elif age > 12 and age < 21:
            count = 2
        elif age > 21 and age < 45:
            count = 3
        else:
            count = 4
    ageList.append(count)
train_data["AgeRange"] = ageList

# test data
all_ages = pd.read_csv("test.csv", usecols=["Age"])
ageList = []
for i in range(len(test_data)):
    testing_row = all_ages.loc[i]
    for age in testing_row:
        if age < 13:
            count = 1
        elif age > 12 and age < 21:
            count = 2
        elif age > 21 and age < 45:
            count = 3
        else:
            count = 4
    ageList.append(count)
test_data["AgeRange"] = ageList

In [None]:
# convert cabin to more categorical feature
train_data["Deck"] = train_data["Cabin"].apply(lambda x: x[0] if pd.notnull(x) else "M")
test_data["Deck"] = test_data["Cabin"].apply(lambda x: x[0] if pd.notnull(x) else "M")

# now drop the irrelevant cabin feature
train_data = train_data.drop("Cabin", axis = 1)
test_data = test_data.drop("Cabin", axis = 1)

# drop the now irrelevant age columns
train_data = train_data.drop("Age", axis = 1)
test_data = test_data.drop("Age", axis = 1)

In [None]:
# convert deck column into numbers for later use
# train data
decks = train_data["Deck"]
deckList = []
for i in range(len(train_data)):
    testing_row = decks.loc[i]
    for deck in testing_row:
        if deck == "A":
            count = 1
        elif deck == "B":
            count = 2
        elif deck == "C":
            count = 3
        elif deck == "D":
            count = 4
        elif deck == "E":
            count = 5
        elif deck == "F":
            count = 6
        elif deck == "G":
            count = 7
        else:
            count = 8
    deckList.append(count)
train_data = train_data.drop("Deck", axis = 1)
train_data["Deck"] = deckList

# test data
decks = test_data["Deck"]
deckList = []
for i in range(len(test_data)):
    testing_row = decks.loc[i]
    for deck in testing_row:
        if deck == "A":
            count = 1
        elif deck == "B":
            count = 2
        elif deck == "C":
            count = 3
        elif deck == "D":
            count = 4
        elif deck == "E":
            count = 5
        elif deck == "F":
            count = 6
        elif deck == "G":
            count = 7
        else:
            count = 8
    deckList.append(count)
test_data = test_data.drop("Deck", axis = 1)
test_data["Deck"] = deckList

In [None]:
# create a new feature to see how large a family is
train_data["totalRelatives"] = train_data["SibSp"] + train_data["Parch"]
test_data["totalRelatives"] = test_data["SibSp"] + test_data["Parch"]

# analyze the results of this
train_data["totalRelatives"].describe()

count    891.000000
mean       0.904602
std        1.613459
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       10.000000
Name: totalRelatives, dtype: float64

In [None]:
# as most families do not have 11 people (10 relatives), further divide into categories
all_families = train_data["totalRelatives"]
familyList = []
for i in range(len(train_data)):
    if all_families[i] == 0:
        count = 0
    elif all_families[i] > 0 and all_families[i] < 4:
        count = 1
    elif all_families[i] > 3 and all_families[i] < 6:
        count = 2
    else:
        count = 3
    familyList.append(count)
train_data["SizeOfFamily"] = familyList

# test data
all_families = test_data["totalRelatives"]
familyList = []
for i in range(len(test_data)):
    if all_families[i] == 0:
        count = 0
    elif all_families[i] > 0 and all_families[i] < 4:
        count = 1
    elif all_families[i] > 3 and all_families[i] < 6:
        count = 2
    else:
        count = 3
    familyList.append(count)
test_data["SizeOfFamily"] = familyList

# drop the totalRelatives column in favor for the sizeOfFamily
train_data = train_data.drop("totalRelatives", axis = 1)
test_data = test_data.drop("totalRelatives", axis = 1)

In [None]:
# create a new column to see if passengers are alone or not
# train data
famList = []
for i in range(len(train_data["SizeOfFamily"])):
    if train_data["SizeOfFamily"].loc[i] == 0:
        val = 1
    else:
        val = 0
    famList.append(val)
train_data["Alone"] = famList

# test data
famList = []
for i in test_data["SizeOfFamily"]:
    if test_data["SizeOfFamily"].loc[i] == 0:
        val = 1
    else:
        val = 0
    famList.append(val)
test_data["Alone"] = famList

In [None]:
# group the tickets by how often they occur for new column
# training data
train_data["OccurenceofTickets"] = train_data.groupby("Ticket")["Ticket"].transform("count")

# training data
test_data["OccurenceofTickets"] = test_data.groupby("Ticket")["Ticket"].transform("count")

# now drop the original ticket columns
train_data = train_data.drop("Ticket", axis = 1)
test_data = test_data.drop("Ticket", axis = 1)

In [None]:
# check the current state of the data
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Name,AgeRange,Deck,SizeOfFamily,Alone,OccurenceofTickets
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,0.523008,0.381594,32.204208,1.188552,13.340067,3.075196,6.945006,0.494949,0.602694,1.787879
std,257.353842,0.486592,0.836071,0.47799,1.102743,0.806057,49.693429,0.391372,3.006353,0.885566,2.0615,0.708478,0.489615,1.361142
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
25%,223.5,0.0,2.0,0.0,0.0,0.0,7.9104,1.0,12.0,3.0,8.0,0.0,0.0,1.0
50%,446.0,0.0,3.0,1.0,0.0,0.0,14.4542,1.0,15.0,3.0,8.0,0.0,1.0,1.0
75%,668.5,1.0,3.0,1.0,1.0,0.0,31.0,1.0,15.0,4.0,8.0,1.0,1.0,2.0
max,891.0,1.0,3.0,1.0,8.0,6.0,512.3292,2.0,15.0,4.0,8.0,3.0,1.0,7.0


In [None]:
# since the data has mostly 12, 14, and 15 for its name values, keep those and group others together
# train data
all_names = pd.read_csv("train.csv", usecols=["Name"])
nameList = []
for i in range(len(train_data)):
    testing_row = all_names.loc[i]
    for name in testing_row:
        if name.find("Master") >= 0:
            n = 4
        elif name.find("Don") >= 0:
            n = 5
        elif name.find("Dr") >= 0:
            n = 5
        elif name.find("Rev") >= 0:
            n = 5
        elif name.find("Lady") >= 0:
            n = 5
        elif name.find("Countess") >= 0:
            n = 5
        elif name.find("Capt") >= 0:
            n = 5
        elif name.find("Col") >= 0:
            n = 5
        elif name.find("Major") >= 0:
            n = 5
        elif name.find("Sir") >= 0:
            n = 5
        elif name.find("Jonkheer") >= 0:
            n = 5
        elif name.find("Mlle") >= 0:
            n = 2
        elif name.find("Ms") >= 0:
            n = 2
        elif name.find("Mme") >= 0:
            n = 3
        elif name.find("Miss") >= 0:
            n = 2
        elif name.find("Mrs") >= 0:
            n = 3
        elif name.find("Mr") >= 0:
            n = 1
        else:
            n = 0
    nameList.append(n)
train_data["Name"] = nameList

# test data
all_names = pd.read_csv("test.csv", usecols=["Name"])
nameList = []
for i in range(len(test_data)):
    testing_row = all_names.loc[i]
    for name in testing_row:
        if name.find("Master") >= 0:
            n = 4
        elif name.find("Don") >= 0:
            n = 5
        elif name.find("Dr") >= 0:
            n = 5
        elif name.find("Rev") >= 0:
            n = 5
        elif name.find("Lady") >= 0:
            n = 5
        elif name.find("Countess") >= 0:
            n = 5
        elif name.find("Capt") >= 0:
            n = 5
        elif name.find("Col") >= 0:
            n = 5
        elif name.find("Major") >= 0:
            n = 5
        elif name.find("Sir") >= 0:
            n = 5
        elif name.find("Jonkheer") >= 0:
            n = 5
        elif name.find("Mlle") >= 0:
            n = 2
        elif name.find("Ms") >= 0:
            n = 2
        elif name.find("Mme") >= 0:
            n = 3
        elif name.find("Miss") >= 0:
            n = 2
        elif name.find("Mrs") >= 0:
            n = 3
        elif name.find("Mr") >= 0:
            n = 1
        else:
            n = 0
    nameList.append(n)
test_data["Name"] = nameList

In [None]:
# check the current state of the data (again) for changes
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Name,AgeRange,Deck,SizeOfFamily,Alone,OccurenceofTickets
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,0.523008,0.381594,32.204208,1.188552,1.789001,3.075196,6.945006,0.494949,0.602694,1.787879
std,257.353842,0.486592,0.836071,0.47799,1.102743,0.806057,49.693429,0.391372,1.107854,0.885566,2.0615,0.708478,0.489615,1.361142
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
25%,223.5,0.0,2.0,0.0,0.0,0.0,7.9104,1.0,1.0,3.0,8.0,0.0,0.0,1.0
50%,446.0,0.0,3.0,1.0,0.0,0.0,14.4542,1.0,1.0,3.0,8.0,0.0,1.0,1.0
75%,668.5,1.0,3.0,1.0,1.0,0.0,31.0,1.0,2.0,4.0,8.0,1.0,1.0,2.0
max,891.0,1.0,3.0,1.0,8.0,6.0,512.3292,2.0,5.0,4.0,8.0,3.0,1.0,7.0


In [None]:
# remove any lingering columns made irrelevant by feature engineering
# column list
droppingcolumns = ["Parch", "SibSp"]
train_data = train_data.drop(droppingcolumns, axis = 1)
test_data = test_data.drop(droppingcolumns, axis = 1)

In [None]:
# now build the new model
# splitting data into X and y
y_train = train_data["Survived"]

X_train = train_data.drop("Survived", axis=1)
X_test = test_data

In [None]:
# get the best parameters for our model
# instantiate Random Forest model
forest = RandomForestClassifier()

# create parameters for the random forest to test 
forest_parameters = dict(max_depth = [x for x in range(5, 20)], min_samples_split = [x for x in range(2, 20)], 
    min_samples_leaf = [x for x in range(2, 10)], n_estimators = [x for x in range(50, 120, 50)],)

# choose and setup the random forest as the model
forest_model = RandomForestClassifier()
forest = GridSearchCV(estimator = forest, param_grid = forest_parameters, cv = 5) 
forest.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# gather which model gives the best score and what the best parameters are
print("Random forest best score: {}".format(forest.best_score_))
print("Random forest best parameters: {}".format(forest.best_estimator_))

AttributeError: 'RandomForestClassifier' object has no attribute 'best_score_'

In [None]:
# use output from before so we don't have to continuously gather the best parameters when rerunning the code
forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# make predictions for test data
y_pred = forest.predict(X_test)

In [None]:
# storing result in csv file
test_data["Survived"] = y_pred
result = test_data[["PassengerId","Survived"]]
result.to_csv("Submission.csv", index=False)