Data Dictionary
Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

**survival:    Survival
PassengerId: Unique Id of a passenger. 
pclass:    Ticket class     
sex:    Sex     
Age:    Age in years     
sibsp:    # of siblings / spouses aboard the Titanic     
parch:    # of parents / children aboard the Titanic     
ticket:    Ticket number     
fare:    Passenger fare     
cabin:    Cabin number     
embarked:    Port of Embarkation
titanic_df.describe()**


In [20]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
# from sklearn import linear_model
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import tree
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier

In [21]:
titanic_df = pd.read_csv("clean4ML2.csv")


In [22]:
titanic_df.head(4)

Unnamed: 0,algo,titanic,space,flights,housing,quakes,cancer,stocks,useful
0,1.0,0.982036,1.0,0.999881,0.854642,0.854642,1.0,0.205761,1.0
1,2.0,0.744395,0.605505,0.999156,0.756146,0.754153,0.937062,0.086419,0.0
2,3.0,0.982036,1.0,0.999853,1.0,0.849285,1.0,0.205761,1.0
3,4.0,0.802691,0.714577,0.999818,1.0,0.845183,0.972027,0.092592,1.0


In [23]:
#titanic_df['useful'] = titanic_df['useful'].map({'female': 1, 'male': 0})
#titanic_df.head(4)

In [24]:
data = titanic_df#.drop(columns=['PassengerId', 'Name', "Ticket", "Embarked", "Cabin"])
data.sample(10)

Unnamed: 0,algo,titanic,space,flights,housing,quakes,cancer,stocks,useful
23,24.0,0.829341,0.3899083,0.0,0.0,0.0,0.939632,0.127572,1.0
18,19.0,0.0,0.0,0.9195876,0.01297626,0.012976,0.0,0.0,0.0
2,3.0,0.982036,1.0,0.9998532,1.0,0.849285,1.0,0.205761,1.0
22,23.0,0.780269,0.0733945,0.0,0.0,0.0,0.941489,0.074074,0.0
21,22.0,0.796407,0.07577302,0.0,0.0,0.0,0.939632,0.129629,1.0
17,18.0,0.0,0.0,0.0009214893,0.4757037,0.004504,0.0,0.0,0.0
28,29.0,0.0,0.0,0.0,0.0,0.0,-0.000192,0.823926,0.0
0,1.0,0.982036,1.0,0.9998814,0.8546421,0.854642,1.0,0.205761,1.0
8,9.0,0.838323,0.5485899,99.94,99.98,79.46,0.920187,0.0,1.0
19,20.0,-813247.3174,-2.27e+20,-8017784000.0,-11554890000.0,-458635.134613,-21731170.0,-342.63865,0.0


In [25]:
# data.info()

# Decision Tree Algorithm

In [26]:
target = data["useful"]
target_names = ["yes", "no"]

In [27]:
variable = data.drop("useful", axis=1)
feature_names = variable.columns
variable.head()

Unnamed: 0,algo,titanic,space,flights,housing,quakes,cancer,stocks
0,1.0,0.982036,1.0,0.999881,0.854642,0.854642,1.0,0.205761
1,2.0,0.744395,0.605505,0.999156,0.756146,0.754153,0.937062,0.086419
2,3.0,0.982036,1.0,0.999853,1.0,0.849285,1.0,0.205761
3,4.0,0.802691,0.714577,0.999818,1.0,0.845183,0.972027,0.092592
4,5.0,0.898204,1.0,0.999733,1.0,0.793485,0.969483,0.123456


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(variable, target, random_state=42)

In [29]:
# y_test is what the outcome is, and y_predict is what the decision tree predicts it. 
#y_test is the survive column entries, and X_test the rest of the columns or the 
# variables we think affect the survivie. 
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test) 

0.75

In [30]:
y_predict = clf.predict(X_test)

# Random forest Algorithm

In [31]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(variable, target)
rf.score(variable, target)
# Have not done the cross validation

1.0

In [32]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.04761467, 0.30004572, 0.18615533, 0.10080796, 0.03713053,
       0.03640811, 0.23566677, 0.0561709 ])

In [33]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.3000457226498973, 'titanic'),
 (0.23566676899932143, 'cancer'),
 (0.18615533164377612, 'space'),
 (0.10080796401402939, 'flights'),
 (0.05617090185453117, 'stocks'),
 (0.0476146733380912, 'algo'),
 (0.03713053030586393, 'housing'),
 (0.03640810719448957, 'quakes')]

# Support Vector Machine
With Radial-basis-function kernel

In [34]:
from sklearn.svm import SVC 
# using X_train, X_test, y_train, y_test
model = SVC(kernel='rbf')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# compare y_predict to y_test



In [35]:
# if good prediction, what is y_predict - y_test = 0 

In [36]:
accuracy = model.score(X_test, y_test)
print(accuracy)

0.5


In [37]:
# How do I assess accuracy
model.score(X_train, y_train)

1.0

# Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6190476190476191
Testing Data Score: 0.625


In [46]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
27,0.0,0.0
16,1.0,0.0
12,1.0,0.0
22,1.0,0.0
8,1.0,1.0
9,1.0,1.0
21,1.0,1.0
0,1.0,1.0


# K Nearest Neighbor

In [45]:
# KNN 
knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(X_train, y_train)  
Y_pred = knn.predict(X_test)  
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
print(acc_knn)

85.71
