In [1]:
'''
LIN 373 UTexas:: Jessy Li
Original material from Byron Wallace
'''
import numpy as np
import pandas as pd

import sklearn
import sklearn.metrics  # imported explicitly: later cells call sklearn.metrics.accuracy_score
from sklearn import tree # decision trees!
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
'''
Credit goes to Byron Wallace and https://github.com/savarin/pyconuk-introtutorial
'''

# Load the pre-processed Titanic passenger table; the path is relative to the
# notebook's working directory.
titanic_data = pd.read_csv("titanic.processed.csv")
# Preview the first rows (head() with no argument returns the first five).
titanic_data.head()

Unnamed: 0.1,Unnamed: 0,Class,Sex,Age,Survived
0,1,1st,Male,Child,No
1,2,2nd,Male,Child,No
2,3,3rd,Male,Child,No
3,3,3rd,Male,Child,No
4,3,3rd,Male,Child,No


In [3]:
# Take a longer look at the data: request the first 15 rows.
titanic_data.head(15)

Unnamed: 0.1,Unnamed: 0,Class,Sex,Age,Survived
0,1,1st,Male,Child,No
1,2,2nd,Male,Child,No
2,3,3rd,Male,Child,No
3,3,3rd,Male,Child,No
4,3,3rd,Male,Child,No
5,3,3rd,Male,Child,No
6,3,3rd,Male,Child,No
7,3,3rd,Male,Child,No
8,3,3rd,Male,Child,No
9,3,3rd,Male,Child,No


In [4]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2209 entries, 0 to 2208
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2209 non-null   int64 
 1   Class       2209 non-null   object
 2   Sex         2209 non-null   object
 3   Age         2209 non-null   object
 4   Survived    2209 non-null   object
dtypes: int64(1), object(4)
memory usage: 86.4+ KB


In [5]:
# List the distinct values that occur in the Class column.
titanic_data['Class'].unique()

array(['1st', '2nd', '3rd', 'Crew'], dtype=object)

In [6]:
# List the distinct values that occur in the Sex column.
titanic_data['Sex'].unique()

array(['Male', 'Female'], dtype=object)

In [7]:
# List the distinct values that occur in the Age column.
titanic_data['Age'].unique()

array(['Child', 'Adult'], dtype=object)

In [8]:
# List the distinct values of the Survived column (the label we want to predict).
titanic_data['Survived'].unique()

array(['No', 'Yes'], dtype=object)

In [None]:
# Check each column's dtype: the categorical columns hold strings, so they
# have to be converted into numeric features before a model can use them.
titanic_data.dtypes

In [5]:
# One-hot encode Class: get_dummies builds one indicator column per distinct value.
class_indicators = pd.get_dummies(titanic_data["Class"])
# Display the resulting indicator table.
class_indicators

Unnamed: 0,1st,2nd,3rd,Crew
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
2204,0,0,0,1
2205,0,0,0,1
2206,0,0,0,1
2207,0,0,0,1


In [7]:
# One-hot encode Sex: one indicator column per distinct value ("Female", "Male").
# The table gets its own name so that `class_indicators` keeps referring to the
# Class indicators instead of being silently overwritten with Sex data.
sex_dummies = pd.get_dummies(titanic_data["Sex"])
sex_dummies

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
2204,1,0
2205,1,0
2206,1,0
2207,1,0


In [13]:
# need to encode male / female (and the other categories) as numbers...
# dtype=int pins the indicators to 0/1 integers on every pandas version;
# from pandas 2.0 on, get_dummies would otherwise return True/False columns.
sex_indicators = pd.get_dummies(titanic_data["Sex"], dtype=int)["Male"]    # 1 = Male, 0 = Female
age_indicators = pd.get_dummies(titanic_data["Age"], dtype=int)["Adult"]   # 1 = Adult, 0 = Child
class_indicators = pd.get_dummies(titanic_data["Class"], dtype=int)        # one column per class

# and we will (almost?) always assume predictors ('features') are denoted by X.
# The "Crew" indicator is not included: a crew member is the row where all
# three Class_* columns are 0, so no information is lost.
X_df = pd.DataFrame({"Sex":sex_indicators,
                    "Age":age_indicators,
                    "Class_1st":class_indicators["1st"],
                    "Class_2nd":class_indicators["2nd"],
                    "Class_3rd":class_indicators["3rd"]})
X_df.head()

Unnamed: 0,Sex,Age,Class_1st,Class_2nd,Class_3rd
0,1,0,1,0,0
1,1,0,0,1,0
2,1,0,0,0,1
3,1,0,0,0,1
4,1,0,0,0,1


In [16]:
# y is the convention for labels
# Translate the "Yes"/"No" survival strings into 1/0 integers; an unexpected
# label raises KeyError rather than being silently mis-encoded.
label_to_int = {"Yes":1, "No":0}
y = np.array([label_to_int[label] for label in titanic_data["Survived"].tolist()])
y

array([0, 0, 0, ..., 1, 1, 1])

In [17]:
# Build a decision tree classifier!
# Hold out 20% of the rows for evaluation. Fixing random_state makes the split,
# and therefore the reported accuracy, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=0
)

# random_state also fixes the tree's tie-breaking between equally good splits.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
print(sklearn.metrics.accuracy_score(y_test, preds))

0.8190045248868778


In [None]:
# Export the fitted decision tree to dt.dot via export_graphviz, labelling the
# split nodes with the feature column names.
with open("dt.dot", 'w') as dot_file:
    feature_labels = X_df.columns.values.tolist()
    tree.export_graphviz(clf, out_file=dot_file, feature_names=feature_labels)

## Random Forest

A demo using the Titanic dataset. Note, however, that this dataset has only a handful of features, so the power of a random forest cannot be fully exploited here. Try it on a dataset with many features and you will see the difference!

In [22]:
# Build a random forest classifier!
# (RandomForestClassifier is imported with the other imports at the top.)
# An ensemble of 10 trees, each limited to depth 10; random_state makes the
# forest reproducible. It is trained and evaluated on the same split as the
# single decision tree so the two accuracies are comparable.
clf_rf = RandomForestClassifier(n_estimators = 10, max_depth = 10, random_state = 0)
clf_rf.fit(X_train,y_train)
preds = clf_rf.predict(X_test)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
print(sklearn.metrics.accuracy_score(y_test, preds))

0.7918552036199095
