In [27]:
# import all the libraries

# Data set
from sklearn import datasets

# Data exploration and analysis tools
import pandas as pd
import seaborn as sns
import numpy as np

# Data cleaning for machine learning models
from sklearn.model_selection import train_test_split #split data into testing and training data
from sklearn.feature_selection import SelectKBest # identify best X that may predict Y
from sklearn.feature_selection import mutual_info_regression #needed for SelectKBest
from sklearn.preprocessing import StandardScaler #handle outliers after selecting K best guess variables that predict Y

# Machine Learning model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

# Extra Questions
# 1. Better Error Measures
# 2. Pre-Recall Curve
# 3. Interpretability of Coefficients
# test_size = power calculation for how close you want to get to precision

## Class Notes

Precision: how many selected items are relevant?

Recall: how many relevant items are selected?

Increase in threshold increases precision; but may reduce the recall

Good eggs versus bad eggs

Precision = 40/59
ReCall = 40/57

_proba predicts probability of an occurrance

predict_ will give the feature


In [2]:
df = pd.read_csv('../data/titanic/train.csv')

In [3]:
df.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
705,706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S
205,206,0,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,347054,10.4625,G6,S


In [4]:
df.info()
# Null values for: Age, Cabin, Embarked
# Classifier for: Name, Sex, Ticket, Cabin, Embarked
# Don't care about: Name, PassengerId, Ticket

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Create dummy variables for the strings to expose nulls and see what may need cleaning
dummy_df = pd.get_dummies?

In [None]:
dummy_df = pd.get_dummies

In [6]:
dummy_df = pd.get_dummies(df, dummy_na = True, columns = ['Sex','Cabin','Pclass'])

In [7]:
dummy_df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Embarked,S,C,S,S,S
Sex_female,0,1,1,1,0


In [8]:
# Need to clean up the Cabins; I'm not sure what this line of code did
df.Cabin.str.slice(0,1).unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [9]:
# Null values for: Age, >Cabin, >Embarked, >PClass

df['Cabin']= df.Cabin.str.slice(0,1)
dummy_df = pd.get_dummies(df, prefix='Cabin', dummy_na=True, columns=['Cabin','Embarked','Pclass'])

In [10]:
dummy_df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin_A,0,0,0,0,0


In [11]:
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 26 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin_A        891 non-null uint8
Cabin_B        891 non-null uint8
Cabin_C        891 non-null uint8
Cabin_D        891 non-null uint8
Cabin_E        891 non-null uint8
Cabin_F        891 non-null uint8
Cabin_G        891 non-null uint8
Cabin_T        891 non-null uint8
Cabin_nan      891 non-null uint8
Cabin_C        891 non-null uint8
Cabin_Q        891 non-null uint8
Cabin_S        891 non-null uint8
Cabin_nan      891 non-null uint8
Cabin_1.0      891 non-null uint8
Cabin_2.0      891 non-null uint8
Cabin_3.0      891 non-null uint8
Cabin_nan      891 non-null uint8
dtyp

In [12]:
# Null values for: >Age, Cabin, Embarked
dummy_df['AgeNull'] = dummy_df.Age.isna()
dummy_df.loc[dummy_df['AgeNull'], 'Age'] = 0

In [13]:
# Drop columns I don't care about
dummy_df.drop(columns='PassengerId Name Ticket'.split(), inplace=True)

In [16]:
# Confirm the data is pretty clean
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
Survived     891 non-null int64
Sex          891 non-null object
Age          891 non-null float64
SibSp        891 non-null int64
Parch        891 non-null int64
Fare         891 non-null float64
Cabin_A      891 non-null uint8
Cabin_B      891 non-null uint8
Cabin_C      891 non-null uint8
Cabin_D      891 non-null uint8
Cabin_E      891 non-null uint8
Cabin_F      891 non-null uint8
Cabin_G      891 non-null uint8
Cabin_T      891 non-null uint8
Cabin_nan    891 non-null uint8
Cabin_C      891 non-null uint8
Cabin_Q      891 non-null uint8
Cabin_S      891 non-null uint8
Cabin_nan    891 non-null uint8
Cabin_1.0    891 non-null uint8
Cabin_2.0    891 non-null uint8
Cabin_3.0    891 non-null uint8
Cabin_nan    891 non-null uint8
AgeNull      891 non-null bool
dtypes: bool(1), float64(2), int64(3), object(1), uint8(17)
memory usage: 57.5+ KB


In [17]:
# Split data into your inputs and outputs

dummy_df_t = dummy_df.copy?



In [18]:
dummy_df_t = dummy_df.copy(deep=True)

In [24]:
X = dummy_df_t.drop(columns='Survived')
y = dummy_df_t['Survived']

In [26]:
# Split data into train, test, and validation sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)