In [1]:
# Data cleaning, exploration, and analysis tools
import pandas as pd
import seaborn as sns
import numpy as np
from ast import literal_eval
import re as re
from matplotlib import pyplot
import scipy.stats as stats
%matplotlib inline

# Hide all warnings
import warnings
warnings.filterwarnings('ignore')


# Import the Machine Learning Libraries

# Data cleaning for machine learning models
from sklearn.model_selection import train_test_split #split data into testing and training data
from sklearn.feature_selection import SelectKBest # identify best X that may predict Y
from sklearn.model_selection import KFold #Cross Validation
from sklearn.feature_selection import mutual_info_regression #needed for SelectKBest
from sklearn.preprocessing import StandardScaler #handle outliers after selecting K best guess variables that predict Y

# Machine Learning model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
#reg = LinearRegression(fit_intercept=True)
#fit_intercept = True; hyper parameter for linear regression, add one-extra term - a start value (a starting weight); rarely False



# Error Measures
from sklearn.dummy import DummyRegressor
# Use DummyRegressor to compare your linear regression to the dumbest possible

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv('../data/titanic/train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
df.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C
372,373,0,3,"Beavan, Mr. William Thomas",male,19.0,0,0,323951,8.05,,S


In [5]:
#qualitative NaN; dummy variables
ddf = pd.get_dummies(df, prefix='Cabin', dummy_na=True, columns=['Cabin','Embarked','Pclass'])

In [6]:
ddf.head().T
#High cardinality categorical; use Embedding (mostly used in Neural Networks)
# We want to get the first letter of each cabin

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin_A10,0,0,0,0,0


In [7]:
df.Cabin.str.slice(0,1).unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [8]:
df['Cabin']= df.Cabin.str.slice(0,1)
ddf = pd.get_dummies(df, prefix='Cabin', dummy_na=True, columns=['Cabin','Embarked','Pclass'])

In [9]:
ddf.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin_A,0,0,0,0,0


In [10]:
# Deal with the Age NaN; gives all NaNs a value of 0; great for neural network, random forest, linear regression

ddf['AgeNull'] = ddf.Age.isna()
ddf.loc[ddf['AgeNull'], 'Age'] = 0

In [11]:
ddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 27 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin_A        891 non-null uint8
Cabin_B        891 non-null uint8
Cabin_C        891 non-null uint8
Cabin_D        891 non-null uint8
Cabin_E        891 non-null uint8
Cabin_F        891 non-null uint8
Cabin_G        891 non-null uint8
Cabin_T        891 non-null uint8
Cabin_nan      891 non-null uint8
Cabin_C        891 non-null uint8
Cabin_Q        891 non-null uint8
Cabin_S        891 non-null uint8
Cabin_nan      891 non-null uint8
Cabin_1.0      891 non-null uint8
Cabin_2.0      891 non-null uint8
Cabin_3.0      891 non-null uint8
Cabin_nan      891 non-null uint8
AgeN

In [12]:
ddf.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin_A,0,0,0,0,0


In [13]:
ddf.drop(columns='PassengerId Name Ticket'.split(), inplace=True)

In [14]:
ddf.head().T

Unnamed: 0,0,1,2,3,4
Survived,0,1,1,1,0
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin_A,0,0,0,0,0
Cabin_B,0,0,0,0,0
Cabin_C,0,1,0,1,0
Cabin_D,0,0,0,0,0


In [15]:
# Now we split!

from sklearn.model_selection import train_test_split

In [16]:
ddf.Age.fillna(0, inplace=True)
ddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
Survived     891 non-null int64
Sex          891 non-null object
Age          891 non-null float64
SibSp        891 non-null int64
Parch        891 non-null int64
Fare         891 non-null float64
Cabin_A      891 non-null uint8
Cabin_B      891 non-null uint8
Cabin_C      891 non-null uint8
Cabin_D      891 non-null uint8
Cabin_E      891 non-null uint8
Cabin_F      891 non-null uint8
Cabin_G      891 non-null uint8
Cabin_T      891 non-null uint8
Cabin_nan    891 non-null uint8
Cabin_C      891 non-null uint8
Cabin_Q      891 non-null uint8
Cabin_S      891 non-null uint8
Cabin_nan    891 non-null uint8
Cabin_1.0    891 non-null uint8
Cabin_2.0    891 non-null uint8
Cabin_3.0    891 non-null uint8
Cabin_nan    891 non-null uint8
AgeNull      891 non-null bool
dtypes: bool(1), float64(2), int64(3), object(1), uint8(17)
memory usage: 57.5+ KB


In [17]:
train, test = train_test_split(ddf, test_size = .3)

In [18]:
train.count(), test.count()
#This tells me how many are in each set

(Survived     623
 Sex          623
 Age          623
 SibSp        623
 Parch        623
 Fare         623
 Cabin_A      623
 Cabin_B      623
 Cabin_C      623
 Cabin_D      623
 Cabin_E      623
 Cabin_F      623
 Cabin_G      623
 Cabin_T      623
 Cabin_nan    623
 Cabin_C      623
 Cabin_Q      623
 Cabin_S      623
 Cabin_nan    623
 Cabin_1.0    623
 Cabin_2.0    623
 Cabin_3.0    623
 Cabin_nan    623
 AgeNull      623
 dtype: int64, Survived     268
 Sex          268
 Age          268
 SibSp        268
 Parch        268
 Fare         268
 Cabin_A      268
 Cabin_B      268
 Cabin_C      268
 Cabin_D      268
 Cabin_E      268
 Cabin_F      268
 Cabin_G      268
 Cabin_T      268
 Cabin_nan    268
 Cabin_C      268
 Cabin_Q      268
 Cabin_S      268
 Cabin_nan    268
 Cabin_1.0    268
 Cabin_2.0    268
 Cabin_3.0    268
 Cabin_nan    268
 AgeNull      268
 dtype: int64)

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [20]:
skb = SelectKBest(mutual_info_classif, k=10)
#Usually you want to do hyperparameter testing on what k could be; right now we're just using a random number

In [21]:
train['Sex'] = train.Sex =='male'

In [22]:
train_X = skb.fit_transform(train.drop(columns=['Survived']), train.Survived)

In [23]:
train_X.shape

(623, 10)

In [24]:
train.drop(columns=['Survived']).columns[skb.get_support()]

Index(['Sex', 'SibSp', 'Fare', 'Cabin_C', 'Cabin_D', 'Cabin_nan', 'Cabin_S',
       'Cabin_1.0', 'Cabin_2.0', 'Cabin_3.0'],
      dtype='object')

In [25]:
#train_X.info()

In [26]:
ss = StandardScaler()

In [27]:
quant_features = ss.fit_transform(train_X[:,[1,2,3]])

In [28]:
qual_features = train_X[:, [0,4,5,6,7,8,9]]

In [29]:
import numpy as np

In [30]:
np.concatenate([quant_features,qual_features], axis=1)

array([[-0.46692527, -0.50219086, -0.28565228, ...,  0.        ,
         0.        ,  1.        ],
       [-0.46692527, -0.03596139, -0.28565228, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.3689783 , -0.35767855, -0.28565228, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.46692527, -0.43680376, -0.28565228, ...,  0.        ,
         0.        ,  1.        ],
       [-0.46692527, -0.49126553, -0.28565228, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.3689783 , -0.35858833, -0.28565228, ...,  0.        ,
         0.        ,  1.        ]])

In [31]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
knn = KNeighborsClassifier(5)
# 5 is arbitrary in this example

In [34]:
knn.fit(train.features, train.Survived)

AttributeError: 'DataFrame' object has no attribute 'features'

In [None]:
knn.score(train_features, train.Survived)

In [None]:
# Now need to transform test features the same way we transformed train features
test['Sex']=test.Sex=='male'
test_X = skb.transform(test.drop(columns=['Survived']))

In [None]:
quant_features = ss.transform(test_X[:,[1,2,3]])

In [None]:
qual_features = test_X[:, [0,4,5,6,7,8,9]]

In [None]:
test_features = np.concatenate([quant_features, qual_features], axis=1)

In [None]:
test_features.shape

In [None]:
knn.score(test_features, test.Survived)

In [None]:
knn.predict(test_features)

In [None]:
knn.predict(test_features) == test.Survived
#returns Trues and Falses

In [None]:
import seaborn as sns

In [None]:
sns.pointplot.(knn.predict(test_features) == test.Survived)