# Training a machine learning model with scikit-learn

In [216]:
# Import  modules
## Data Imports
import numpy as np
import pandas as pd 
## Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [217]:
# Import functions
## Display
from IPython.display import display
## Classification and Regression tools
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import metrics
## Random
from numpy.random import choice
from numpy.random import seed
seed_number = 2015

## Get the data

* What are scikit-learn's four key requirements for working with data?
* What are the four steps for model training and prediction in scikit-learn?

In [218]:
data_path = 'C:/Repositories/Titanic/data/'
titanic_train = pd.read_csv(data_path + 'train_DPP.csv', index_col='PassengerId')
titanic_test  = pd.read_csv(data_path + 'test_DPP.csv',  index_col='PassengerId')
titanic_train.head(6)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FirstName,LastName,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0,A/5 21171,Mr.,Owen,Braun,
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1,PC 17599,Mrs.,Florence,Cuming,C
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1,STON/O2. 3101282,Miss.,Laina,Heikkine,
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1,113803,Mrs.,Lily,Futrell,C
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450,Mr.,William,Alle,
6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0,330877,Mr.,James,Mora,


## Agenda

* What are scikit-learn's four key requirements for working with data?
* What are the four steps for model training and prediction in scikit-learn?

## Requirements for working with data in scikit-learn

1. Features and response are separate objects
2. Features and response should be numeric
3. Features and response should be NumPy arrays
4. Features and response should have specific shapes

### Step 1: Separate the dataset to $X$ and $y$

In [219]:
X = titanic_train.drop("Survived",1)
y = titanic_train["Survived"]
X_new = titanic_test

### Step 2: Convert $X$ and $y$ to be numeric

In [220]:
## Check the types of the features and response
display(type(X))
display(type(y))
display(type(X_new))

pandas.core.frame.DataFrame

pandas.core.series.Series

pandas.core.frame.DataFrame

In [221]:
## Check columns type
display(X.dtypes)
display(X_new.dtypes)

Age          float64
Cabin         object
Embarked      object
Fare         float64
Name          object
Parch          int64
Pclass         int64
Sex           object
SibSp          int64
Ticket        object
Title         object
FirstName     object
LastName      object
Deck          object
dtype: object

Age          float64
Cabin         object
Embarked      object
Fare         float64
Name          object
Parch          int64
Pclass         int64
Sex           object
SibSp          int64
Ticket        object
Title         object
FirstName     object
LastName      object
Deck          object
dtype: object

In [222]:
## Converting the Sex column to numeric value
X["Sex"]     = LabelEncoder().fit_transform(X["Sex"])
X_new["Sex"] = LabelEncoder().fit_transform(X_new["Sex"])

In [223]:
## Converting the Title column to numeric value
X["Title"]     = LabelEncoder().fit_transform(X["Title"])
X_new["Title"] = LabelEncoder().fit_transform(X_new["Title"])

In [224]:
## Converting the Deck column
### Assign Null values the T Deck
X["Deck"]     = X["Deck"].fillna("T")
X_new["Deck"] = X_new["Deck"].fillna("T")
Deck_Number = 0
### Change deck letters to floors numbers
for Deck in ['T','A','B','C','D','E','F','G']:
    X.loc[X["Deck"] == Deck, "Deck"] = Deck_Number
    X_new.loc[X_new["Deck"] == Deck, "Deck"] = Deck_Number
    Deck_Number -=1
### Coerce variable to be numeric
X["Deck"]     = X["Deck"].astype(np.int8)
X_new["Deck"] = X_new["Deck"].astype(np.int8)
### Change 0 floor to null
X.loc[X["Deck"] == 0, "Deck"]         = np.nan
X_new.loc[X_new["Deck"] == 0, "Deck"] = np.nan   

In [225]:
## Converting the Embarked column
### Assign Null values the majority class within the train set
majority_class    = X["Embarked"].value_counts().idxmax()
X["Embarked"]     = X["Embarked"].fillna(majority_class)
X_new["Embarked"] = X_new["Embarked"].fillna(majority_class)
### Convert categories into numeric
X["Embarked"]     = LabelEncoder().fit_transform(X["Embarked"])
X_new["Embarked"] = LabelEncoder().fit_transform(X_new["Embarked"])

In [226]:
## Fill the missing values in "Age"
X["Age"] = X["Age"].fillna(X["Age"].median())
X_new["Age"] = X_new["Age"].fillna(X["Age"].median())

In [227]:
## Drop unused variables
var2drop = ["Cabin","Name","Ticket","FirstName","LastName","Deck"]
X        = X.drop(var2drop,1)
X_new    = X_new.drop(var2drop,1)

X.head(6)

Unnamed: 0_level_0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,22,2,7.25,0,3,1,1,11
2,38,0,71.2833,0,1,0,1,12
3,26,2,7.925,0,3,0,0,8
4,35,2,53.1,0,1,0,1,12
5,35,2,8.05,0,3,1,0,11
6,28,1,8.4583,0,3,1,0,11


In [228]:
X.dtypes

Age         float64
Embarked      int64
Fare        float64
Parch         int64
Pclass        int64
Sex           int64
SibSp         int64
Title         int64
dtype: object

### Step 3: Convert $X$ and $y$ to be NumPy arrays

In [229]:
# Convert the frame to its Numpy-array representation.
X     = X.as_matrix(columns=None)
X_new = X_new.as_matrix(columns=None)
# Flatten array converstion
# pandas.core.series.Series --> numpy.ndarray
y = np.ravel(y)

display(type(X))
display(type(y))
display(type(X_new))

numpy.ndarray

numpy.ndarray

numpy.ndarray

### Step 4: Validate that $X$ and $y$ have specific shapes

In [230]:
display(X.shape)
display(y.shape)
display(X_new.shape)

(891, 8)

(891,)

(418, 8)

## scikit-learn 4-step modeling pattern

### Step 1: Import the class you plan to use

In [231]:
## KNN
from sklearn.neighbors import KNeighborsClassifier

### Step 2: "Instantiate" the "estimator"

* "Estimator" is scikit-learn's term for model
* "Instantiate" means "make an instance of"

In [232]:
## KNN
mdl_knn = KNeighborsClassifier(n_neighbors=5)

* Name of the object does not matter
* Can specify tuning parameters (aka "hyperparameters") during this step
* All parameters not specified are set to their defaults

In [233]:
print(mdl_knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')


### Step 3: Fit the model with data (aka "model training")

* Model is learning the relationship between X and y
* Occurs in-place

In [234]:
mdl_knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

### Step 4: Predict the response for a new observation

* New observations are called "out-of-sample" data
* Uses the information it learned during the model training process

In [235]:
y_hat = mdl_knn.predict(X_new)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Further reading

* [DataFrameImputer](http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn)
* [03 getting started with iris](https://github.com/justmarkham/scikit-learn-videos/blob/master/03_getting_started_with_iris.ipynb)
* [04 model training](https://github.com/justmarkham/scikit-learn-videos/blob/master/04_model_training.ipynb)