# Training a machine learning model with scikit-learn

In [230]:
# Import  modules
## Data Imports
import numpy as np
import pandas as pd 
## Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [231]:
# Import functions
## Display
from IPython.display import display
## Classification and Regression tools
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import metrics
## Random
from numpy.random import choice
from numpy.random import seed
seed_number = 2015

## Agenda

* What are scikit-learn's four key requirements for working with data?
* What are the four steps for model training and prediction in scikit-learn?

## Get the data

In [232]:
data_path = 'C:/Repositories/Titanic/data/'
titanic_train = pd.read_csv(data_path + 'train_DPP.csv', index_col='PassengerId')
titanic_test  = pd.read_csv(data_path + 'test_DPP.csv',  index_col='PassengerId')
titanic_train.head(6)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FirstName,LastName,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0,A/5 21171,Mr.,Owen,Braun,
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1,PC 17599,Mrs.,Florence,Cuming,C
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1,STON/O2. 3101282,Miss.,Laina,Heikkine,
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1,113803,Mrs.,Lily,Futrell,C
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450,Mr.,William,Alle,
6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0,330877,Mr.,James,Mora,


## Requirements for working with data in scikit-learn

1. Features and response are separate objects
2. Features and response should be numeric
3. Features and response should be NumPy arrays
4. Features and response should have specific shapes

### Step 1: Separate the dataset to $X$ and $y$

In [233]:
X = titanic_train.drop("Survived",1)
y = titanic_train["Survived"]
X_new = titanic_test

### Step 2: Convert $X$ and $y$ to be numeric

In [234]:
## Check the types of the features and response
display(type(X))
display(type(y))

pandas.core.frame.DataFrame

pandas.core.series.Series

In [235]:
## Check columns type
X.dtypes

Age          float64
Cabin         object
Embarked      object
Fare         float64
Name          object
Parch          int64
Pclass         int64
Sex           object
SibSp          int64
Ticket        object
Title         object
FirstName     object
LastName      object
Deck          object
dtype: object

In [236]:
## Converting the Sex column to numeric value
X["Sex"]     = LabelEncoder().fit_transform(X["Sex"])
X_new["Sex"] = LabelEncoder().fit_transform(X_new["Sex"])

In [237]:
## Converting the Title column to numeric value
X["Title"]     = LabelEncoder().fit_transform(X["Title"])
X_new["Title"] = LabelEncoder().fit_transform(X_new["Title"])

In [238]:
## Converting the Deck column
### Assign Null values the T Deck
X["Deck"]     = X["Deck"].fillna("T")
X_new["Deck"] = X_new["Deck"].fillna("T")
Deck_Number = 0
### Change deck letters to floors numbers
for Deck in ['T','A','B','C','D','E','F','G']:
    X.loc[X["Deck"] == Deck, "Deck"] = Deck_Number
    X_new.loc[X_new["Deck"] == Deck, "Deck"] = Deck_Number
    Deck_Number -=1
### Coerce variable to be numeric
X["Deck"]     = X["Deck"].astype(np.int8)
X_new["Deck"] = X_new["Deck"].astype(np.int8)
### Change 0 floor to null
X.loc[X["Deck"] == 0, "Deck"]         = np.nan
X_new.loc[X_new["Deck"] == 0, "Deck"] = np.nan   

In [239]:
## Converting the Embarked column
### Assign Null values the majority class within the train set
majority_class = X["Embarked"].value_counts().idxmax()
X["Embarked"]     = X["Embarked"].fillna(majority_class)
X_new["Embarked"] = X_new["Embarked"].fillna(majority_class)
### Convert categories into numeric
X["Embarked"]     = LabelEncoder().fit_transform(X["Embarked"])
X_new["Embarked"] = LabelEncoder().fit_transform(X_new["Embarked"])

In [240]:
## Fill the missing values in "Age"
median_age = titanic_train["Age"].fillna(titanic_train["Age"].median())
titanic_train["Age"] = median_age
titanic_test["Age"]  = median_age

In [241]:
## Drop unused variables
X.drop(["Cabin","Name","Ticket","FirstName","LastName"],1).head(6)

Unnamed: 0_level_0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Title,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,22.0,2,7.25,0,3,1,1,11,
2,38.0,0,71.2833,0,1,0,1,12,-3.0
3,26.0,2,7.925,0,3,0,0,8,
4,35.0,2,53.1,0,1,0,1,12,-3.0
5,35.0,2,8.05,0,3,1,0,11,
6,,1,8.4583,0,3,1,0,11,


In [242]:
X.dtypes

Age          float64
Cabin         object
Embarked       int64
Fare         float64
Name          object
Parch          int64
Pclass         int64
Sex            int64
SibSp          int64
Ticket        object
Title          int64
FirstName     object
LastName      object
Deck         float64
dtype: object

In [243]:
## scikit-learn 4-step modeling pattern

In [244]:
## Step 1: Import the class you plan to use

## Further reading

* [DataFrameImputer](http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn)
* [03 getting started with iris](https://github.com/justmarkham/scikit-learn-videos/blob/master/03_getting_started_with_iris.ipynb)
* [04 model training](https://github.com/justmarkham/scikit-learn-videos/blob/master/04_model_training.ipynb)