# Modeling: Fit Logistic Regression

In [53]:
from IPython.display import IFrame

# Embed the referenced Kaggle kernel page inline: full cell width, 250 px tall.
IFrame(
    src="https://www.kaggle.com/kelvin0815/titanic/first-trial-following-dataquest/code",
    width='100%',
    height=250,
)

In [54]:
# Import  modules
## Data Imports
import numpy as np
import pandas as pd 
## Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [55]:
# Import functions
## Display
from IPython.display import display
## Classification and Regression tools
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# Try the current location first and fall back for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn import metrics
## Random
from numpy.random import choice
from numpy.random import seed
# Single seed value reused wherever a `random_state` is required,
# so that the train/validation split is reproducible.
seed_number = 2015

## Get the data

In [56]:
# Load the pre-processed Titanic CSV files.
# `data_path` is the folder that holds both files (trailing slash required,
# because file names are appended by plain string concatenation).
data_path = 'C:/Repositories/Titanic/data/'

# Explicit dtype for every column so pandas does not have to infer them.
# NOTE(review): float16 keeps only about 3 significant decimal digits
# (a Fare value is displayed as 23.453125 in the output) -- confirm
# this precision is acceptable for Age and Fare.
types = dict(
    Age=np.float16,
    Cabin=object,
    Embarked=object,
    Fare=np.float16,
    Name=object,
    Parch=np.uint8,
    Pclass=np.uint8,
    Sex=object,  # pd.category,
    SibSp=np.uint8,
    Survived=np.uint8,
    Ticket=object,
    Title=object,
    FirstName=object,
    LastName=object,
    Deck=object,
)

# Both files share the same index column and dtype mapping; train is read first.
titanic_train, titanic_test = (
    pd.read_csv(data_path + file_name, index_col='PassengerId', dtype=types)
    for file_name in ('train_DPP.csv', 'test_DPP.csv')
)

# Show the last rows of the training frame as a quick sanity check.
display(titanic_train.tail())
#display(titanic_test.head())

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FirstName,LastName,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
887,27.0,,S,13.0,"Montvila, Rev. Juozas",0,2,male,0,0,211536,Rev.,,Montvil,
888,19.0,B42,S,30.0,"Graham, Miss. Margaret Edith",0,1,female,0,1,112053,Miss.,Margaret,Graha,B
889,,,S,23.453125,"Johnston, Miss. Catherine Helen ""Carrie""",2,3,female,1,0,W./C. 6607,Miss.,Catherine,Johnsto,
890,26.0,C148,C,30.0,"Behr, Mr. Karl Howell",0,1,male,0,1,111369,Mr.,Karl,Beh,C
891,32.0,,Q,7.75,"Dooley, Mr. Patrick",0,3,male,0,0,370376,Mr.,Patrick,Doole,


In [57]:
# Check the column dtypes: confirms that read_csv applied the mapping
# passed in `types` (float16 / uint8 for numeric columns, object for text).
titanic_train.dtypes

Age          float16
Cabin         object
Embarked      object
Fare         float16
Name          object
Parch          uint8
Pclass         uint8
Sex           object
SibSp          uint8
Survived       uint8
Ticket        object
Title         object
FirstName     object
LastName      object
Deck          object
dtype: object

## Data Preparation

### Data imputation

In [58]:
## Impute missing values in both frames.
## "Age":      filled with the median of the *training* Age column, so the
##             test frame is imputed with a statistic taken from titanic_train.
## "Embarked": filled with the constant "S"
##             (presumably the most frequent port -- TODO confirm).
for frame in (titanic_train, titanic_test):
    frame["Age"] = frame["Age"].fillna(value=titanic_train["Age"].median())
    frame["Embarked"] = frame["Embarked"].fillna(value="S")

### Data conversion

In [59]:
## Replace the text labels of "Sex" and "Embarked" with integer codes,
## applying the same mapping to the train and the test frame.
##   Sex:      male -> 0, female -> 1
##   Embarked: S -> 0, C -> 1, Q -> 2
## Values are written in place through .loc with a boolean mask, one label
## at a time; labels not listed in the mapping are left untouched.
for frame in (titanic_train, titanic_test):
    for column, codes in (
        ("Sex", (("male", 0), ("female", 1))),
        ("Embarked", (("S", 0), ("C", 1), ("Q", 2))),
    ):
        for label, code in codes:
            frame.loc[frame[column] == label, column] = code

In [60]:
# Re-inspect the dtypes after the conversion step. Sex and Embarked are still
# reported as `object` even though their values are now integer codes.
titanic_train.dtypes

Age          float16
Cabin         object
Embarked      object
Fare         float16
Name          object
Parch          uint8
Pclass         uint8
Sex           object
SibSp          uint8
Survived       uint8
Ticket        object
Title         object
FirstName     object
LastName      object
Deck          object
dtype: object

### Split the data

* 70% train set
* 30% test set

In [61]:
# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible across runs.
# `axis` is passed by keyword: the positional form `drop("Survived", 1)` is
# deprecated in pandas 1.3 and rejected by pandas 2.0.
X_train, X_test, y_train, y_test = train_test_split(
    titanic_train.drop("Survived", axis=1),  # X: every column except the target
    titanic_train.loc[:, "Survived"],        # y: the target column
    test_size=0.3,
    random_state=seed_number)
# Flatten array conversion
# pandas.core.series.Series --> numpy.ndarray
y_train = np.ravel(y_train)
y_train

array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0,

In [62]:
#X_train[["Sex"]].dtypes
#type(y_train)

## Fit Logistic Regression Model

### Gender Model

In [63]:
# Baseline model: logistic regression with default settings, using the
# encoded "Sex" column as the only feature. The double brackets keep the
# selection two-dimensional, as a single-column feature matrix.
# fit() returns the estimator itself, so the fitted model is both stored
# and shown as the cell output.
mdl_log_gender = LogisticRegression().fit(X_train[["Sex"]].values, y_train)
mdl_log_gender

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [64]:
#mdl_model.score(X_train,y_train)