In [1]:
# Exercise 3: Titanic
import pandas as pd
import numpy as np

data = pd.read_csv('titanicTrain.csv') # Reads in the training data (features plus the 'Survived' label)
trainSurvived = data['Survived'].values # Separates out the survival labels as a NumPy array
# `axis` is passed by keyword: the positional form drop(label, 1) was deprecated and later removed from pandas.
trainData = data.drop('Survived', axis=1) # Feature frame without the label column

In [2]:
# Count the missing values in every column.  The output shows that null values are present in Age, Cabin,
# and Embarked; these will need to be handled before proceeding.
pd.isnull(trainData).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# This transformer will delete the columns 'Name', 'Ticket', and 'Cabin'
from sklearn.base import TransformerMixin, BaseEstimator
class DeleteColumns(TransformerMixin, BaseEstimator):
    """Transformer that drops a fixed set of columns from a DataFrame.

    Inherits from BaseEstimator as well as TransformerMixin so that the
    transformer exposes get_params/set_params and can be cloned.

    Parameters
    ----------
    columns : str or sequence of str, default ('Name', 'Ticket', 'Cabin')
        Label(s) of the columns to remove.  The defaults are dropped because
        Name is redundant with the ID column, Ticket is not pertinent, and
        Cabin is too sparse to be useful.
    """

    def __init__(self, columns=('Name', 'Ticket', 'Cabin')):
        # Stored unmodified, as BaseEstimator.get_params expects.
        self.columns = columns

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y = None):
        """Return a copy of X without the configured columns.

        Raises KeyError if any of the configured columns is absent from X.
        """
        # A lone string is treated as a single label rather than a sequence of characters.
        if isinstance(self.columns, str):
            to_drop = [self.columns]
        else:
            to_drop = list(self.columns)
        # `axis` is given by keyword: the positional form drop(label, 1) was removed from pandas.
        return X.drop(to_drop, axis=1)


In [4]:
# This transformer will remove any rows with missing categorical attributes ('Embarked' and 'Sex')
class DeleteRows(TransformerMixin, BaseEstimator):
    """Transformer that removes rows with missing values in selected columns.

    Inherits from BaseEstimator as well as TransformerMixin so that the
    transformer exposes get_params/set_params and can be cloned.

    Parameters
    ----------
    columns : str or sequence of str, default ('Embarked', 'Sex')
        Label(s) of the columns that must be non-null for a row to be kept.

    Notes
    -----
    Only X is filtered; `y` is ignored.  Any label column must therefore
    still be part of X when this transformer runs, otherwise the features and
    labels will no longer line up.
    """

    def __init__(self, columns=('Embarked', 'Sex')):
        # Stored unmodified, as BaseEstimator.get_params expects.
        self.columns = columns

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y = None):
        """Return the rows of X that have no null in any of the configured columns.

        Raises KeyError if any of the configured columns is absent from X.
        """
        # A lone string is treated as a single label rather than a sequence of characters.
        if isinstance(self.columns, str):
            required = [self.columns]
        else:
            required = list(self.columns)
        return X.dropna(subset=required) # Returns the filtered DataFrame (row order and index are preserved)
    
# Sanity check: run the row filter on the training features and recount the nulls per column.
rowDelete = DeleteRows()
trainRowDelete = rowDelete.transform(trainData)
pd.isnull(trainRowDelete).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [5]:
# This transformer will select the appropriate attributes from the dataframe for each pipeline
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a subset of DataFrame columns and hand them on as a NumPy array.

    Parameters
    ----------
    attribute_names : column label(s) used to index the incoming DataFrame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y = None):
        """Stateless: there is nothing to fit, so simply return self."""
        return self

    def transform(self, X):
        """Return the values of the selected columns of X."""
        selected = X[self.attribute_names] # Index the frame by the stored column titles
        return selected.values

In [6]:
# This will create the Pipeline for the cleaning of the data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
import category_encoders as ce

# Columns holding numerical attributes; these feed the numerical pipeline
trainData_Num = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
]
# Columns holding categorical attributes; these feed the categorical pipeline
trainData_Cat = [
    'Sex',
    'Embarked',
]

In [7]:
# PreCleaning Pipeline
    # Step 1 drops the columns we do not want (Name, Ticket, Cabin).
    # Step 2 drops the rows whose Embarked or Sex value is null (nulls in other columns, e.g. Age, are kept).
pre_pipeline = Pipeline(steps = [
    ('colDelete', DeleteColumns()),
    ('rowDelete', DeleteRows()),
])

In [8]:
# The Numerical Pipeline
# Steps, in order:
#   selector   - picks out the numerical columns
#   imputer    - fills missing values with the column median
#   std_scaler - standardizes the values
num_pipeline = Pipeline(steps = [
    ('selector', DataFrameSelector(trainData_Num)),
    ('imputer', Imputer(strategy = 'median')),
    ('std_scaler', StandardScaler()),
])
# Fit the pipeline on the training features and keep the transformed result
numTrain = num_pipeline.fit_transform(trainData)

In [9]:
# The Categorical Pipeline
# Steps, in order:
#   selector - picks out the categorical columns
#   1hot     - one-hot encodes them so that they become numerical
cat_pipeline = Pipeline(steps = [
    ('selector', DataFrameSelector(trainData_Cat)),
    ('1hot', ce.OneHotEncoder()),
])
# Fit the pipeline on the training features and keep the transformed result
catTrain = cat_pipeline.fit_transform(trainData)

In [10]:
# Full Pipeline
from sklearn.pipeline import FeatureUnion
# Runs the numerical and categorical pipelines on the same input and concatenates their outputs side by side
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [11]:
titanic = pd.read_csv('titanicTrain.csv') # Reads in the training data (features plus the 'Survived' label)
# Drops the unwanted columns and the rows with null 'Sex' or 'Embarked'.  'Survived' is still part of the
# frame at this point, so the labels are filtered together with the features and stay aligned.
titanicClean = pre_pipeline.fit_transform(titanic)

y = titanicClean['Survived'] # Separates out the survival data from the input data
# `axis` is passed by keyword: the positional form drop(label, 1) was deprecated and later removed from pandas.
X = titanicClean.drop('Survived', axis=1)

# NOTE(review): the imputer and scaler inside full_pipeline are fitted here on every row, i.e. before the
# train/test split performed in the following cell, so their statistics also see the held-out rows.
X_prepared = full_pipeline.fit_transform(X) # Sends the pre-cleaned data through the pipeline

In [12]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split (and therefore every score reported below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, random_state = 42)

In [13]:
# Now we must choose a model to predict the likelihood of survival.  Because we are only deciding whether or not a
# passenger survived, we will use a binary classifier.

In [14]:
# Ensemble Classifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [15]:
# The grid below searches over both 'l1' and 'l2' penalties.  The solver is pinned to 'liblinear' because it
# supports both, whereas the default solver of newer scikit-learn releases ('lbfgs') rejects 'l1'.
log_clf = LogisticRegression(solver = 'liblinear', random_state = 42)

param_grid = [
    {'penalty': ['l1', 'l2'], 'C':[0.01, 0.1, 1.0, 10.0, 100.0]}
]
grid_search = GridSearchCV(log_clf, param_grid, cv = 5, scoring = 'accuracy') # 5-fold CV, ranked by accuracy
grid_search.fit(X_train, y_train)
log_clf = grid_search.best_estimator_ # Keeps the best configuration, refitted on the whole training set

In [16]:
# n_jobs is an execution setting rather than a hyperparameter, so it is set on the estimator instead of being
# searched over.  random_state makes the forest (and the search result) reproducible.
rf_clf = RandomForestClassifier(n_jobs = -1, random_state = 42)

param_grid = [
    {'n_estimators':[500, 1000], 'max_features': [3, 5, 7, 9]}
]

grid_search = GridSearchCV(rf_clf, param_grid, cv = 5, scoring = 'accuracy') # 5-fold CV, ranked by accuracy
grid_search.fit(X_train, y_train)
rf_clf = grid_search.best_estimator_ # Keeps the best configuration, refitted on the whole training set

In [None]:
# probability=True is a fixed requirement (the soft-voting ensemble built later needs class probabilities), so
# it is set on the estimator instead of being searched over.  random_state makes the result reproducible.
svm_clf = SVC(probability = True, random_state = 42)

param_grid = [
    {'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'kernel': ['rbf', 'poly', 'sigmoid'],
     'gamma': [0.01, 0.1, 1.0, 10.0, 100.0, 'auto']}
]
# 90 parameter combinations x 5 folds = 450 fits, so the candidates are evaluated in parallel on all cores.
grid_search = GridSearchCV(svm_clf, param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
svm_clf = grid_search.best_estimator_ # Keeps the best configuration, refitted on the whole training set

In [None]:
# loss='log' (logistic regression) and n_jobs are fixed settings, so they are set on the estimator instead of
# being searched over; the log loss is what allows the soft-voting ensemble built later to obtain class
# probabilities.  SGD is stochastic, so random_state is fixed to make the result reproducible.
sgd_clf = SGDClassifier(loss = 'log', n_jobs = -1, random_state = 42)

param_grid = [
    {'penalty': ['none', 'l1', 'l2', 'elasticnet'], 'alpha': [0.00001, 0.000001]}
]
grid_search = GridSearchCV(sgd_clf, param_grid, cv = 5, scoring = 'accuracy') # 5-fold CV, ranked by accuracy
grid_search.fit(X_train, y_train)
sgd_clf = grid_search.best_estimator_ # Keeps the best configuration, refitted on the whole training set

In [None]:
from sklearn.metrics import accuracy_score

# Soft voting: the ensemble bases its prediction on the highest class probability across its members
voting_clf = VotingClassifier(
    voting = 'soft',
    estimators = [('lr', log_clf), ('rf', rf_clf), ('svc', svm_clf), ('sgd', sgd_clf)])

# Fit every individual classifier, and finally the ensemble, on the training split and report the accuracy
# each one achieves on the held-out test split.
for clf in [log_clf, rf_clf, svm_clf, sgd_clf, voting_clf]:
    y_pred = clf.fit(X_train, y_train).predict(X_test) # fit() returns the estimator itself, so the calls chain
    print(type(clf).__name__, accuracy_score(y_test, y_pred))