In [151]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [152]:
# Load the Kaggle Titanic training set. The raw frame is kept untouched and
# all later processing happens on a separate copy.
train_csv_path = '/home/david/kaggle/titanic/train.csv'
train_raw = pd.read_csv(train_csv_path)
train = train_raw.copy()
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [153]:
#Data Preprocessing
def cleanData(df, embarked_ports=('C', 'Q', 'S')):
    """Turn a raw Titanic passenger frame into an all-float feature frame.

    Steps performed:
      * strip stray whitespace from the column titles,
      * drop the 'Ticket', 'Name' and 'Cabin' columns,
      * fill missing 'Age' values with the median age of the given frame,
      * move 'PassengerId' from a column into the index,
      * encode 'Sex' as 1.0 for 'female' and 0.0 for anything else,
      * replace 'Embarked' with one indicator column per entry of
        ``embarked_ports``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is not modified; a copy is cleaned instead.
    embarked_ports : sequence of str, optional
        Port codes that get an indicator column, in output column order.
        Using a fixed list keeps the output columns identical for every
        frame (e.g. train and test), regardless of which ports occur in it.
        Rows whose 'Embarked' value is missing or not listed get 0.0 in
        every indicator column.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame indexed by PassengerId with every column cast to float.
        Null values other than 'Age' are left in place for the caller.
    """
    df = df.copy() #Never rename the columns of the caller's frame in place
    df.columns = df.columns.str.strip() #Strip any stray spaces in the column titles

    df = df.drop(['Ticket', 'Name', 'Cabin'], axis=1) #Drop columns that aren't expected to have any predictive power
    df['Age'] = df['Age'].fillna(df['Age'].median()) #Fill NaN values with the median

    df = df.set_index('PassengerId') #PassengerId becomes the index and is removed from the columns

    #Convert categorical features to dummy features
    #A single explicit indicator avoids assigning a two-column dummy frame to one column
    df['Sex'] = (df['Sex'] == 'female').astype(float)

    #Build the port indicators from the fixed list so the columns never depend on the data
    port_dummies = pd.DataFrame(
        {port: df['Embarked'] == port for port in embarked_ports},
        index=df.index,
    )
    df = pd.concat([df.drop('Embarked', axis=1), port_dummies], axis=1)

    return df.astype(float)

In [154]:
# Always clean from the untouched raw frame (via a copy) so this cell can be
# re-run: feeding the already-cleaned `train` back into cleanData would fail
# because the dropped columns no longer exist.
train = cleanData(train_raw.copy()).dropna(axis=0) # drop rows that still contain nulls
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,3.0,0.0,22.0,1.0,0.0,7.25,0.0,0.0,1.0
2,1.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0
3,1.0,3.0,1.0,26.0,0.0,0.0,7.925,0.0,0.0,1.0
4,1.0,1.0,1.0,35.0,1.0,0.0,53.1,0.0,0.0,1.0
5,0.0,3.0,0.0,35.0,0.0,0.0,8.05,0.0,0.0,1.0


In [155]:
# Show each column's dtype and non-null count for the cleaned training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
Survived    891 non-null float64
Pclass      891 non-null float64
Sex         891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
Fare        891 non-null float64
C           891 non-null float64
Q           891 non-null float64
S           891 non-null float64
dtypes: float64(10)
memory usage: 76.6 KB


In [156]:
# Separate the target column from the feature columns of the cleaned frame
y_train = train['Survived']
X_train = train.drop(columns='Survived')

#X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

In [157]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise every feature, then fit a linear SVM with hinge loss.
# random_state is fixed so that repeated runs of the notebook produce the
# same fitted model and the same reported scores.
SVCpipe = Pipeline([
    ('scaler', StandardScaler()),
    ('linearSVC', LinearSVC(C=1, loss="hinge", random_state=42))
])

SVCpipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('linearSVC', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])

In [159]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted pipeline on the same data it was trained on
train_predictions = SVCpipe.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

print("Training Accuracy: ")
print(train_accuracy)

Training Accuracy: 
0.786756453423


In [None]:
X_test = pd.read_csv('/home/david/kaggle/titanic/test.csv')
X_test_clean = cleanData(X_test)
X_test_clean = X_test_clean.fillna(0, axis=0) #replace any remaining null values with 0

# cleanData moves PassengerId into the index, so there is no PassengerId column
# to drop. Select the features by the training column names instead, which also
# guarantees the same column order the pipeline was fitted on; an indicator
# column missing from the test frame is filled with 0.
X_test_predict = X_test_clean.reindex(columns=X_train.columns, fill_value=0)

predictions = SVCpipe.predict(X_test_predict)

# The passenger ids live in the index of the cleaned frame
outputDF = pd.DataFrame({'PassengerId': X_test_clean.index, 'Survived': predictions})
outputDF = outputDF.astype(int)

# index=False keeps the file to exactly the two columns PassengerId and Survived
outputDF.to_csv('/home/david/kaggle/titanic/submission.csv', index=False)

### Trying out other approaches:

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Two hyper-parameter spaces for the forest: the first with the default
# bootstrap setting, the second with bootstrapping switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [1, 2, 4, 6, 8, 9]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [1, 2,3,4]},
   ]

# Seed both the forest and the search so the sampled candidates and the
# resulting scores are the same on every run.
forest_clf = RandomForestClassifier(random_state=42)

# This is a classification task, so candidates are ranked by accuracy rather
# than by a regression metric.
rand_search = RandomizedSearchCV(forest_clf, param_grid, cv=5, scoring='accuracy', random_state=42)

rand_search.fit(X_train, y_train)

In [None]:
# Per-candidate cross-validation results recorded by the fitted search
rand_search.cv_results_

In [None]:
# Feature importances of the best forest found by the search
# (one value per feature, in the column order of the frame the search was fitted on)
rand_search.best_estimator_.feature_importances_

In [None]:
from sklearn.model_selection import cross_val_score

# SVCpipe is a classifier, so each of the 10 folds is scored by accuracy.
# Taking the square root of a negated mean squared error over class labels
# yields a number that is neither an RMSE in any meaningful sense nor an
# accuracy, so that computation is not used here.
scores = cross_val_score(SVCpipe, X_train, y_train, scoring="accuracy", cv=10)

print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())