In [60]:
import pandas as pd
import re
import numpy as np
import xgboost 
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#Make a copy of test for later use. 
test_orig = test[:]

seperator = train.shape[0]

frames = [train, test]
titanic = pd.concat(frames)

#Delete cabin as it is does not contain enough information
titanic.drop('Cabin', axis=1, inplace=True)

def get_title(name):
    """
    Use a regular expression to search for a title.  Titles always consist of
    capital and lowercase letters, and end with a period.
    
    Takes a name as input and returns the title string as output
    """

    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


titanic["Title"] = titanic["Name"].apply(get_title)  #We dropped "Name" earlier. So, we use original data.

#Condense the title into smaller, and more meaningful categories.
Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royal",
                        "Don":        "Royal",
                        "Sir" :       "Royal",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "Countess":   "Royal",
                        "Dona":       "Royal",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royal"

                        }

def titlemap(x):
    return Title_Dictionary[x]


titanic["Title"] = titanic["Title"].apply(titlemap)


def isRare(title):
    if title == "Mr" or title == "Mrs" or title == "Master" or title == "Miss":
        return 0
    return 1

titanic["Title"] = titanic["Title"].apply(isRare)

#Combing Siblings, Spouses, Parents or children onboard to a single Family variable
titanic["Family"] = titanic["Parch"] + titanic["SibSp"]

#Sex is non-numeric data which can't be handled by our classifier. 
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0    #set male to 0 and female to 1
titanic.loc[titanic["Sex"] == "female", "Sex"] =1

titanic["Sex"] = titanic["Sex"].astype(int)



titanic["Q"] = 0
titanic.loc[titanic["Embarked"] == "Q", "Q"] = 1

titanic["S"] = 0
titanic.loc[titanic["Embarked"] == "S", "S"] = 1



titanic["Fare"].fillna(titanic["Fare"].median(), inplace=True)



def fillAges(row):
    if not(np.isnan(row['Age'])):
        return row['Age']
    
    if row['Sex']=='female' and row['Pclass'] == 1:
        if row['Title'] == 'Miss':
            return 30
        elif row['Title'] == 'Mrs':
            return 45
        elif row['Title'] == 'Officer':
            return 49
        elif row['Title'] == 'Royalty':
            return 39

    elif row['Sex']=='female' and row['Pclass'] == 2:
        if row['Title'] == 'Miss':
            return 20
        elif row['Title'] == 'Mrs':
            return 30

    elif row['Sex']=='female' and row['Pclass'] == 3:
        if row['Title'] == 'Miss':
            return 18
        elif row['Title'] == 'Mrs':                
            return 31

    elif row['Sex']=='male' and row['Pclass'] == 1:
        if row['Title'] == 'Master':
            return 6
        elif row['Title'] == 'Mr':
            return 41.5
        elif row['Title'] == 'Officer':
            return 52
        elif row['Title'] == 'Royalty':
            return 40

    elif row['Sex']=='male' and row['Pclass'] == 2:
        if row['Title'] == 'Master':
            return 2
        elif row['Title'] == 'Mr':
            return 30
        elif row['Title'] == 'Officer':
                return 41.5

    elif row['Sex']=='male' and row['Pclass'] == 3:
        if row['Title'] == 'Master':
            return 6
        elif row['Title'] == 'Mr':
            return 26
        
titanic["Age"] = titanic.apply(fillAges, axis = 1)

titanic["Child"] = 0
titanic.loc[titanic["Age"] <= 18, "Child"] = 1


#The predictors that we are going to use
predictors = ["Q","S", "Fare", "Pclass", "Sex", "Family", "Title", "Child"]

#Break the combined data set into test and train data
target = titanic["Survived"].iloc[:seperator]
train = titanic[predictors][:seperator]
test = titanic[predictors][seperator:]


#Build an ensemble of classifiers. Hyper-parameters chosen through cross validation
xgb = xgboost.XGBClassifier(learning_rate = 0.05, n_estimators=500);
logreg = LogisticRegression(penalty = 'l2', C = .5,random_state = 0)
rfc  = RandomForestClassifier(n_estimators=40,random_state=0);
lsvc = LinearSVC(random_state=0);
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svmc = svm.SVC(C = 5, probability = True)
#fit the data
xgb.fit(train, target)
svmc.fit(train, target)
logreg.fit(train, target)
rfc.fit(train,target)
lsvc.fit(train,target)

xgb_preds = xgb.predict_proba(test).transpose()[1]
svmc_preds = svmc.predict_proba(test).transpose()[1]
logreg_preds = logreg.predict_proba(test).transpose()[1]
rfc_preds = rfc.predict_proba(test).transpose()[1]
#lsvc_preds = lsvc.predict_proba(test).transpose()[1]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [61]:
#Assign different weightages to the classifiers
ensemble_preds = xgb_preds + svmc_preds + logreg_preds + rfc_preds
ensemble_preds = ensemble_preds/4
for x in range(len(ensemble_preds)):
    if ensemble_preds[x] >= 0.5:
        ensemble_preds[x] = 1
    else:
        ensemble_preds[x] = 0



results  = ensemble_preds.astype(int)

#Generate the final submission file.
submission = pd.DataFrame({"PassengerId": test_orig["PassengerId"], "Survived": results}) 
submission.to_csv("kaggle1.csv", index=False)