In [1]:
import os

import kaggle
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import ensemble
from sklearn import tree



In [37]:
def feature_engineer(df):
    """Derive the Title, Deck, FamSize, FarePerPerson and AgeClass features.

    The frame is modified in place and the same object is returned.

    * ``Name`` is replaced by the text between its last comma and the next
      period, minus the first character of that text, and the column is
      renamed ``Title``.
    * ``Cabin`` is replaced by its first character (missing values stay
      missing) and the column is renamed ``Deck``.
    * ``FamSize`` is ``SibSp + Parch``.
    * ``FarePerPerson`` is ``Fare / (FamSize + 1)``.
    * ``AgeClass`` is ``Pclass * Age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``Cabin``, ``SibSp``, ``Parch``, ``Fare``,
        ``Pclass`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns described above.
    """
    # Title and Deck overwrite the original column before renaming so that
    # the column keeps its position in the frame.
    df['Name'] = [name.split(',')[-1].split('.')[0][1:] for name in df['Name']]
    df.rename(columns={'Name': 'Title'}, inplace=True)

    df['Cabin'] = [cabin[0] if pd.notna(cabin) else cabin for cabin in df['Cabin']]
    df.rename(columns={'Cabin': 'Deck'}, inplace=True)

    # Column-wise arithmetic: a per-row df[col][i] lookup goes through the
    # index labels and fails unless the index is exactly 0..n-1.
    df['FamSize'] = df['SibSp'] + df['Parch']
    df['FarePerPerson'] = df['Fare'] / (df['FamSize'] + 1)
    df['AgeClass'] = df['Pclass'] * df['Age']

    return df

In [21]:
def categorify(df, columns, mappings=None):
    """Replace the values of ``columns`` by integer codes, in place.

    Within a column the distinct non-missing values are numbered 1, 2, 3, ...
    in order of first appearance. Missing values stay missing, so a column
    that contains any ends up as floats; otherwise it holds integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columns : iterable of str
        Names of the columns to encode.
    mappings : dict, optional
        ``{column: {value: code}}``. When a column is already present its
        mapping is reused (values absent from it become missing); otherwise
        the mapping computed from ``df`` is stored under that column. Passing
        the same dict to several calls gives every frame the same encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in columns:
        values = df[column]
        codes = None if mappings is None else mappings.get(column)
        if codes is None:
            # dropna() keeps NaN out of the dictionary: a NaN key can only be
            # found again by object identity, which is not reliable.
            codes = {
                category: code
                for code, category in enumerate(values.dropna().unique(), start=1)
            }
            if mappings is not None:
                mappings[column] = codes
        # Series.map works on values, not index labels, and leaves anything
        # that is not a key (including NaN) as NaN.
        df[column] = values.map(codes)
    return df

In [22]:
def fill_missing(df):
    """Impute missing values in place and record where they were.

    For every column that contains at least one missing value:

    * numeric columns are filled with the column median;
    * other columns are filled with their most frequent value (looking up
      the median of such a column used to raise ``KeyError('50%')``);
    * a boolean ``<column>Missing`` column is appended, True where the value
      was originally missing.

    Columns without missing values are left untouched and get no indicator.
    A column with no usable fill value (for example entirely missing) keeps
    its missing values but still gets its indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Snapshot the column names: indicator columns are appended while looping.
    for column in list(df.columns):
        was_missing = df[column].isna()
        if not was_missing.any():
            continue

        if pd.api.types.is_numeric_dtype(df[column]):
            fill_value = df[column].median()
        else:
            most_frequent = df[column].mode()
            fill_value = most_frequent.iloc[0] if len(most_frequent) else np.nan

        if pd.notna(fill_value):
            df[column] = df[column].fillna(fill_value)
        df[f'{column}Missing'] = was_missing.to_numpy()
    return df

In [23]:
def integerize(df, columns):
    """Cast every column named in ``columns`` to ``'int'``.

    The conversion is written back into ``df`` itself, and that same frame
    is returned.
    """
    for name in columns:
        as_integers = df[name].astype('int')
        df[name] = as_integers
    return df

In [24]:
# Directory holding train.csv / test.csv and receiving submission.csv.
# Set the TITANIC_DATA_DIR environment variable to run the notebook on another
# machine; without it the original location is used.
path = os.environ.get('TITANIC_DATA_DIR', '/home/francisco/workspace/titanic_kaggle/titanic')

In [38]:
# Read the train and test CSV files from the data directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_file, low_memory=False)
    for csv_file in (f'{path}/train.csv', f'{path}/test.csv')
)

In [39]:
# Run the same feature derivation on both frames (train first, then test).
train_df, test_df = (feature_engineer(frame) for frame in (train_df, test_df))

In [40]:
# Remove the Ticket column from both frames.
train_df = train_df.drop(columns=['Ticket'])
test_df = test_df.drop(columns=['Ticket'])

In [43]:
# Column groups. `cat` is the list later cast to integers; its first entry
# ('Survived') is skipped for the test frame via cat[1:].
cont = [
    'Age',
    'FarePerPerson',
]
cat = [
    'Survived',
    'Pclass',
    'Title',
    'Sex',
    'SibSp',
    'Parch',
    'Deck',
    'Embarked',
    'FamSize',
    'AgeClassMissing',
    'AgeMissing',
    'FareMissing',
    'FarePerPersonMissing',
    'DeckMissing',
    'EmbarkedMissing',
]

In [44]:
# Text-valued columns that are replaced by integer codes.
text_columns = ['Title', 'Sex', 'Deck', 'Embarked']

# Encode train and test as ONE frame. categorify numbers the values of a
# column in order of first appearance, so encoding the two frames separately
# gives the same label different codes in train and in test.
n_train = len(train_df)
stacked = pd.concat([train_df, test_df], ignore_index=True, sort=False)
stacked = categorify(stacked, text_columns)
train_df = stacked.iloc[:n_train].reset_index(drop=True)
# The test rows only gained an all-missing Survived column from the concat.
test_df = stacked.iloc[n_train:].drop(columns=['Survived']).reset_index(drop=True)

train_df = fill_missing(train_df)
test_df = fill_missing(test_df)

# fill_missing only adds a '<column>Missing' flag where a frame actually has
# missing values, so give each frame the flags listed in `cat` that it lacks.
missing_flags = [column for column in cat if column.endswith('Missing')]
for frame in (train_df, test_df):
    for flag in missing_flags:
        if flag not in frame.columns:
            frame[flag] = False

train_df = integerize(train_df, cat)
test_df = integerize(test_df, cat[1:])

# The model cells slice features by position (.values[:, 2:] / .values[:, 1:]),
# and the flag columns were appended in a different order in each frame.
# Put the test columns in train order (minus the target) so positions match.
test_df = test_df[[column for column in train_df.columns if column != 'Survived']]

In [47]:
# clf = tree.DecisionTreeClassifier()
# random_state pins the bootstrap samples and feature draws so a re-run gives
# the same forest; n_jobs=-1 builds the trees on all available cores.
rf = ensemble.RandomForestClassifier(n_estimators=100000, random_state=42, n_jobs=-1)

In [48]:
# clf.fit(train_df.values[:,2:],train_df.values[:,1])
# Fit on every column from position 2 onwards, with column 1 as the target.
# NOTE(review): the slices are positional, so this presumably relies on the
# frame starting with PassengerId, Survived — confirm against the CSV layout.
rf.fit(train_df.values[:,2:],train_df.values[:,1])

RandomForestClassifier(n_estimators=100000)

In [49]:
# preds_clf = clf.predict(test_df.values[:,1:])
# Predict from every test column except the first, sliced by position.
# NOTE(review): this only matches the fit above if the remaining test columns
# are in the same order as the train feature columns — verify.
preds_rf = rf.predict(test_df.values[:,1:])

In [50]:
# preds_df = pd.DataFrame()
# Empty frame that will hold the submission columns.
preds_rf_df = pd.DataFrame()

In [51]:
# preds_df['PassengerId'] = test_df['PassengerId'].values
# preds_df['Survived'] = preds_clf.astype('int')
# One row per test passenger: the id copied from the test frame and the
# forest's prediction cast to int.
preds_rf_df['PassengerId'] = test_df['PassengerId'].values
preds_rf_df['Survived'] = preds_rf.astype('int')

In [52]:
# preds_rf_df

In [53]:
# Write the predictions to submission.csv in the data directory, without the
# DataFrame index as an extra column.
preds_rf_df.to_csv(f'{path}/submission.csv',index=False)

***

In [54]:
# Predict on the same rows and columns the forest was fitted on.
preds_train = rf.predict(train_df.values[:,2:])

In [55]:
# Ground-truth labels for the training rows.
gt = train_df['Survived'].values

In [56]:
# Fraction of training rows whose prediction equals the label. This is
# accuracy on the data the model was fitted to, so it says how closely the
# forest reproduces its training set, not how it does on unseen passengers.
np.count_nonzero(preds_train == gt)/len(gt)

0.9876543209876543