In [2]:
%load_ext autoreload
%autoreload 2

In [231]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from dumbrain.ml.kaggle.download import download

%matplotlib inline

In [4]:
data_files = download( 'titanic' )
test_file, train_file, example_output = list( map( lambda file: 'data/' + file, data_files ) )

In [5]:
all_train_data = pd.read_csv( train_file )
all_test_data = pd.read_csv( test_file )
all_example_data = pd.read_csv( example_output )

In [6]:
import abc

class DataCleaner( metaclass=abc.ABCMeta ):
    def __init__( self ):
        pass

    @abc.abstractmethod
    def clean( self, data ):
        pass

class ColumnCleaner( DataCleaner ):
    def __init__( self, column_name ):
        super( ColumnCleaner, self ).__init__()
        self.column_name = column_name

class DummyColumnCleaner( ColumnCleaner ):
    def __init__( self, column_name, all_values ):
        super( DummyColumnCleaner, self ).__init__( column_name )
        self.all_values = all_values

    def clean( self, data ):
        data = data.copy()
        dummy_cols = pd.get_dummies( data[ self.column_name ], prefix='' )
        dummy_cols = dummy_cols.add_suffix( '_' + self.column_name )
        data = data.join( dummy_cols, rsuffix=self.column_name )
        data = data.drop( self.column_name, axis=1 )
        return data

class RemoveColumnCleaner( ColumnCleaner ):
    def clean( self, data ):
        if self.column_name in data.columns:
            return data.drop( self.column_name, axis=1 )
        return data

class FilterDataCleaner( DataCleaner ):
    def __init__( self, filter_func ):
        super( FilterClenaer, self ).__init__( column_name )
        self.filter_func = filter_func

    def clean( self, data ):
        return data[ self.filter_func( data ) ]

class FillNaNDataCleaner( DataCleaner ):
    def __init__( self, new_value ):
        super( FillNaNDataCleaner, self ).__init__()
        self.new_value = new_value

    def clean( self, data ):
        return data.fillna( self.new_value )

In [204]:
class NameSentimentColumnCleaner( ColumnCleaner ):
    def __init__( self, column_name, train_data, score_column_name, omit=[], minlen=2 ):
        super( NameSentimentColumnCleaner, self ).__init__( column_name )
        self.train_data = train_data
        self.score_column_name = score_column_name
        self.omit = omit
        self.minlen = minlen
        self.generateSentiments()

    def tokenize( self, string ):
        strings_to_remove = [ '.', ',', '(', ')', "'", '"' ] + self.omit
        for string_to_remove in strings_to_remove:
            string = string.replace( string_to_remove, '' )
        string = string.lower()
        tokens = string.split( ' ' )
        filtered_tokens = []
#         for token in tokens:
#             if token
        tokens = filter( lambda item: len( item ) >= self.minlen, tokens )
        return list( tokens )

    def generateSentiments( self ):
        tokens = []
        scores = []
        for i, row in self.train_data.iterrows():
            for token in self.tokenize( row[ self.column_name ] ):
                tokens.append( token )
                scores.append( row[ self.score_column_name ] )
        df = pd.DataFrame( { 'tokens': tokens, 'scores': scores } )
        grouped = df.groupby( 'tokens' ).mean()
        grouped = grouped[ df.groupby( 'tokens' ).count()[ 'scores' ] > 5 ]
        self.sentiments = grouped

    def clean( self, data ):
        data = data.copy()
        
        scores = []
        for i, row in data.iterrows():
            tokens = self.tokenize( row[ self.column_name ] )
            score = self.sentiments.iloc[ self.sentiments.index.isin( tokens ) ].mean()[ 'scores' ]
            scores.append( score )
    
        data[ self.column_name + '_score' ] = scores
        return data

In [205]:
train_data_uncleaned = all_train_data.sample( frac=0.8, random_state=4111 )
validate_data_uncleaned = all_train_data.drop( train_data.index )

In [250]:
cleaners = [
    RemoveColumnCleaner( 'PassengerId' ),
#     NameSentimentColumnCleaner( 'Name', train_data_uncleaned, 'Survived', omit=[ 'mrs', 'miss', 'mr' ] ),
    RemoveColumnCleaner( 'Name' ),  
    DummyColumnCleaner( 'Sex', [ 'male', 'female' ] ),
    RemoveColumnCleaner( 'Sex' ),
    DummyColumnCleaner( 'Pclass', [ 1, 2, 3 ] ),
    RemoveColumnCleaner( 'Ticket' ),                     # Todo: Use this data
    RemoveColumnCleaner( 'Cabin' ),                      # Todo: Use this data
    DummyColumnCleaner( 'Embarked', [ 'S', 'C', 'Q' ] ),
    FillNaNDataCleaner( 0 )
]

def cleanData( _cleaners, _data ):
    for cleaner in _cleaners:
        _data = cleaner.clean( _data )
    return _data

train_data = cleanData( cleaners, train_data_uncleaned )
validate_data = cleanData( cleaners, validate_data_uncleaned )

In [251]:
model = RandomForestClassifier()

In [252]:
train_data.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', '_female_Sex', '_male_Sex',
       '_1_Pclass', '_2_Pclass', '_3_Pclass', '_C_Embarked', '_Q_Embarked',
       '_S_Embarked'],
      dtype='object')

In [253]:
model.fit( train_data.drop( 'Survived', axis=1 ), train_data[ 'Survived' ] )



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [254]:
model.score( validate_data.drop( 'Survived', axis=1 ), validate_data[ 'Survived' ] )

0.8258426966292135

In [255]:
test_data_cleaned = cleanData( cleaners, all_test_data )
predicted = model.predict( test_data_cleaned )

output = pd.DataFrame( { 'PassengerId': all_test_data[ 'PassengerId' ], 'Survived': predicted } )
output = output.set_index( 'PassengerId' )

In [256]:
output.to_csv( 'data/results.csv' )