In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import roc_auc_score
from sklearn.tree import _tree
from sklearn.neighbors import KNeighborsClassifier as KNN

from my_functions import f, f_gini, tree_to_code
from my_functions import make_info_table, create_bins_for_column, bins_transfer

import pickle

pd.set_option('max_columns', 1000)
pd.set_option('max_rows', 1000)

In [57]:
pd.__version__

'0.22.0'

In [58]:
train = pd.read_csv('input/train.csv')
test_sub = pd.read_csv('input/test.csv')
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [60]:
train.shape, test_sub.shape

((891, 12), (418, 11))

In [61]:
def name_pref_f(s):
    if s.find(', ') >= 0:
        s2 = s[(s.find(', ') + 2):]
        s3 = s2[:(s2.find(' '))]
    else:
        0
    return s3

In [62]:
#!!! Name prefix
print(pd.concat([train['Name'], train['Name']], axis = 0).apply(lambda x: name_pref_f(x)).unique())
pd.concat([train['Name'], train['Name']], axis = 0).apply(lambda x: name_pref_f(x)).value_counts()

['Mr.' 'Mrs.' 'Miss.' 'Master.' 'Don.' 'Rev.' 'Dr.' 'Mme.' 'Ms.' 'Major.'
 'Lady.' 'Sir.' 'Mlle.' 'Col.' 'Capt.' 'the' 'Jonkheer.']


Mr.          1034
Miss.         364
Mrs.          250
Master.        80
Dr.            14
Rev.           12
Mlle.           4
Major.          4
Col.            4
the             2
Jonkheer.       2
Mme.            2
Lady.           2
Capt.           2
Ms.             2
Sir.            2
Don.            2
Name: Name, dtype: int64

In [63]:
def name_pref_f2(s):    
    s2 = np.float('NaN')
    for i, item in enumerate(['Mr.', 'Master.', 'Dr.', 'Rev.', 'Miss.', 'Mrs.']):
        if s.find(item) >= 0:
            s2 = i + 1
    return s2

In [64]:
train['Name_prefix'] = train['Name'].apply(lambda x: name_pref_f2(x))
test_sub['Name_prefix'] = test_sub['Name'].apply(lambda x: name_pref_f2(x))

In [65]:
target_col = 'Survived'
columns_to_drop = ['Name', 'Ticket', 'Cabin']

train = train.drop(columns_to_drop, axis = 1)
test_sub = test_sub.drop(columns_to_drop, axis = 1)

In [66]:
X_init = pd.get_dummies(train.drop(target_col, axis = 1))
X_sub = pd.get_dummies(test_sub)
Y_init = train[target_col]

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_init, Y_init, test_size = 0.4)

In [68]:
for item in [X_train, X_test, y_train, y_test, X_sub]:
    print('shape: {}'.format(item.shape))

shape: (534, 12)
shape: (357, 12)
shape: (534,)
shape: (357,)
shape: (418, 12)


In [69]:
X_train.columns == X_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True], dtype=bool)

In [70]:
X_train.columns == X_sub.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True], dtype=bool)

In [71]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Name_prefix,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
570,571,2,62.0,0,0,10.5,1.0,0,1,0,0,1
787,788,3,8.0,4,1,29.125,2.0,0,1,0,1,0
74,75,3,32.0,0,0,56.4958,1.0,0,1,0,0,1
113,114,3,20.0,1,0,9.825,5.0,1,0,0,0,1
635,636,2,28.0,0,0,13.0,5.0,1,0,0,0,1


In [73]:
'''X_train.to_pickle('pkl/X_train.pkl')
X_test.to_pickle('pkl/X_test.pkl')
y_train.to_pickle('pkl/y_train.pkl')
y_test.to_pickle('pkl/y_test.pkl')
X_sub.to_pickle('pkl/X_sub.pkl')'''

In [74]:
'''X_train = pd.read_pickle('pkl/X_train.pkl')
X_test = pd.read_pickle('pkl/X_test.pkl')
y_train = pd.read_pickle('pkl/y_train.pkl')
y_test = pd.read_pickle('pkl/y_test.pkl')
X_sub = pd.read_pickle('pkl/X_sub.pkl')'''

In [75]:
X_train = X_train.reset_index(drop = True)
X_test  = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test  = y_test.reset_index(drop = True)

In [76]:
#?
X_sub  = X_sub.reset_index(drop = True)

In [77]:
fts_corr = X_train.columns

In [78]:
'''X_train = X_train.loc[:, fts_corr]
X_test  = X_test.loc[:, fts_corr]'''

'X_train = X_train.loc[:, fts_corr]\nX_test  = X_test.loc[:, fts_corr]'

In [79]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((534, 12), (534,), (357, 12), (357,))

#### #Binning

In [80]:
X_train.head(5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Name_prefix,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,571,2,62.0,0,0,10.5,1.0,0,1,0,0,1
1,788,3,8.0,4,1,29.125,2.0,0,1,0,1,0
2,75,3,32.0,0,0,56.4958,1.0,0,1,0,0,1
3,114,3,20.0,1,0,9.825,5.0,1,0,0,0,1
4,636,2,28.0,0,0,13.0,5.0,1,0,0,0,1


In [81]:
#Make a dictionary with column names as keys 
#and names of binned columns as values

#!!! Long operation (3 min) - uncomment if there is no pikcle with this dictionary
bin_col_names = {}

for col in X_train.columns:
    bin_col_names[col] = create_bins_for_column(X_train, col, q_qty = 8)[3]

with open('bin_col_names.pickle', 'wb') as handle:
    pickle.dump(bin_col_names, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('bin_col_names.pickle', 'rb') as handle:
    bin_col_names = pickle.load(handle)

In [82]:
bin_col_names

{'Age': ['Age=NaN',
  'Age=(0.419, 15.25]',
  'Age=(15.25, 20.0]',
  'Age=(20.0, 24.0]',
  'Age=(24.0, 28.0]',
  'Age=(28.0, 32.0]',
  'Age=(32.0, 37.0]',
  'Age=(37.0, 47.0]',
  'Age=(47.0, 80.0]'],
 'Embarked_C': ['Embarked_C=NaN', 'Embarked_C=0', 'Embarked_C=(0.999, 1.0]'],
 'Embarked_Q': ['Embarked_Q=NaN', 'Embarked_Q=0', 'Embarked_Q=(0.999, 1.0]'],
 'Embarked_S': ['Embarked_S=NaN', 'Embarked_S=(-0.001, 0.0]', 'Embarked_S=1'],
 'Fare': ['Fare=NaN',
  'Fare=(-0.001, 7.75]',
  'Fare=(7.75, 7.896]',
  'Fare=(7.896, 9.577]',
  'Fare=(9.577, 14.454]',
  'Fare=(14.454, 23.519]',
  'Fare=(23.519, 30.5]',
  'Fare=(30.5, 69.55]',
  'Fare=(69.55, 512.329]'],
 'Name_prefix': ['Name_prefix=NaN',
  'Name_prefix=1.0',
  'Name_prefix=2.0',
  'Name_prefix=(2.999, 3.5]',
  'Name_prefix=(3.5, 4.0]',
  'Name_prefix=5.0',
  'Name_prefix=6.0'],
 'Parch': ['Parch=NaN',
  'Parch=0',
  'Parch=1',
  'Parch=2',
  'Parch=(2.999, 3.667]',
  'Parch=(3.667, 4.333]',
  'Parch=(4.333, 5.0]'],
 'PassengerId': ['Pa

In [83]:
def make_cross_variable(df, bin_col_names, col1, col2):
    if col1 == col2:
        print('Features are the same! Operation stopped.')
        return
        
    tmp_df1 = bins_transfer(df, col1, bin_col_names[col1])[0]
    tmp_df2 = bins_transfer(df, col2, bin_col_names[col2])[0]
        
    df3 = pd.DataFrame()
        
    for bc1 in bin_col_names[col1]:
        for bc2 in bin_col_names[col2]:
            bin_cross_col_name = bc1 + '*' + bc2
            df3[bin_cross_col_name] = ((tmp_df1[bc1] == 1) & (tmp_df2[bc2] == 1)).astype(np.int64)

    assert df3.sum().sum() == df3.shape[0]
    
    return df3

In [84]:
ignore_col_list = ['PassengerId']

In [85]:
X_fit_pre = pd.concat([X_train, X_test], axis = 0).sort_index()
y_fit_pre = pd.concat([y_train, y_test], axis = 0).sort_index()
X_fit_pre.shape, y_fit_pre.shape

((891, 12), (891,))

In [158]:
#Make cross-variables
from time import time

used_cols = []
used_cross_cols = []
current_dfs = []
max_df_train = None
max_df_test = None
current_df_train = None
current_df_test = None

log_file = open('log_{}.txt'.format(np.round(time()).astype(np.int64)), 'w')

for iteration in range(7):
    
    print('\n\nITERATION: {}'.format(iteration + 1), end = '\n\n')
    log_file.write('\n\nITERATION: {}\n'.format(iteration + 1))
    
    max_accuracy_fit_pre = 0
    max_accuracy_test = 0
    max_accuracy_ft = None
    max_col1, max_col2 = None, None
    
    iter2 = 0
    checked_cols_lst = []
    
    for col1 in X_fit_pre.columns:
        for col2 in X_fit_pre.columns:
            if col1 == col2 or col1 in used_cols or col2 in used_cols or [col1, col2] in checked_cols_lst \
                                            or col1 in ignore_col_list or col2 in ignore_col_list:
                continue
                
            checked_cols_lst.append([col1, col2])
            
            iter2 += 1
            
            cross_col_name = '{}*{}'.format(col1, col2)

            tmp_df_train = make_cross_variable(X_fit_pre, bin_col_names, col1, col2)
            tmp_df_test  = make_cross_variable(X_test,  bin_col_names, col1, col2)
            
            

            if iteration > 0:
                tmp_df_train = pd.concat([current_dfs[0],  tmp_df_train], axis = 1)
                tmp_df_test  = pd.concat([current_dfs[1],  tmp_df_test], axis = 1)
            
            clf = KNN(8)
            
            
            #clf.fit(tmp_df_train, y_fit_pre.sort_index())
            #y_pred_train = clf.predict(tmp_df_train)
            #y_pred_test  = clf.predict(tmp_df_test)

            accuracy_fit_pre = cross_val_score(clf, tmp_df_train, y_fit_pre, cv = 5)
            accuracy_fit_pre_mean = np.mean(np.array(accuracy_fit_pre))
            #accuracy_test  = clf.score(tmp_df_test, y_test)

            #print('{}. cross_col_name: {}'.format(iter2, cross_col_name))
            log_file.write('{}. cross_col_name: {}\n'.format(iter2, cross_col_name))
            #print('accuracy_fit_pre: {}\naccuracy_test: {}'.format(accuracy_fit_pre, accuracy_test), end = '\n\n')
            log_file.write('accuracy_fit_pre: {}, \naccuracy_fit_pre_mean: {}\n\n'\
                                                .format(accuracy_fit_pre, accuracy_fit_pre_mean))

            if accuracy_fit_pre_mean > max_accuracy_test:
                max_col1 = col1
                max_col2 = col2
                
                max_accuracy_fit_pre = accuracy_fit_pre
                max_accuracy_test  = accuracy_fit_pre_mean
                
                max_accuracy_ft = cross_col_name
                
                max_df_train = tmp_df_train
                max_df_test = tmp_df_test


    current_df_train = max_df_train.copy()
    current_df_test  = max_df_test.copy()
        
    current_dfs = [current_df_train, current_df_test]
        
    if  max_col1 is None or max_col2 is None:
        break

    used_cols.extend([max_col1, max_col2])
    used_cross_cols.append(max_accuracy_ft)
    
    #print('used cols: {}'.format(used_cols), end = '\n\n')
    log_file.write('used cols: {}\n'.format(used_cols))
    #print('used_cross_cols cols: {}'.format(used_cross_cols), end = '\n\n')
    log_file.write('used_cross_cols cols: {}\n\n'.format(used_cross_cols))
    
    #print('MAX: accuracy_fit_pre: {}, accuracy_test: {}'.format(max_accuracy_fit_pre, max_accuracy_test), end = '\n\n')
    log_file.write('max_accuracy_fit_pre: {}, max_accuracy_test: {}\n\n'.format(max_accuracy_fit_pre, max_accuracy_test))



ITERATION: 1



ITERATION: 2



ITERATION: 3



ITERATION: 4



ITERATION: 5



ITERATION: 6



In [159]:
log_file.close()

In [196]:
#list copied from the log (with the best gini)


great_cross_features = ['SibSp', 'Name_prefix', 
                        'Sex_female', 'Pclass', 
                        'Sex_male', 'Age']

In [197]:
#bins dict for submission
bin_col_names_sub = {}

for item in great_cross_features:
    bin_col_names_sub[item] = bin_col_names[item]

In [198]:
bin_col_names_sub

{'Age': ['Age=NaN',
  'Age=(0.419, 15.25]',
  'Age=(15.25, 20.0]',
  'Age=(20.0, 24.0]',
  'Age=(24.0, 28.0]',
  'Age=(28.0, 32.0]',
  'Age=(32.0, 37.0]',
  'Age=(37.0, 47.0]',
  'Age=(47.0, 80.0]'],
 'Name_prefix': ['Name_prefix=NaN',
  'Name_prefix=1.0',
  'Name_prefix=2.0',
  'Name_prefix=(2.999, 3.5]',
  'Name_prefix=(3.5, 4.0]',
  'Name_prefix=5.0',
  'Name_prefix=6.0'],
 'Pclass': ['Pclass=NaN', 'Pclass=1', 'Pclass=(1.999, 2.0]', 'Pclass=3'],
 'Sex_female': ['Sex_female=NaN', 'Sex_female=0', 'Sex_female=(0.999, 1.0]'],
 'Sex_male': ['Sex_male=NaN', 'Sex_male=(-0.001, 0.0]', 'Sex_male=1'],
 'SibSp': ['SibSp=NaN',
  'SibSp=0',
  'SibSp=1',
  'SibSp=2',
  'SibSp=3',
  'SibSp=4',
  'SibSp=(4.999, 5.0]',
  'SibSp=8']}

In [199]:
X_fit = pd.concat([X_train, X_test], axis = 0).sort_index()
y_fit = pd.concat([y_train, y_test], axis = 0).sort_index()
X_fit.shape, y_fit.shape

((891, 12), (891,))

In [200]:
n_cross_features = len(great_cross_features) #3
n_cross_features

6

In [210]:
#Make a model
used_cols = []
used_cross_cols = []
current_dfs = []
max_df_train = None
max_df_test = None
current_df_train = None
current_df_test = None

#log_file = open('log.txt', 'w')

iter2 = 0
for iteration in range(n_cross_features):
    
    max_accuracy_train = 0
    max_accuracy_test = 0
    max_accuracy_ft = None
    max_col1, max_col2 = None, None
    
    checked_cols_lst = []
    
    for col1 in great_cross_features[(iteration * 2) : (iteration * 2 + 1)]:
        for col2 in great_cross_features[(iteration * 2 + 1) : (iteration * 2 + 2)]:
                
            checked_cols_lst.append([col1, col2])
            
            iter2 += 1
            
            cross_col_name = '{}*{}'.format(col1, col2)

            tmp_df_train = make_cross_variable(X_fit, bin_col_names, col1, col2)
            tmp_df_test  = make_cross_variable(X_sub,  bin_col_names, col1, col2)
            
            if iteration > 0:
                tmp_df_train = pd.concat([current_dfs[0],  tmp_df_train], axis = 1)
                tmp_df_test  = pd.concat([current_dfs[1],  tmp_df_test], axis = 1)
            
            clf = KNN(9)
            
            
            
            clf.fit(tmp_df_train, y_fit.sort_index())
            y_pred_train = clf.predict(tmp_df_train)
            y_pred_test  = clf.predict(tmp_df_test)

            accuracy_train = clf.score(tmp_df_train, y_fit)
            #accuracy_test  = 200* roc_auc_score(y_test,  y_pred_test) - 100

            print('{}. cross_col_name: {}'.format(iter2, cross_col_name))
            #log_file.write('{}. cross_col_name: {}\n'.format(iter2, cross_col_name))
            print('accuracy_train: {}'.format(accuracy_train), end = '\n\n')
            #log_file.write('accuracy_train: {}\naccuracy_test:  {}\n\n'.format(accuracy_train, accuracy_test))

            if True:
                max_col1 = col1
                max_col2 = col2
                
                max_accuracy_train = accuracy_train
                #max_accuracy_test  = accuracy_test
                
                max_accuracy_ft = cross_col_name
                
                max_df_train = tmp_df_train
                max_df_test = tmp_df_test


    current_df_train = max_df_train.copy()
    current_df_test  = max_df_test.copy()
        
    current_dfs = [current_df_train, current_df_test]
        
    if  max_col1 is None or max_col2 is None:
        break

    used_cols.extend([max_col1, max_col2])
    used_cross_cols.append(max_accuracy_ft)
    
    print('used cols: {}'.format(used_cols), end = '\n----\n\n')

1. cross_col_name: SibSp*Name_prefix
accuracy_train: 0.8114478114478114

used cols: ['SibSp', 'Name_prefix']
----

2. cross_col_name: Sex_female*Pclass
accuracy_train: 0.8002244668911336

used cols: ['SibSp', 'Name_prefix', 'Sex_female', 'Pclass']
----

3. cross_col_name: Sex_male*Age
accuracy_train: 0.8428731762065096

used cols: ['SibSp', 'Name_prefix', 'Sex_female', 'Pclass', 'Sex_male', 'Age']
----



In [211]:
X_sub.shape

(418, 12)

In [212]:
y_pred_test

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [213]:
sub1 = X_sub.loc[:, ['PassengerId']].copy()
sub1['Survived'] = np.round(y_pred_test).astype(np.int64)
sub1.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,0
9,901,0


In [214]:
gender_submission = pd.read_csv('input/gender_submission.csv')
gender_submission.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [215]:
sub1 = pd.merge(gender_submission.loc[:, ['PassengerId']], sub1, on = 'PassengerId', how = 'inner')

In [216]:
sub1.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,0
9,901,0


In [217]:
sub1.to_csv('output/sub1_cross_val_score1_knn8_9_name_prefix.csv', index = None)