In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing, manifold
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

%matplotlib inline

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable between runs.
np.random.seed(8888)



# Feature building
This notebook takes the train and test data and builds features from them to be fed to models.

### Basic features

In [2]:
# Load the raw data; `train.csv` and `test.csv` are read from the working directory.
train_dat = pd.read_csv('train.csv')
test_dat = pd.read_csv('test.csv')

# Split the target off so that train and test carry the same columns.
train_labels = train_dat.QuoteConversion_Flag.values
train_dat = train_dat.drop('QuoteConversion_Flag', axis=1)

# Replace missing values with an explicit sentinel.
train_dat = train_dat.fillna(-999)
test_dat = test_dat.fillna(-999)


def _add_date_features(frame):
    """Return a copy of `frame` with calendar features derived from the quote date.

    `Original_Quote_Date` is parsed to datetime and three integer columns are
    appended, in this order: `Year`, `Month`, `Weekday` (`Weekday` is pandas'
    `dayofweek`, where Monday is 0).
    """
    out = frame.copy()
    out['Original_Quote_Date'] = pd.to_datetime(out['Original_Quote_Date'])
    # The vectorized `.dt` accessor replaces a Python-level lambda per row.
    out['Year'] = out['Original_Quote_Date'].dt.year
    out['Month'] = out['Original_Quote_Date'].dt.month
    out['Weekday'] = out['Original_Quote_Date'].dt.dayofweek
    return out


# Date features
# NOTE: a week-of-year feature existed here only as commented-out code and is
# not generated.
train_dat = _add_date_features(train_dat)
test_dat = _add_date_features(test_dat)

# Label encoding
# Each encoder is fitted on train and test together so that a category seen in
# only one of the two files still receives a code.
for c in train_dat.columns:
    if train_dat[c].dtype == 'object':
        # After fillna an object column can mix strings with the integer
        # sentinel -999; cast to str explicitly so the encoder always sees
        # uniformly typed values.
        train_vals = train_dat[c].astype(str).tolist()
        test_vals = test_dat[c].astype(str).tolist()
        lbl = preprocessing.LabelEncoder()
        lbl.fit(train_vals + test_vals)
        train_dat[c] = lbl.transform(train_vals)
        test_dat[c] = lbl.transform(test_vals)

# The quote id and the raw date are not written out as model features.
train_dat = train_dat.drop(['QuoteNumber', 'Original_Quote_Date'], axis=1)
test_dat = test_dat.drop(['QuoteNumber', 'Original_Quote_Date'], axis=1)

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('features', exist_ok=True)

train_dat.to_csv('features/basic_train.csv', index=False)
test_dat.to_csv('features/basic_test.csv', index=False)

np.save('features/train_labels.npy', train_labels)

In [3]:
# Checkpoint: restore the basic feature frames and the labels from disk so the
# notebook can be resumed from this cell.
train_labels = np.load('features/train_labels.npy')
train_dat, test_dat = map(
    pd.read_csv,
    ('features/basic_train.csv', 'features/basic_test.csv'),
)

In [4]:
# Display the first rows of the training features as a quick sanity check.
train_dat.head()

Unnamed: 0,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,CoverageField2A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,Weekday
0,1,23,0.9403,0.0006,7,1.02,0,17,23,17,...,8,-1,18,-1,10,1,0,2013,8,4
1,5,7,1.0006,0.004,4,1.2433,0,6,8,6,...,11,-1,17,-1,20,1,2,2014,4,1
2,5,7,1.0006,0.004,4,1.2433,0,7,12,7,...,21,-1,11,-1,8,1,2,2014,8,0
3,6,10,0.9769,0.0004,1,1.2665,0,3,2,3,...,10,-1,9,-1,21,1,3,2013,4,0
4,4,23,0.9472,0.0006,3,1.3045,0,8,13,8,...,10,-1,11,-1,12,1,1,2014,1,5


## Add columns with the number of -1s and zeros

In [5]:
# Per-row counts of cells equal to 0 and to -1, added as two new columns.
# NumZero is assigned first, so the -1 count is taken over a frame that
# already contains it (a count is never negative, so it cannot add a match).
for frame in (train_dat, test_dat):
    frame['NumZero'] = frame.eq(0).sum(axis=1)
    frame['NumMinusOne'] = frame.eq(-1).sum(axis=1)

for frame, out_path in (
    (train_dat, 'features/zeros_ones_train.csv'),
    (test_dat, 'features/zeros_ones_test.csv'),
):
    frame.to_csv(out_path, index=False)

In [6]:
# Checkpoint: restore the frames that include the zero / minus-one counts,
# together with the labels.
train_labels = np.load('features/train_labels.npy')
train_dat, test_dat = map(
    pd.read_csv,
    ('features/zeros_ones_train.csv', 'features/zeros_ones_test.csv'),
)

## Sum across each category

In [7]:
# One list of column names per field family; a column belongs to the first
# family whose prefix its name starts with.
general, coverage, sales = [], [], []
personal, prop, geographic = [], [], []

prefix_buckets = (
    ('Field', general),
    ('Coverage', coverage),
    ('Sales', sales),
    ('Personal', personal),
    ('Property', prop),
    ('Geographic', geographic),
)

# Families are collected from the training columns only and then reused for
# the test frame.
for feature in train_dat.columns:
    for prefix, bucket in prefix_buckets:
        if feature.startswith(prefix):
            bucket.append(feature)
            break

# Name of each new row-sum column paired with the columns it adds up.
sum_columns = (
    ('GeneralSum', general),
    ('CoverageSum', coverage),
    ('SalesSum', sales),
    ('PersonalSum', personal),
    ('PropertySum', prop),
    ('GeographicSum', geographic),
)

for frame in (train_dat, test_dat):
    for sum_name, members in sum_columns:
        frame[sum_name] = frame[members].sum(axis=1)

train_dat.to_csv('features/cat_sums_train.csv', index=False)
test_dat.to_csv('features/cat_sums_test.csv', index=False)

In [8]:
# Checkpoint: restore the frames that include the per-family sums, together
# with the labels.
train_labels = np.load('features/train_labels.npy')
train_dat, test_dat = map(
    pd.read_csv,
    ('features/cat_sums_train.csv', 'features/cat_sums_test.csv'),
)

## Trim unimportant features

In [None]:
# Load a previously saved booster. 'cat_sums_base.model' is not written by any
# cell visible in this notebook, so it must already exist in the working
# directory.
bst = xgb.Booster(model_file='cat_sums_base.model')

In [None]:
# Rank the features the booster reports a score for, highest score first.
ranked = sorted(bst.get_fscore().items(), key=lambda item: item[1], reverse=True)
scores = [score for _, score in ranked]
# Transforms feature names (f0, f1, ..., f276) to column names (SalesField8, ...)
# by treating the number after 'f' as a position in train_dat.columns.
# NOTE(review): this assumes the model was trained on a matrix whose column
# order matches the current train_dat - confirm.
cols = [train_dat.columns[int(name.replace('f', ''))] for name, _ in ranked]

plt.figure(figsize=(15, 45))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.barplot(x=scores, y=cols, orient='h')
plt.xlabel('F score')
plt.ylabel('Feature')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep every ranked column except the 20 with the lowest scores. `cols` only
# contains columns that received a score, ordered best-first.
trimmed_cols = cols[:-20]
for frame, out_path in (
    (train_dat, 'features/trimmed_train.csv'),
    (test_dat, 'features/trimmed_test.csv'),
):
    frame[trimmed_cols].to_csv(out_path, index=False)

In [None]:
# Number of rows in the test frame.
len(test_dat)

In [None]:
# Hold out the last 5% of the training rows for validation. The split is
# positional (no shuffling): the first `samp` rows train, the rest validate.
samp = int(len(train_dat)*0.95)
print(len(train_dat),'total samples')
print(samp,'used for training')

# `train_eng` was never defined anywhere in this notebook, so the cell failed
# on a fresh kernel. `train_dat` is used instead because the prediction cell
# scores `test_dat`, which carries the same columns.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
xg_train = xgb.DMatrix(train_dat.iloc[:samp].values, label=train_labels[:samp])
xg_test = xgb.DMatrix(train_dat.iloc[samp:].values, label=train_labels[samp:])

In [None]:
# Booster settings: binary classification scored by AUC.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 4,
    'max_depth': 6,
    'colsample_bytree': 0.75,
    'subsample': 0.83,
    'min_child_weight': 5,
    'eta': 0.025,
}
# Both sets are reported every round.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 10000

# `num_boost_round` and `evals` are passed by keyword: recent xgboost releases
# declare `evals` keyword-only, and the keyword form works on older ones too.
bst = xgb.train(
    param,
    xg_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=100,
)
# The file name carries the round cap, not the number of rounds actually run.
bst.save_model('pure_{}rounds.model'.format(num_round))

In [None]:
# Score the test frame with the trained booster.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
xg_pred = xgb.DMatrix(test_dat.values)

pred = bst.predict(xg_pred)

# Fill the sample submission with the predictions and write it out.
sample = pd.read_csv('sample_submission.csv')
# Item assignment always targets the column; attribute assignment would only
# create a plain Python attribute if the column were missing.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('xgb_new_01.csv', index=False)