In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 


In [2]:
pd.options.mode.chained_assignment = None 

train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

## Filtering column "mail_type"
trainVal_x = train_df[['org', 'tld', 'ccs', 'bcced', 'mail_type', 'images', 'urls', 'salutations', 'designation', 'chars_in_subject', 'chars_in_body']]
test_x = test_df[['org', 'tld', 'ccs', 'bcced', 'mail_type', 'images', 'urls', 'salutations', 'designation', 'chars_in_subject', 'chars_in_body']]

trainVal_y = train_df[['label']]

trainVal_x[['org', 'tld', 'mail_type']] = trainVal_x[['org', 'tld', 'mail_type']].fillna(value='None')
test_x[['org', 'tld', 'mail_type']] = test_x[['org', 'tld', 'mail_type']].fillna(value='None')

trainVal_x = trainVal_x.fillna(value='0')
test_x = test_x.fillna(value='0')

trainVal_x[['chars_in_subject']] = trainVal_x[['chars_in_subject']].astype('int')
test_x[['chars_in_subject']] = test_x[['chars_in_subject']].astype('int')






In [3]:
trainVal_x.describe()

Unnamed: 0,ccs,bcced,images,urls,salutations,designation,chars_in_subject,chars_in_body
count,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0
mean,0.473259,0.002869,7.875998,34.433172,0.406381,0.104246,51.52497,197178.4
std,3.178805,0.053484,305.444093,97.904682,0.49116,0.305581,33.722776,1890130.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
25%,0.0,0.0,0.0,4.0,0.0,0.0,32.0,4651.0
50%,0.0,0.0,2.0,17.0,0.0,0.0,42.0,19999.5
75%,0.0,0.0,8.0,44.0,1.0,0.0,62.0,45193.0
max,179.0,1.0,83481.0,21540.0,1.0,1.0,606.0,74381080.0


In [4]:
trainVal_x.head()

Unnamed: 0,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body
0,reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49,80027
1,edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107,2961
2,usebackpack,com,0,0,text/html,4,17,0,0,35,25149
3,granular,ai,0,0,multipart/mixed,0,0,0,0,15,635296
4,github,com,1,0,multipart/alternative,2,11,0,0,49,2355


In [5]:
## Do one hot encoding of categorical feature
feat_enc = OneHotEncoder()

# Get list of categorical variables
s = (trainVal_x.dtypes == 'object')
object_cols = list(s[s].index) 
print(object_cols)

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(trainVal_x[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(test_x[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = trainVal_x.index
OH_cols_valid.index = test_x.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = trainVal_x.drop(object_cols, axis=1)
num_X_valid = test_x.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train.head()

['org', 'tld', 'mail_type']


Unnamed: 0,ccs,bcced,images,urls,salutations,designation,chars_in_subject,chars_in_body,0,1,...,1335,1336,1337,1338,1339,1340,1341,1342,1343,1344
0,0,0,35,120,0,0,49,80027,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,7,0,0,107,2961,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,4,17,0,0,35,25149,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,0,0,0,0,0,15,635296,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,2,11,0,0,49,2355,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
train_x, val_x, train_y, val_y = train_test_split(OH_X_train, trainVal_y, random_state = 0)

In [7]:
#OH_X_train.to_csv('train_modified.csv')

In [8]:
## Train a simple decision tree classifier using featurized data
model = DecisionTreeRegressor(random_state=1)
model.fit(train_x, train_y)

DecisionTreeRegressor(random_state=1)

In [9]:
## Let's estimate our erreur with the validation set

pred_val = model.predict(val_x)
pred_val = pd.DataFrame(pred_val, columns=['label']).astype('int')

comp = pred_val.values == val_y.values

## Ratio of correct prediction over the training set:
np.mean(comp.astype('int'))

0.39912193175014965

We only classify correctly 40% of the data, which isn't that good.
Maybe more complex models like radom forest will help us get better results.

In [15]:
## I re-train the model with the entire available data.
model.fit(OH_X_train, trainVal_y)


## Make predictions
pred_y = model.predict(OH_X_valid)


## Save results to submission file
pred_df = pd.DataFrame(pred_y, columns=['label']).astype('int')
pred_df.to_csv("DT_sample_submission.csv", index=True, index_label='Id')

In [11]:
model.get_depth()

156