In [66]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


In [67]:
# Columns used as model inputs; the target column 'label' is handled separately.
FEATURE_COLS = ['org', 'tld', 'ccs', 'bcced', 'mail_type', 'images', 'urls',
                'salutations', 'designation', 'chars_in_subject', 'chars_in_body']
# String-valued features (these are the columns filled with the 'None' placeholder).
CATEGORICAL_COLS = ['org', 'tld', 'mail_type']


def _prepare_features(df):
    """Return a cleaned, independent copy of the feature columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least every column in ``FEATURE_COLS``.

    Returns
    -------
    pandas.DataFrame
        The ``FEATURE_COLS`` columns with missing categorical values replaced
        by the string ``'None'``, every other missing value replaced by the
        number ``0``, and ``chars_in_subject`` cast to ``int``.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the expected feature columns.
    """
    # .copy() detaches the selection from ``df`` so the assignments below do not
    # write into a view of the raw frame (and do not raise SettingWithCopyWarning).
    features = df[FEATURE_COLS].copy()
    features[CATEGORICAL_COLS] = features[CATEGORICAL_COLS].fillna('None')
    # Fill with a numeric 0, not the string '0': a string fill would turn any
    # numeric column that had gaps into an object column.
    features = features.fillna(0)
    features['chars_in_subject'] = features['chars_in_subject'].astype('int')
    return features


train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

## Select and clean the feature columns (same treatment for train and test)
train_x = _prepare_features(train_df)
test_x = _prepare_features(test_df)

# Target kept as a one-column DataFrame.
train_y = train_df[['label']]






In [68]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training feature columns.
train_x.describe()

Unnamed: 0,ccs,bcced,images,urls,salutations,designation,chars_in_subject,chars_in_body
count,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0,80176.0
mean,0.473259,0.002869,7.875998,34.433172,0.406381,0.104246,51.52497,197178.4
std,3.178805,0.053484,305.444093,97.904682,0.49116,0.305581,33.722776,1890130.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
25%,0.0,0.0,0.0,4.0,0.0,0.0,32.0,4651.0
50%,0.0,0.0,2.0,17.0,0.0,0.0,42.0,19999.5
75%,0.0,0.0,8.0,44.0,1.0,0.0,62.0,45193.0
max,179.0,1.0,83481.0,21540.0,1.0,1.0,606.0,74381080.0


In [69]:
# Preview the first rows of the training features, categorical columns included.
train_x.head()

Unnamed: 0,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body
0,reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49,80027
1,edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107,2961
2,usebackpack,com,0,0,text/html,4,17,0,0,35,25149
3,granular,ai,0,0,multipart/mixed,0,0,0,0,15,635296
4,github,com,1,0,multipart/alternative,2,11,0,0,49,2355


In [70]:
## One-hot encode the categorical features

# Categorical features are the columns pandas stores with object dtype.
s = (train_x.dtypes == 'object')
object_cols = list(s[s].index)
print(object_cols)

# Dense output is needed so the result can be wrapped in a DataFrame.
# handle_unknown='ignore' encodes categories that were not seen during fit
# as an all-zero row instead of raising.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the previous keyword name.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training data only, then apply the same encoding to the test data.
# Passing index= keeps the encoded rows aligned with the source frames.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_x[object_cols]),
                             index=train_x.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(test_x[object_cols]),
                             index=test_x.index)

# The encoded columns are labelled 0..k-1 as integers. Convert them to strings so
# the concatenated frame does not mix str and int column names, which newer
# scikit-learn versions reject when fitting on a DataFrame.
OH_cols_train.columns = OH_cols_train.columns.astype(str)
OH_cols_valid.columns = OH_cols_valid.columns.astype(str)

# Remove categorical columns (replaced by their one-hot encoding)
num_X_train = train_x.drop(object_cols, axis=1)
num_X_valid = test_x.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train.head()

['org', 'tld', 'mail_type']


Unnamed: 0,ccs,bcced,images,urls,salutations,designation,chars_in_subject,chars_in_body,0,1,...,1335,1336,1337,1338,1339,1340,1341,1342,1343,1344
0,0,0,35,120,0,0,49,80027,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,7,0,0,107,2961,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,4,17,0,0,35,25149,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,0,0,0,0,0,15,635296,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,2,11,0,0,49,2355,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Persist the encoded training features to 'train_modified.csv' in the working directory.
OH_X_train.to_csv('train_modified.csv')

In [None]:
## Fit a decision-tree regressor on the encoded training features and predict for the test features
# NOTE(review): this is a DecisionTreeRegressor, not a KNN classifier. The predictions are
# later cast to int and used as labels, so a classifier may be the better fit if 'label'
# is categorical -- confirm against the task definition.
# random_state=1 fixes the estimator's random seed.
model = DecisionTreeRegressor(random_state=1)
model.fit(OH_X_train, train_y)
pred_y = model.predict(OH_X_valid)

In [None]:
## Save results to submission file
# Reuse the test features' index so every 'Id' written to the file identifies the
# row that was actually predicted, even when the test index is not 0..n-1.
# astype('int') truncates the model output towards zero.
pred_df = pd.DataFrame(pred_y, columns=['label'], index=OH_X_valid.index).astype('int')
pred_df.to_csv("knn_sample_submission.csv", index=True, index_label='Id')

In [None]:
# Predict on the training data itself to see how well the fitted model reproduces it.
pred_train = model.predict(OH_X_train)
# Build the frame on train_y's index: comparing two DataFrames with `==` requires
# identical row and column labels and raises ValueError otherwise.
pred_train = pd.DataFrame(pred_train, columns=['label'], index=train_y.index).astype('int')
# Boolean frame: True where the (truncated) prediction equals the training label.
comp = pred_train == train_y

#mean_absolute_error(train_y, pred_train)

In [None]:
# Display the training labels (the one-column 'label' frame).
train_y

In [None]:
# Display the integer test-set predictions that were written to the submission file.
pred_df