### Introduction

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
# load data
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [3]:
cnn_train_df = pd.read_csv("cat_countvec_train.csv")
cnn_test_df = pd.read_csv("cat_countvec_test.csv")

In [4]:
cnn2_train_df = pd.read_csv("cnn_train.csv")
cnn2_test_df = pd.read_csv("cnn_test.csv")

In [5]:
train_df = pd.merge(train_df, cnn_train_df, on='User_ID', how='left')
test_df = pd.merge(test_df, cnn_test_df, on='User_ID', how='left')

In [6]:
train_df = pd.merge(train_df, cnn2_train_df, on='User_ID', how='left')
test_df = pd.merge(test_df, cnn2_test_df, on='User_ID', how='left')

In [7]:
train_df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,happy,not_happy,cnn_target_1,cnn_target_2
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0.044895,0.955105,0.206745,0.793256
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0.226909,0.773091,0.691456,0.308544
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0.035158,0.964842,0.031447,0.968553
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,0.995797,0.004203,0.999429,0.000571
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0.028164,0.971836,0.675902,0.324098


In [8]:
stopwords_set = set(stopwords.words("english"))

In [9]:
## Number of words in the text ##
train_df["num_words"] = train_df["Description"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["Description"].apply(lambda x: len(str(x).split()))

In [10]:
## Number of unique words in the text ##
train_df["num_unique_words"] = train_df["Description"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["Description"].apply(lambda x: len(set(str(x).split())))

In [11]:
## Number of characters in the text ##
train_df["num_chars"] = train_df["Description"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["Description"].apply(lambda x: len(str(x)))

In [12]:
## Number of stopwords in the text ##
train_df["num_stopwords"] = train_df["Description"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords_set]))
test_df["num_stopwords"] = test_df["Description"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords_set]))

In [13]:
import string

train_df["num_punctuations_space"] = train_df['Description'].apply(lambda x: x.count(' ') )
test_df["num_punctuations_space"] = test_df['Description'].apply(lambda x: x.count(' ') )

for c in ['I', 'f']:
    train_df["num_char_{}".format(c)] = train_df['Description'].apply(lambda x: x.count(c) ) / train_df["num_chars"]
    test_df["num_char_{}".format(c)] = test_df['Description'].apply(lambda x: x.count(c) ) / test_df["num_chars"]

for c in ['!']:
    train_df["num_char_{}".format(c)] = train_df['Description'].apply(lambda x: x.count(c) ) / train_df["num_chars"]
    test_df["num_char_{}".format(c)] = test_df['Description'].apply(lambda x: x.count(c) ) / test_df["num_chars"]

In [14]:
train_df["num_stopwords_factor"] = train_df["num_stopwords"] / train_df["num_words"]
test_df["num_stopwords_factor"] = test_df["num_unique_words"] / test_df["num_words"]

In [15]:
train_df.tail()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,happy,not_happy,cnn_target_1,cnn_target_2,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations_space,num_char_I,num_char_f,num_char_!,num_stopwords_factor
38927,id49253,We arrived late at night and walked in to a ch...,Edge,Desktop,happy,0.580977,0.419023,0.972147,0.027853,140,94,769,72,138,0.003901,0.022107,0.0,0.514286
38928,id49254,The only positive impression is location and p...,InternetExplorer,Mobile,not happy,0.644477,0.355523,0.07008,0.92992,90,77,533,36,88,0.001876,0.013133,0.0,0.4
38929,id49255,Traveling with friends for shopping and a show...,Firefox,Mobile,not happy,0.024995,0.975005,0.014769,0.985231,207,127,1065,101,206,0.001878,0.010329,0.0,0.487923
38930,id49256,The experience was just ok. We paid extra for ...,Chrome,Desktop,not happy,0.300177,0.699823,0.23079,0.76921,102,70,519,54,101,0.001927,0.017341,0.0,0.529412
38931,id49257,The Westin is a wonderfully restored grande da...,Mozilla,Desktop,happy,0.903681,0.096319,0.938836,0.061164,20,19,115,9,19,0.0,0.017391,0.0,0.45


In [16]:
## join data
test_df['Is_Response'] = np.nan
alldata = pd.concat([train_df, test_df]).reset_index(drop=True)

In [17]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [18]:
### set target variable
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_df['Is_Response_idx'] = le.fit_transform(train_df['Is_Response'])

In [19]:
le.classes_

array(['happy', 'not happy'], dtype=object)

In [20]:
features = train_df.columns
features = features.drop(['User_ID', 'Description', 'Browser_Used', 'Device_Used', 'Is_Response', 'Is_Response_idx'])
features

Index(['happy', 'not_happy', 'cnn_target_1', 'cnn_target_2', 'num_words',
       'num_unique_words', 'num_chars', 'num_stopwords',
       'num_punctuations_space', 'num_char_I', 'num_char_f', 'num_char_!',
       'num_stopwords_factor'],
      dtype='object')

In [21]:
from sklearn.cross_validation import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(train_df[features].as_matrix(), 
                                                      train_df['Is_Response_idx'].as_matrix(),
                                                      test_size=0.2,
                                                      random_state=42)

print(train_X.shape)
print(valid_X.shape)                                           
print(train_y.shape)
print(valid_y.shape)

(31145, 13)
(7787, 13)
(31145,)
(7787,)




### CatBoost

Catboost is a new package recently launched by Yandex. It is said that it works well when the data has many categorical features. We'll use it on count data and see it our model improves.

In [22]:
## import library
from catboost import CatBoostClassifier,cv, Pool

In [23]:
## define the classifer model
model = CatBoostClassifier(
    learning_rate=0.03,
    rsm = 0.8,
    depth=8,
    iterations=1000,
    od_type = 'Iter',
    od_wait = 10,
    eval_metric='MultiClass',
    loss_function='MultiClass',
    verbose=True)

In [24]:
train_df['Is_Response_idx'].value_counts(normalize=True)

0    0.681213
1    0.318787
Name: Is_Response_idx, dtype: float64

In [25]:
## train model
model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

Borders for float features generated
0:	learn -0.6684038356	test -0.6705939148	bestTest -0.6705939148		total: 300ms	remaining: 5m
1:	learn -0.6446595915	test -0.649068948	bestTest -0.649068948		total: 556ms	remaining: 4m 37s
2:	learn -0.6224987217	test -0.6288363467	bestTest -0.6288363467		total: 789ms	remaining: 4m 22s
3:	learn -0.6018328939	test -0.6099485641	bestTest -0.6099485641		total: 1.03s	remaining: 4m 17s
4:	learn -0.5817629249	test -0.5918627825	bestTest -0.5918627825		total: 1.26s	remaining: 4m 9s
5:	learn -0.5626500693	test -0.5745381608	bestTest -0.5745381608		total: 1.48s	remaining: 4m 4s
6:	learn -0.5446423902	test -0.5582368101	bestTest -0.5582368101		total: 1.71s	remaining: 4m 2s
7:	learn -0.5274036223	test -0.542693474	bestTest -0.542693474		total: 1.93s	remaining: 3m 59s
8:	learn -0.510949066	test -0.5278331906	bestTest -0.5278331906		total: 2.18s	remaining: 3m 59s
9:	learn -0.4953893102	test -0.5138345463	bestTest -0.5138345463		total: 2.41s	remaining: 3m 58s
10:	l

84:	learn -0.1393818762	test -0.2170459355	bestTest -0.2170459355		total: 20.4s	remaining: 3m 39s
85:	learn -0.1383829809	test -0.2164640196	bestTest -0.2164640196		total: 20.6s	remaining: 3m 38s
86:	learn -0.13746902	test -0.2159271333	bestTest -0.2159271333		total: 20.8s	remaining: 3m 38s
87:	learn -0.1366702426	test -0.2153607284	bestTest -0.2153607284		total: 21.1s	remaining: 3m 38s
88:	learn -0.1358734035	test -0.2150077947	bestTest -0.2150077947		total: 21.4s	remaining: 3m 39s
89:	learn -0.1350334043	test -0.2146484951	bestTest -0.2146484951		total: 21.7s	remaining: 3m 39s
90:	learn -0.134162949	test -0.2140557018	bestTest -0.2140557018		total: 21.9s	remaining: 3m 38s
91:	learn -0.1334446532	test -0.2139785103	bestTest -0.2139785103		total: 22.2s	remaining: 3m 38s
92:	learn -0.132604074	test -0.2135955918	bestTest -0.2135955918		total: 22.4s	remaining: 3m 38s
93:	learn -0.1318520934	test -0.2132104267	bestTest -0.2132104267		total: 22.6s	remaining: 3m 38s
94:	learn -0.1311109141	

<catboost.core.CatBoostClassifier at 0x7fab476d5668>

In [26]:
feat_importances = model.get_feature_importance(valid_X, valid_y)
feat_df = pd.DataFrame({'features':features, 'importances':feat_importances})
feat_df.sort_values(by='importances')

Unnamed: 0,features,importances
11,num_char_!,0.409962
12,num_stopwords_factor,1.113114
9,num_char_I,1.129818
10,num_char_f,1.13502
6,num_chars,1.532528
5,num_unique_words,1.624964
4,num_words,1.960746
7,num_stopwords,2.283547
8,num_punctuations_space,2.379077
3,cnn_target_2,14.391201


In [27]:
y_pred = model.predict(train_df[features])

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(train_df['Is_Response_idx'].as_matrix(), y_pred)
# 0.93139319839720536
# 0.93157299907531077
# 0.95204459056817015

0.95201890475701223

In [29]:
## make predictions
preds = model.predict(test_df[features])
preds.flatten()

array([ 1.,  0.,  1., ...,  0.,  0.,  0.])

In [30]:
feat_df = pd.DataFrame()
feat_df['preds'] = preds.flatten()
feat_df['preds'].value_counts(normalize=True)

0.0    0.69681
1.0    0.30319
Name: preds, dtype: float64

In [31]:
test_predict = model.predict(test_df[features])

In [32]:
## make submission
sub5 = pd.DataFrame()
sub5['User_ID'] = test_df['User_ID']
sub5['Is_Response'] = test_predict
sub5['Is_Response'] = ['happy' if x < 0.499 else 'not_happy' for x in sub5['Is_Response']]

In [33]:
sub5

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,not_happy
3,id80135,not_happy
4,id80136,happy
5,id80137,happy
6,id80138,happy
7,id80139,not_happy
8,id80140,happy
9,id80141,happy


In [34]:
sub5.to_csv('submission.csv', index=False)