In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
from IPython.display import HTML
import base64
def create_download_link( df, title = "Download CSV file", filename = "data.csv"):
    """Build a clickable link that downloads `df` as a CSV file.

    The frame is serialised with ``df.to_csv()`` (index included), base64-encoded
    and embedded in a ``data:`` URI, so no file is written to disk.

    Parameters
    ----------
    df : object exposing ``to_csv()``
        The data to offer for download.
    title : str
        Link text.
    filename : str
        Name placed in the anchor's ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        The anchor element wrapped for notebook display.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor.format(payload=encoded_csv, title=title, filename=filename))

In [2]:
# The code was removed by DSX for sharing.

In [3]:
# Preview the first five rows of `train`
train.head(n=5)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
# Preview the first five rows of `test`
test.head(n=5)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [5]:
# function to clean data

# English stop words from NLTK, stored as a set; cleanData tests each token
# against it when called with remove_stops=True.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    """Reduce a piece of text to plain alphanumeric tokens.

    The input is converted with ``str()``, every character that is not an ASCII
    letter, digit or whitespace is deleted, and newlines become spaces. The
    optional steps run in the order lower-casing, stop-word removal, stemming;
    each of them re-joins the whitespace-split tokens with single spaces.

    Parameters
    ----------
    text : any
        Value to clean; it is passed through ``str()`` first.
    lowercase : bool
        Lower-case every token.
    remove_stops : bool
        Drop tokens found in the module-level ``stops`` set (exact, case-sensitive match).
    stemming : bool
        Replace every token with its ``PorterStemmer`` stem.

    Returns
    -------
    str
        The cleaned text.
    """
    only_alnum = re.sub(r'[^A-Za-z0-9\s]', r'', str(text))
    cleaned = re.sub(r'\n', r' ', only_alnum)

    if lowercase:
        cleaned = " ".join(token.lower() for token in cleaned.split())

    if remove_stops:
        kept = filter(lambda token: token not in stops, cleaned.split())
        cleaned = " ".join(kept)

    if stemming:
        stem = PorterStemmer().stem
        cleaned = " ".join(map(stem, cleaned.split()))

    return cleaned

In [6]:
## join data
# Give `test` an empty label column so both frames share the same columns, then
# stack train on top of test under a fresh 0..n-1 index.
test['Is_Response'] = np.nan
alldata = pd.concat([train, test], ignore_index=True)

In [7]:
# clean description
# Every description is lower-cased, stripped of stop words and stemmed; the
# cleaned text overwrites the original column.
alldata['Description'] = alldata['Description'].apply(
    cleanData, lowercase=True, remove_stops=True, stemming=True
)

In [8]:
# initialise the vectorizers - we'll create separate models for each type.
# Both share one configuration: single-word tokens, min_df=150 and at most
# 500 features.
vectorizer_options = dict(analyzer='word', ngram_range=(1, 1), min_df=150, max_features=500)
countvec = CountVectorizer(**vectorizer_options)
tfidfvec = TfidfVectorizer(**vectorizer_options)

In [9]:
# create features
# Both vectorizers are fitted on the combined train + test descriptions.
descriptions = alldata['Description']
bagofwords = countvec.fit_transform(descriptions)
tfidfdata = tfidfvec.fit_transform(descriptions)

In [10]:
# label encode categorical features in data given
# Each column gets its own freshly fitted LabelEncoder and is overwritten with
# the resulting integer codes.
# NOTE(review): the head() previews show 'Internet Explorer', 'InternetExplorer'
# and 'IE' as separate Browser_Used values; each gets its own code here — confirm
# whether they should be merged first.
cols = ['Browser_Used', 'Device_Used']

for col_name in cols:
    alldata[col_name] = LabelEncoder().fit_transform(alldata[col_name])

In [11]:
# create dataframe for features
# Expand the vectorizer outputs into dense arrays and wrap them in DataFrames
# (integer column labels, one row per description).
bow_df = pd.DataFrame(bagofwords.toarray())
tfidf_df = pd.DataFrame(tfidfdata.toarray())

In [12]:
# set column names
# Turn the integer column labels into strings of the form 'col0', 'col1', ...
bow_df = bow_df.add_prefix('col')
tfidf_df = tfidf_df.add_prefix('col')

In [13]:
# create separate data frame for bag of words and tf-idf
# `alldata` was built as train followed by test, so the first len(train) rows of
# each feature frame belong to train and the remainder to test.
n_train = len(train)

bow_df_train, bow_df_test = bow_df.iloc[:n_train], bow_df.iloc[n_train:]

tfid_df_train, tfid_df_test = tfidf_df.iloc[:n_train], tfidf_df.iloc[n_train:]

In [14]:
# split the merged data file into train and test respectively
# Rows with a label are training rows; rows whose label is missing came from test.
has_label = alldata['Is_Response'].notna()
train_feats = alldata[has_label]
test_feats = alldata[~has_label]

In [15]:
### set target variable

# Encode the label as 1 for 'happy' and 0 for everything else.
# `train_feats` is a boolean-mask selection from `alldata`, so assigning a column
# on it in place raises pandas' SettingWithCopyWarning; `assign` returns a new
# frame with the replaced column instead.
# NOTE: not idempotent — running this cell again after the labels are already
# 0/1 turns every label into 0.
train_feats = train_feats.assign(
    Is_Response=(train_feats['Is_Response'] == 'happy').astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [16]:
# merge count (bag of word) features into train
# The encoded categorical columns are placed side by side with the count
# features (aligned on the row index); the test matrix is then renumbered from 0.
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis=1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1).reset_index(drop=True)

In [17]:
# merge into a new data frame with tf-idf features
# Same layout as the count-feature matrices, with tf-idf weights instead of
# counts. The test matrix keeps its original row index here.
train_categoricals = train_feats[cols]
test_categoricals = test_feats[cols]
train_feats2 = pd.concat([train_categoricals, tfid_df_train], axis="columns")
test_feats2 = pd.concat([test_categoricals, tfid_df_test], axis="columns")

### LightGBM - 1
We prefer LightGBM over XGBoost because of its speed. In this model, we'll use the count features for model training.

In [18]:
import lightgbm as lgb

In [20]:
# set the data in format lgb accepts
# The 0/1 label column is the target; the count-feature matrix is the input.
target = train_feats.Is_Response
d_train = lgb.Dataset(data=train_feats1, label=target)

In [21]:
## set parameters
## you can tune these parameters to try for a better score

params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error',
    learning_rate=0.05,
    max_depth=7,
    num_leaves=21,
    feature_fraction=0.3,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [22]:
# 5-fold stratified, shuffled cross-validation for up to 500 rounds; progress is
# reported every 20 rounds and early stopping is set to 40 rounds.
# NOTE(review): newer LightGBM releases may expect logging / early stopping to
# be passed as callbacks — confirm against the installed version.
lgb_cv = lgb.cv(
    params,
    d_train,
    num_boost_round=500,
    nfold=5,
    stratified=True,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)

[20]	cv_agg's binary_error: 0.201428 + 0.00352956
[40]	cv_agg's binary_error: 0.182806 + 0.00412602
[60]	cv_agg's binary_error: 0.169398 + 0.00377292
[80]	cv_agg's binary_error: 0.161204 + 0.00392051
[100]	cv_agg's binary_error: 0.153293 + 0.00435082
[120]	cv_agg's binary_error: 0.148926 + 0.00346682
[140]	cv_agg's binary_error: 0.144945 + 0.00276935
[160]	cv_agg's binary_error: 0.141169 + 0.00268201
[180]	cv_agg's binary_error: 0.138164 + 0.00235008
[200]	cv_agg's binary_error: 0.136314 + 0.00286455
[220]	cv_agg's binary_error: 0.135158 + 0.00272845
[240]	cv_agg's binary_error: 0.133335 + 0.00271133
[260]	cv_agg's binary_error: 0.132513 + 0.00308742
[280]	cv_agg's binary_error: 0.131717 + 0.00342503
[300]	cv_agg's binary_error: 0.131049 + 0.00325847
[320]	cv_agg's binary_error: 0.13015 + 0.00330351
[340]	cv_agg's binary_error: 0.129353 + 0.00308661
[360]	cv_agg's binary_error: 0.128711 + 0.00314491
[380]	cv_agg's binary_error: 0.127992 + 0.00331684
[400]	cv_agg's binary_error: 0.12742

In [23]:
## get nround value which had the lowest error
# `lgb_cv['binary_error-mean']` holds one mean CV error per boosting round, with
# position 0 being the score after the first round. The position of the minimum
# is therefore 0-based and has to be shifted by one to become the number of
# rounds passed to `num_boost_round` (otherwise the final model is one tree
# short, or has zero trees when the first round is the best).
cv_errors = lgb_cv['binary_error-mean']
nround = int(np.argmin(cv_errors)) + 1

In [24]:
## train the model
# Fit on the full training set for the number of rounds chosen from the CV results.
model = lgb.train(params=params, train_set=d_train, num_boost_round=nround)

In [25]:
## make predictions
# Score the count-feature test matrix; the scores are thresholded into labels
# in the submission cell.
preds = model.predict(data=test_feats1)

In [26]:
# make submission

def to_labels(x, cutoff=0.66):
    """Convert one predicted score into a submission label.

    Parameters
    ----------
    x : float
        Predicted score for a single row.
    cutoff : float, optional
        Scores strictly greater than this value are labelled "happy". Defaults
        to 0.66; change it to see whether accuracy improves, or plot an AUC curve.

    Returns
    -------
    str
        "happy" when ``x > cutoff``, otherwise "not_happy".

    NOTE(review): train.head() shows the negative class spelled 'not happy' (with
    a space) while this returns 'not_happy' — confirm which spelling the
    submission format expects.
    """
    return "happy" if x > cutoff else "not_happy"

# Pair every test User_ID with the label derived from its score and write the
# two columns to disk without the row index.
sub3 = pd.DataFrame({
    'User_ID': test.User_ID,
    'Is_Response': [to_labels(score) for score in preds],
})[['User_ID', 'Is_Response']]
sub3.to_csv('sub3_lgb.csv', index=False) # 0.85518

### LightGBM - 2
In this model, we'll use tf-idf features for model training.

In [27]:
# set data format
# Same target as before, now paired with the tf-idf feature matrix. This
# rebinds `d_train`, replacing the count-feature dataset.
d_train = lgb.Dataset(data=train_feats2, label=target)

In [28]:
# Same settings as the count-feature model except for a smaller max_depth
# (5 instead of 7) and fewer num_leaves (11 instead of 21).
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "binary_error",
    "learning_rate": 0.05,
    "max_depth": 5,
    "num_leaves": 11,
    "feature_fraction": 0.3,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
}

In [29]:
## do cross validation to find nround i.e. at this round (iteration) we can expect lowest error
# 5-fold stratified, shuffled CV for up to 500 rounds, logging every 20 rounds
# with early stopping set to 40 rounds.
# NOTE(review): newer LightGBM releases may expect logging / early stopping to
# be passed as callbacks — confirm against the installed version.
lgb_cv = lgb.cv(
    params,
    d_train,
    num_boost_round=500,
    nfold=5,
    stratified=True,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)

[20]	cv_agg's binary_error: 0.213963 + 0.00558981
[40]	cv_agg's binary_error: 0.195135 + 0.00447732
[60]	cv_agg's binary_error: 0.183962 + 0.00418526
[80]	cv_agg's binary_error: 0.173096 + 0.00356095
[100]	cv_agg's binary_error: 0.165699 + 0.00379223
[120]	cv_agg's binary_error: 0.158712 + 0.00354285
[140]	cv_agg's binary_error: 0.153395 + 0.0037968
[160]	cv_agg's binary_error: 0.14944 + 0.00413244
[180]	cv_agg's binary_error: 0.146101 + 0.00384592
[200]	cv_agg's binary_error: 0.143147 + 0.0045488
[220]	cv_agg's binary_error: 0.141683 + 0.00415373
[240]	cv_agg's binary_error: 0.139756 + 0.00407407
[260]	cv_agg's binary_error: 0.13801 + 0.00354434
[280]	cv_agg's binary_error: 0.136597 + 0.00282834
[300]	cv_agg's binary_error: 0.135955 + 0.00247294
[320]	cv_agg's binary_error: 0.134748 + 0.00266095
[340]	cv_agg's binary_error: 0.133232 + 0.00264697
[360]	cv_agg's binary_error: 0.132384 + 0.00294474
[380]	cv_agg's binary_error: 0.13223 + 0.00240917
[400]	cv_agg's binary_error: 0.131152 + 

In [30]:
# get nround value
# The list under 'binary_error-mean' has one mean CV error per boosting round,
# starting with the score after round 1 at position 0. The 0-based position of
# the minimum is shifted by one so that it is a round count suitable for
# `num_boost_round` (otherwise the final model is trained one round short).
cv_errors = lgb_cv['binary_error-mean']
nround = int(np.argmin(cv_errors)) + 1

In [31]:
# train model
# Fit on the full tf-idf training set for the chosen number of rounds; this
# rebinds `model`, replacing the count-feature model.
model = lgb.train(params=params, train_set=d_train, num_boost_round=nround)

In [32]:
# make prediction
# Score the tf-idf test matrix; this rebinds `preds`, replacing the
# count-feature predictions.
preds = model.predict(data=test_feats2)

In [33]:
# make submission

def to_labels(x, cutoff=0.66):
    """Convert one predicted score into a submission label.

    Parameters
    ----------
    x : float
        Predicted score for a single row.
    cutoff : float, optional
        Scores strictly greater than this value are labelled "happy".
        Defaults to 0.66.

    Returns
    -------
    str
        "happy" when ``x > cutoff``, otherwise "not_happy".

    NOTE(review): train.head() shows the negative class spelled 'not happy' (with
    a space) while this returns 'not_happy' — confirm which spelling the
    submission format expects.
    """
    return "happy" if x > cutoff else "not_happy"

# Pair every test User_ID with the label derived from its tf-idf model score and
# write the two columns to disk without the row index.
sub4 = pd.DataFrame({
    'User_ID': test.User_ID,
    'Is_Response': [to_labels(score) for score in preds],
})[['User_ID', 'Is_Response']]
sub4.to_csv('sub4_lgb.csv', index=False) # 0.84925

In [36]:
# Offer the tf-idf submission as a browser download.
# create_download_link serialises with df.to_csv(), which also writes the row
# index. Promoting User_ID to the index keeps the download to the User_ID and
# Is_Response columns, the same layout as the file written with index=False,
# and the link gets the submission's file name instead of the default data.csv.
create_download_link(sub4.set_index('User_ID'), filename='sub4_lgb.csv')