In [1]:
import pandas as pd

In [2]:
# Load the training data from the data directory (path is relative to the
# notebook's working directory).
train = pd.read_csv("./data/train.csv")

In [3]:
# Preview the first rows to see which columns are available.
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
from nltk.corpus import stopwords

In [5]:
# Frequency of each raw browser label, with missing values counted as well.
train["Browser_Used"].value_counts(dropna=False)

Firefox              7367
Edge                 7134
Google Chrome        4659
InternetExplorer     4588
Mozilla Firefox      4328
Mozilla              3092
Chrome               2470
IE                   2439
Internet Explorer    2103
Safari                390
Opera                 362
Name: Browser_Used, dtype: int64

In [6]:
# Data-collection problem: the same browser is recorded under several
# different spellings (e.g. "IE", "Internet Explorer", "InternetExplorer"),
# so those labels need to be merged before any per-browser analysis.
browser_counts = train["Browser_Used"].value_counts()
print("data-type :", type(browser_counts))

# The index of the value_counts result holds the distinct browser labels.
browsers = browser_counts.index
print("data-type of browsers :", type(browsers))

print("Name of Browsers :")
for browser_label in browsers:
    print(browser_label)

data-type : <class 'pandas.core.series.Series'>
data-type of browsers : <class 'pandas.core.indexes.base.Index'>
Name of Browsers :
Firefox
Edge
Google Chrome
InternetExplorer
Mozilla Firefox
Mozilla
Chrome
IE
Internet Explorer
Safari
Opera


In [7]:
# Canonical name for every alternative spelling seen in Browser_Used.
# Labels that are not listed here are already canonical and stay unchanged.
BROWSER_ALIASES = {
    'Mozilla Firefox': 'Firefox',
    'Mozilla': 'Firefox',
    'Internet Explorer': 'InternetExplorer',
    'IE': 'InternetExplorer',
    'Google Chrome': 'Chrome',
}


def normalize_browser_names(browser_column):
    """Collapse duplicate browser spellings into one canonical label each.

    Whole values are matched against BROWSER_ALIASES, so the result does not
    depend on the order of the replacements or on regular-expression
    handling, unlike a chain of substring ``str.replace`` calls.

    Parameters
    ----------
    browser_column : pandas.Series
        Raw browser labels.

    Returns
    -------
    pandas.Series
        New series with aliases replaced; other values are returned as-is.
    """
    return browser_column.replace(BROWSER_ALIASES)


train["Browser_Used"] = normalize_browser_names(train["Browser_Used"])
print("browsers managed...........")

browsers managed...........


In [8]:
# Re-check the label frequencies after merging the duplicate browser names.
train["Browser_Used"].value_counts(dropna=False)

Firefox             14787
InternetExplorer     9130
Edge                 7134
Chrome               7129
Safari                390
Opera                 362
Name: Browser_Used, dtype: int64

In [9]:
# Class balance of the target column, with missing values counted as well.
train["Is_Response"].value_counts(dropna=False)

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [10]:
# Number of reviews per (browser, response) combination.
train.groupby(by=["Browser_Used", "Is_Response"])["Is_Response"].count()

Browser_Used      Is_Response
Chrome            happy          6138
                  not happy       991
Edge              happy          3218
                  not happy      3916
Firefox           happy          8729
                  not happy      6058
InternetExplorer  happy          7911
                  not happy      1219
Opera             happy           241
                  not happy       121
Safari            happy           284
                  not happy       106
Name: Is_Response, dtype: int64

In [11]:
# Number of reviews per (device, response) combination.
train.groupby(by=["Device_Used", "Is_Response"])["Is_Response"].count()

Device_Used  Is_Response
Desktop      happy          10595
             not happy       4431
Mobile       happy          10602
             not happy       4374
Tablet       happy           5324
             not happy       3606
Name: Is_Response, dtype: int64

In [12]:
# Number of reviews per (browser, device, response) combination.
train.groupby(by=["Browser_Used", "Device_Used", "Is_Response"])["Is_Response"].count()

Browser_Used      Device_Used  Is_Response
Chrome            Desktop      happy          2436
                               not happy       372
                  Mobile       happy          2450
                               not happy       332
                  Tablet       happy          1252
                               not happy       287
Edge              Desktop      happy          1310
                               not happy      1397
                  Mobile       happy          1243
                               not happy      1394
                  Tablet       happy           665
                               not happy      1125
Firefox           Desktop      happy          3490
                               not happy      2164
                  Mobile       happy          3564
                               not happy      2117
                  Tablet       happy          1675
                               not happy      1777
InternetExplorer  Desktop      happy   

In [13]:
#using only the comments to train the model
df = train.drop(['Device_Used','Browser_Used'],axis=1)
df.head()

Unnamed: 0,User_ID,Description,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,not happy
2,id10328,I booked this hotel through Hotwire at the low...,not happy
3,id10329,Stayed here with husband and sons on the way t...,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,not happy


In [14]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [15]:
# Cache for the English stop-word set; filled on first use so the NLTK corpus
# is read once instead of once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, building it only on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Convert one raw review into a space-separated string of cleaned words.

    Steps, in order:
      1. Parse the text with BeautifulSoup and keep only its text content.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop words found in NLTK's English stop-word list.

    Parameters
    ----------
    raw_review : str
        The review text as stored in the data set.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if no
        word survives the filtering).
    """
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)

In [16]:
# Sanity check: clean the review stored under index label 0 and inspect it.
first_description = train.loc[0, "Description"]
clean_review = review_to_words(first_description)
print(clean_review)

room kind clean strong smell dogs generally average ok overnight stay fussy would consider staying price right breakfast free better nothing


In [17]:
# Clean every training review.
# The column is iterated directly (row order, by position) rather than looked
# up with df["Description"][i], which is a label lookup and only works when
# the index is exactly 0..n-1.
num_reviews = df["Description"].size
clean_train_reviews = [review_to_words(text) for text in df["Description"]]

In [18]:
print("Creating bag of words model")
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag of words; no extra tokenizer, preprocessor or stop-word list
# is supplied, and the vocabulary is limited via max_features.
vectorizer_options = dict(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1000,
)
vectorizer = CountVectorizer(**vectorizer_options)

# Learn the vocabulary from the cleaned training reviews, then convert the
# result of fit_transform into a dense array.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

Creating bag of words model


In [19]:
# Shape of the training feature array produced by the vectorizer.
print(train_data_features.shape)

(38932, 1000)


In [20]:
# Vocabulary terms, in the same order as the feature columns.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on versions that lack it.
# .tolist() turns the returned array into a plain list of Python strings so
# the printed output looks the same as before.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab[0:20])

['able', 'absolutely', 'ac', 'access', 'accommodating', 'across', 'actually', 'added', 'addition', 'additional', 'adequate', 'afternoon', 'ago', 'air', 'airport', 'allowed', 'almost', 'along', 'already', 'also']


In [21]:
import numpy as np
# Column-wise totals: how often each vocabulary term occurs over all
# training reviews.
dist = train_data_features.sum(axis=0)

# Print every term next to its total count.
for term, occurrences in zip(vocab, dist):
    print(occurrences, term)

2612 able
1388 absolutely
627 ac
2568 access
871 accommodating
3302 across
1993 actually
614 added
523 addition
541 additional
1003 adequate
816 afternoon
692 ago
2256 air
3705 airport
602 allowed
1552 almost
1220 along
1093 already
12001 also
3361 although
4586 always
2428 amazing
1988 amenities
551 amount
618 anniversary
4092 another
552 antonio
1733 anyone
2264 anything
798 anyway
1175 anywhere
1078 appointed
679 appreciated
10491 area
1387 areas
6794 around
1489 arrival
3805 arrived
540 art
2196 ask
3960 asked
585 asking
1021 ate
777 atmosphere
600 attention
853 attentive
539 attitude
1302 attractions
3042 available
896 ave
1034 avenue
1527 average
679 avoid
5787 away
1238 awesome
9427 back
3322 bad
577 bag
551 bagels
1296 bags
813 balcony
6304 bar
1022 bars
810 based
988 basic
1583 bath
8992 bathroom
1428 bathrooms
966 bay
932 beach
851 beat
3254 beautiful
11004 bed
649 bedding
1748 bedroom
5625 beds
515 beer
838 behind
1014 believe
739 bell
5329 best
4867 better
724 beyond
4146 b

In [22]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest, and therefore the predictions,
# reproducible from one run to the next.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Labels are read from `train` rather than `df`: `df` is reassigned to the
# test frame (which has no Is_Response column) further down, so re-running
# this cell after the notebook has executed would otherwise raise a KeyError.
forest = forest.fit(train_data_features, train['Is_Response'])

In [23]:
# Load the test data from the data directory and preview the first rows.
test = pd.read_csv('./data/test.csv')
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [24]:
# Same column selection as for training: drop the browser and device columns.
# Note that this rebinds `df`, which previously held the training frame.
df = test.drop(columns=["Browser_Used", "Device_Used"])
df.head()

Unnamed: 0,User_ID,Description
0,id80132,Looking for a motel in close proximity to TV t...
1,id80133,Walking distance to Madison Square Garden and ...
2,id80134,Visited Seattle on business. Spent - nights in...
3,id80135,This hotel location is excellent and the rooms...
4,id80136,This hotel is awesome I love the service Antho...


In [25]:
num_reviews = len(test.Description)

# Clean every test review. Iterating the column directly walks the rows by
# position, so it does not depend on the index labels being exactly 0..n-1
# (which test["Description"][i] requires).
clean_test_reviews = [review_to_words(text) for text in test["Description"]]

# transform (not fit_transform): reuse the vocabulary already learned by the
# vectorizer so the test columns line up with the features the forest saw.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# One predicted label per test review, in row order.
result = forest.predict(test_data_features)

In [28]:
# Start the submission frame from the test data without the text, browser and
# device columns.
columns_to_remove = ["Description", "Browser_Used", "Device_Used"]
output1 = test.drop(columns=columns_to_remove)
output1.head()

Unnamed: 0,User_ID
0,id80132
1,id80133
2,id80134
3,id80135
4,id80136


In [29]:
# Attach the predictions by position. Wrapping `result` in pd.Series would
# give it a fresh 0..n-1 index and align it to output1 by label, which
# silently produces NaN for any row whose index label is outside that range.
# Assigning the array directly is positional and raises if the lengths differ.
output1['Is_Response'] = result
output1.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not happy
1,id80133,happy
2,id80134,happy
3,id80135,happy
4,id80136,happy


In [32]:
import csv
import os

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
os.makedirs('./output', exist_ok=True)

# csv.QUOTE_NONE is the named constant for the value 3 used previously:
# fields are written without any quoting.
output1.to_csv('./output/submit.csv', index=False, quoting=csv.QUOTE_NONE)