In [3]:
#Importing the required Libraries
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [4]:
#Reading train.csv and test.csv files

train = pd.read_csv("/Volumes/Goldmine/Course/ML_beginners/train.csv")
test = pd.read_csv("/Volumes/Goldmine/Course/ML_beginners/test.csv")

In [5]:
#Looking at the different columns and the type of data in it

train.head(10)
train.columns

Index(['User_ID', 'Description', 'Browser_Used', 'Device_Used', 'Is_Response'], dtype='object')

In [6]:
#Defining a function that cleans the reviews so that it can be read easily

stops = set(stopwords.words("english"))
def cleanData(text, lowercase=False, remove_stops=False, stemming=False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [7]:
#Concatenate test and train data, adding Is_Response column in the test data

test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [8]:
#Clean the Description column by calling the CleanData function onit. 
#Map() helps select all the value in the Description Column without using the for loop. 
#Lambda defines a temporary function.

alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

#Note that the Description column now consists of only the key words of the reviews, with the stopwords removed fromthe data

alldata[:5]

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,room kind clean strong smell dog gener averag ...,Edge,Mobile,not happy
1,id10327,stay crown plaza april april staff friendli at...,Internet Explorer,Mobile,not happy
2,id10328,book hotel hotwir lowest price could find got ...,Mozilla,Tablet,not happy
3,id10329,stay husband son way alaska cruis love hotel g...,InternetExplorer,Desktop,happy
4,id10330,girlfriend stay celebr th birthday plan weeken...,Edge,Tablet,not happy


In [9]:
#CountVectorizer counts the frequency of specific words
#Tf-IDF Vectorizer uses weighted methods to study the weight and frequency of a particular word in context

countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)

bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

bagofwords

<68336x500 sparse matrix of type '<class 'numpy.int64'>'
	with 2856714 stored elements in Compressed Sparse Row format>

In [10]:
tfidfdata

<68336x500 sparse matrix of type '<class 'numpy.float64'>'
	with 2856714 stored elements in Compressed Sparse Row format>

In [11]:
#Converts the Browser and Device data into Numeric values

cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])

In [12]:
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())

tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.140913,0.000000,0.0,0.000000,0.000000,0.298070
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.092802,0.000000,0.122793,0.000000,0.0,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.101180,0.000000,0.0,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.196513,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.083280,0.000000,0.096701,0.146196,0.000000,0.0,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.133998,0.000000,0.088651,0.000000,0.0,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.125338,0.000000,0.000000,0.073343,0.000000,0.0,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.162737,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.201431
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.240847,0.000000,0.000000,0.000000,0.0,...,0.000000,0.202215,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [13]:
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [14]:
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

train_feats[:5]

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,room kind clean strong smell dog gener averag ...,1,1,not happy
1,id10327,stay crown plaza april april staff friendli at...,5,1,not happy
2,id10328,book hotel hotwir lowest price could find got ...,7,2,not happy
3,id10329,stay husband son way alaska cruis love hotel g...,6,0,happy
4,id10330,girlfriend stay celebr th birthday plan weeken...,1,2,not happy


In [15]:
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

In [17]:
#Using Naive Bayes model

mod1 = GaussianNB()
target = train_feats['Is_Response']

In [18]:
#Printing the Cross Validation Scores for Naive Bayes on bow_df

print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.77208526  0.76110968  0.76753147  0.76663242  0.77626509]


In [19]:
#Printing the Cross Validation Scores for Naive Bayes on tfid_df


print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.80906523  0.81518109  0.80901618  0.81312612  0.80349345]


In [20]:
#Training the Naive Bayes model with the train data

clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)

GaussianNB(priors=None)

In [21]:
#Prediciton the Is_Response on test data

preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)

In [22]:
#Converting Binary responses to Happy/ Not Happy responses

def to_labels(x):
    if x==1:
        return "happy"
    return "not_happy"

In [23]:
#Saving the responses into a new csv file

subm1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
subm1['Is_Response'] = subm1['Is_Response'].map(lambda x: to_labels(x))

subm2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
subm2['Is_Response'] = subm2['Is_Response'].map(lambda x: to_labels(x))

subm1 = subm1[['User_ID', 'Is_Response']]
subm2 = subm2[['User_ID', 'Is_Response']]

subm1.to_csv('subm1_cv.csv', index=False)
subm2.to_csv('subm2_tf.csv', index=False)