<img src="SENTIMENT-09-1.png" width="990" height="400">

# Import functions and data

In [1]:
import pandas as pd 
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('HotelData.csv')

In [3]:
data.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


# INFO about the data

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38932 entries, 0 to 38931
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User_ID       38932 non-null  object
 1   Description   38932 non-null  object
 2   Browser_Used  38932 non-null  object
 3   Device_Used   38932 non-null  object
 4   Is_Response   38932 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


# Check null values present or not in the data set

In [5]:
data.isnull().sum()

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

# Droping the unwanted Features

In [6]:
data.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [7]:
data.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


# Process the Review 

In [8]:
def process_Review(text):
    '''
    Input:
        text: a string containing a hotel review
    Output:
        text: words containing the processed text
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
   
    return text

In [9]:
data['cleaned_description'] = pd.DataFrame(data.Description.apply(process_Review))
data.head(10)

Unnamed: 0,Description,Is_Response,cleaned_description
0,The room was kind of clean but had a VERY stro...,not happy,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,not happy,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,not happy,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,happy,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,not happy,my girlfriends and i stayed here to celebrate ...
5,We had - rooms. One was very nice and clearly ...,happy,we had rooms one was very nice and clearly ha...
6,My husband and I have stayed in this hotel a f...,not happy,my husband and i have stayed in this hotel a f...
7,My wife & I stayed in this glorious city a whi...,happy,my wife i stayed in this glorious city a whil...
8,My boyfriend and I stayed at the Fairmont on a...,happy,my boyfriend and i stayed at the fairmont on a...
9,"Wonderful staff, great location, but it was de...",not happy,wonderful staff great location but it was defi...


# Independent and Dependent Variables

In [10]:
X = data.cleaned_description#Independent_variable 
y = data.Is_Response#Dependent_variable

# Split the data {Train and Test}

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 225)
print('X_train :', len(X_train))
print('X_test  :', len(X_test))
print('y_train :', len(y_train))
print('y_test  :', len(y_test))

X_train : 35038
X_test  : 3894
y_train : 35038
y_test  : 3894


# We use LogisticRegression for our model 
# Apply TfidfVectorizer to our data 

# Purpose of TFIDF is ti highlight words which are frequent in the document or line etc

In [12]:
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")

# Pipeline is created because to exicute it line by line 

In [13]:
model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

In [14]:
model.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [21]:
predictions = model.predict(X_test)

# confusion_matrix

In [22]:
confusion_matrix(predictions, y_test)

array([[2418,  306],
       [ 153, 1017]], dtype=int64)

# Accuracy of our model

In [24]:
print("Accuracy : ", accuracy_score(predictions, y_test))

Accuracy :  0.8821263482280431


# Precision of our model

In [25]:
print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))

Precision :  0.8888758956340842


# Recall of our model

In [26]:
print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))

Recall :  0.8821263482280431


# Predict with your own Review

In [27]:
example = ["i am not happy about this hotel"]
result = model.predict(example)
print(result)

['not happy']


In [28]:
import pickle
filename = 'ModelAI'
outfile = open(filename,'wb')
pickle.dump(model,outfile)
outfile.close()