In [1]:
import pandas as pd             #imports the pandas library and renames it as "pd".
import matplotlib.pyplot as plt #imports the matplotlib library and renames the "pyplot" module as "plt"
import seaborn as sns           #imports the seaborn library, which is a data visualization library that provides an interface for creating attractive and informative statistical graphics.
import re                       #regular expression library that provides powerful tools for pattern matching and string manipulation.

In [2]:
import nltk                    #nltk library provides tools and resources for working with human language data.
#nltk.download()                #This function opens a GUI (graphical user interface) that allows the user to select which data and resources they want to download from the nltk library.
from nltk.corpus import stopwords  # imports the stopwords corpus from the nltk library. The stopwords corpus is a collection of common stopwords for different languages that can be used to remove these words from text data.
from nltk.stem import WordNetLemmatizer  # Lemmatization is the process of reducing a word to its base or root form, which can be useful for reducing the number of unique words in a text corpus. 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer #CountVectorizer is a method for converting text data into a matrix of token counts, which is a common way of representing text data in machine learning applications.
from sklearn.model_selection import GridSearchCV            #GridSearchCV is a method for tuning hyperparameters of a machine learning model using a grid search over a specified parameter space.
from sklearn.ensemble import RandomForestClassifier         #imports the RandomForestClassifier class from the sklearn.ensemble module. RandomForestClassifier is a type of ensemble learning algorithm that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control overfitting.

In [4]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report 

# imports several metrics classes from the sklearn.metrics module.
#These classes are used to evaluate the performance of machine learning models.
#BRIEF DESCRIPTION :--
#accuracy score: Returns the classification score for accuracy.
#precision score: Returns the precision score, which is the percentage of true positive predictions among all positive predictions.
#recall score: Returns the recall score, which is the percentage of true positive predictions among all actual positives.
#confusion matrix: Creates a table that summarises the number of true positives, true negatives, false positives, and false negatives.
#roc curve: This function computes the receiver operating characteristic (ROC) curve, which is a plot of the true positive rate versus the false positive rate for various classification thresholds.
#classification report: Generates a text summary of a classification problem's precision, recall, and F1 score for each class.

In [5]:
#loading the dataset--
df=pd.read_csv(r"C:\Users\milly\AISC_1006\final_dataset_for_training_sentiment_analysis.csv")

In [6]:
print("Shape of the DataFrame:",df.shape) 
df.head(10)                               
#df.head() helps in printing the first few rows of the dataframe

Shape of the DataFrame: (10000, 3)


Unnamed: 0,text,output,reviews_title
0,"This hotel was nice and quiet. Did not know, t...",3.0,Best Western Plus Hotel
1,"quite good, quite sufficient, quite well",4.0,Clean rooms at solid rates in the heart of Carmel
2,We stayed in the king suite with the separatio...,3.0,Business
3,"Parking was horrible, somebody ran into my ren...",5.0,Very good
4,Not cheap but excellent location. Price is som...,2.0,Low chance to come back here
5,If you get the room that they advertised on th...,4.0,Loved staying here
6,"This is such a fun, lovely hotel. The attentio...",1.0,Does not live up to its reputation
7,We recently stayed at this hotel on a trip to ...,1.0,worst customer service ever
8,"I reserved a room a week in advance, knowing a...",5.0,Location Location Location
9,MoreMore,1.0,The worst place i've booked


In [7]:
df = df.dropna() #dropping the rows which has null values in rows

In [8]:
#After visualising the dataframe, we can see that numbers in 'output' column represents the rating, so converting them into sentiment using replace() func. 
df['output'].replace([1,2,3,4,5], ['negative','negative','neutral','positive','positive'], inplace=True) 
#This can make the data more interpretable and easier to work with in subsequent analysis.

In [9]:
# Encoding/ or replacing again as the machine can take numbers and not words. 
#Encoding with respect to our needs and for easy visualisation. 
df['output'].replace(['positive','neutral','negative'], [ 1, 2, 3],inplace= True)

In [10]:
#creating an object of the WordNetLemmatizer class, which can be used to perform lemmatization on words in text data.
lm = WordNetLemmatizer() 

In [11]:
'''
# FIRST LINE MEANING IN FOR LOOP:-
This is a regular expression operation in Python using the re module. It replaces all non-alphabetic characters in a given string (str(item)) with a space character.
Here is what each component of the operation does:
re is a built-in module in Python that provides support for regular expressions.
sub is a function in the re module that substitutes one or more occurrences of a pattern in a string with another string.
[^a-zA-Z] is a regular expression pattern that matches any character that is not an uppercase or lowercase letter from A to Z. The ^ symbol inside the square brackets means negation, i.e., it matches anything except for the letters A to Z (both lowercase and uppercase).
' ' is the string that is used as a replacement for the matched pattern. In this case, all non-alphabetic characters are replaced with a space character.
So, the operation re.sub('[^a-zA-Z]',' ',str(item)) effectively removes all non-alphabetic characters from str(item) and replaces them with spaces.

# FOURTH LINE MEANING IN FOR LOOP:-
The if statement filters out any words that are stopwords in English.
The stopwords module from the nltk library is used to create a set of stopwords, which are words that are common in text but usually do not carry much meaning (e.g., "a", "an", "the").
'''

def text_transformation(df_col):
    corpus = []  #A new empty list, corpus, is created to store the transformed text data.
    for review in df_col:
        new_review = re.sub('[^a-zA-Z]',' ',str(review))
        new_review = new_review.lower() #transformed text data is then converted to lowercase 
        new_review = new_review.split() #transformed text data is then split into individual words
        new_review = [lm.lemmatize(word) for word in new_review if word not in set(stopwords.words('english'))] 
        corpus.append(' '.join(str(x) for x in new_review)) #The lemmatized and filtered words are then joined back into a single string using the join() method, with a space as the separator. The resulting string is added to the corpus list using the append() method.
    return corpus  #returns the list of transformed text data, corpus.

In [12]:
corpus = text_transformation(df['reviews_title']) #performing text preprocessing on DataFrame column named 'reviews_title' by applying the function text_transformation() which is defined in the above code.

In [13]:
corpus  #printing corpus to see the transformed data after doing text_transformation()

['best western plus hotel',
 'clean room solid rate heart carmel',
 'business',
 'good',
 'low chance come back',
 'loved staying',
 'live reputation',
 'worst customer service ever',
 'location location location',
 'worst place booked',
 'beautiful hotel',
 'good location',
 'construction extremely accomodating',
 'hot water bad food',
 'excellent hotel',
 'historic uncomfortable',
 'accomations',
 'great room great location',
 'good location clean poor service restaurant',
 'great value good location',
 'excellent',
 'good hotel good location',
 'beautiful hotel',
 'nice reasonable',
 'great hotel fantastic service',
 'whitney',
 'horrible closet like room air conditioning',
 'second chance stay',
 'non recommanded stay',
 'nice visit nola',
 'valet parking stole car',
 'disappointing experience',
 'best weekend ever',
 'great hotel experience',
 'great service staff hotel need update',
 'convenient clean quiet',
 'beauty best',
 'new orleans hospitality finest',
 'good location',
 '

In [14]:
cv = CountVectorizer(ngram_range=(1,2))
#The CountVectorizer constructor takes several parameters, including the ngram_range parameter which specifies the range of n-grams to be used for tokenization. In this case, ngram_range=(1,2) means that both single words (unigrams) and pairs of words (bigrams) will be used as tokens.
#A unigram is a single word, and the simplest type of n-gram. For example, in the sentence "The cat is on the mat", the unigrams are "The", "cat", "is", "on", "the", and "mat".
#A bigram is a sequence of two consecutive words. For example, in the same sentence "The cat is on the mat", the bigrams are "The cat", "cat is", "is on", "on the", and "the mat".
traindata = cv.fit_transform(corpus) #fitting and transforming the data to create a sparse matrix of token counts, where each row represents a document and each column represents a unique token.
#Since the matrix is likely to have many zero values (due to the fact that most words in a given text are not repeated), it is represented as a sparse matrix for efficiency.#
X = traindata  #The resulting matrix is stored in the traindata variable
y = df.output  #y is assigned the 'output' column.
cv
#Beyond its construction and application to the data, the CountVectorizer object cv is essentially discarded after the traindata variable is created. It can, however, be useful for transforming new text data with the same vocabulary of tokens learned from the training data.

CountVectorizer(ngram_range=(1, 2))

In [15]:
#Train Test Split

from sklearn.model_selection import train_test_split #module is used to randomly split the dataset into two sets - one for training and the other for testing the model's performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0) #0.20 means that 20% of the dataset is used for testing the model and the remaining 80% is used for training the model.
#random state: for generating the train and test indices at random. Setting this to a fixed value ensures that the same train-test split is obtained each time the code is run.

In [16]:
from sklearn import metrics
import numpy as np
import itertools
import matplotlib.pyplot as plt

In [17]:
# train a random forest classifier
from sklearn.ensemble import RandomForestClassifier # imports the RandomForestClassifier class from the ensemble module of scikit-learn.
rf = RandomForestClassifier(bootstrap= False,
 max_depth= None,
 max_features= 'auto',
 min_samples_leaf= 2,
 min_samples_split= 5,
 n_estimators= 500,
 random_state = 42) #(n_estimators) parameter sets the number of decision trees in the random forest. In this case, we are using 100 decision trees.
rf.fit(X_train, y_train) #fits the random forest classifier model to the training data.

RandomForestClassifier(bootstrap=False, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=500, random_state=42)

In [18]:
y_pred=rf.predict(X_test) #again predicting 

In [19]:
score = metrics.accuracy_score(y_test, y_pred) #calculating the accuracy, meaning explained in above cells. 
print("accuracy:   %0.3f" % score)

accuracy:   0.801


In [20]:
def expression_check(prediction_input):
    if prediction_input == 1:
        print("Input statement has Positive Sentiment.")
    elif prediction_input == 2:
        print("Input statement has Neutral Sentiment.")
    else:
        print("negative.")
#This function appears to be checking the value of prediction_input to determine if it represents a positive, neutral, or negative sentiment.
#The output of the function is a string message that indicates the sentiment of the input statement.        

In [21]:
# function to take the input statement and perform the same transformations we did earlier
def sentiment_predictor(input):
    input = text_transformation(input) #input text is passed to the function named as text_transformation 
    print(input) 
    print(cv)
    transformed_input = cv.transform(input) #transforming input text into a matrix of token counts
    print(transformed_input)  
    prediction = rf.predict(transformed_input) #storing the predicted values in 'prediction'
    
    #expression_check(prediction) #calling the function to print the final sentiment. 

In [22]:

# Load dataset into a pandas dataframe
df_pred = pd.read_csv(r"C:\Users\milly\Downloads\hotel_reviewsToronto_2023-03-21 00_03_14.csv")
df_pred
# # Create an empty column named "prediction"
# df_pred["prediction"] = ""

# # Loop through each row of the dataframe
# for index, row in df_pred.iterrows():
#     # Apply the "sentiment_predictor" function to the "user_review" column of that row
#     prediction = sentiment_predictor(row["user_review"])
#     # Store the predicted sentiment value in the "prediction" column of that row
#     df_pred.loc[index, "prediction"] = prediction


Unnamed: 0,hotelId,user_review,creation_time,user_id,user_profile_url,user_name,business_id,rating
0,AP1FGmgvrHsAeSyZNQ_dgg,I must have the best friends because they reco...,2021-10-12 14:10:30,qTqtyUbfaL2uZYrk2L0_gw,https://www.yelp.com/user_details?userid=qTqty...,Charles Y.,dMGT_S059U8hzMWxdf90SQ,5.0
1,QPu24FKRWPquScIFxBlkqw,Oh how I've missed this place! This is hands d...,2020-08-01 13:20:02,Hwz-EhpzkEw15zaJurBrWA,https://www.yelp.com/user_details?userid=Hwz-E...,Isabella L.,dMGT_S059U8hzMWxdf90SQ,5.0
2,Vc1D_i-bHfqPG5OC0WZp2A,Very good ice cream place. It is popular with ...,2021-07-04 08:06:00,PO7ZeO648Cpob9OOsxoD1Q,https://www.yelp.com/user_details?userid=PO7Ze...,Nathalie M.,dMGT_S059U8hzMWxdf90SQ,5.0
3,hVOC_Ff4VmNmjeuBTHglYQ,"Hotel is located in the theatre district, and ...",2021-12-20 03:41:13,jNXMV57BYA9B76HneQ9BRg,https://www.yelp.com/user_details?userid=jNXMV...,Denise L.,H3JWDqVp_TQGzZ0PgNCFOQ,4.0
4,v_7YTBN8FDmUMelIJGvJbg,As a couple who were visiting as tourists the ...,2017-11-13 20:03:43,7YeZFEDiHXv9iPtG1dtcbg,https://www.yelp.com/user_details?userid=7YeZF...,Helen B.,H3JWDqVp_TQGzZ0PgNCFOQ,3.0
...,...,...,...,...,...,...,...,...
565,xg7CGp3IlucGhbs-oQbY7g,So let me explain this debacle and incessant h...,2022-09-17 21:33:11,CseoI4dblHJZRGMv7dI0JQ,https://www.yelp.com/user_details?userid=CseoI...,Willus M.,ppoy_jcFLX7K6fy_FYykmQ,1.0
566,bv_luAqbK3EPCq6GDhUC9g,This place is a total DUMP! WOW!\n\nI booked i...,2013-08-09 18:35:28,uB_alU6IMzpYcwdCR_qG1g,https://www.yelp.com/user_details?userid=uB_al...,Amanda S.,ppoy_jcFLX7K6fy_FYykmQ,1.0
567,Q1UQLdzGY8JI-rJcGKf3zA,"This hotel was beautiful. Everything about it,...",2022-12-02 00:17:09,NaA5_dWorPweyYQr5fFB1w,https://www.yelp.com/user_details?userid=NaA5_...,Estefany V.,iWxv5qyiMEhuiUozY6limg,5.0
568,16HQSmK3BMv2bPLy1SLgig,Up here on a business trip. Standard deluxe ro...,2023-02-13 07:07:16,UbezZc9UIk_686GxnSlR4g,https://www.yelp.com/user_details?userid=UbezZ...,Mickey S.,iWxv5qyiMEhuiUozY6limg,2.0


In [29]:
import numpy as np
# function to take the input statement and perform the same transformations we did earlier
def sentiment_predictor(input):
    input = text_transformation(input) #input text is passed to the function named as text_transformation 
    #print(input) 
    #print(cv)
    transformed_input = cv.transform(input) #transforming input text into a matrix of token counts
    #print(transformed_input)  
    prediction = rf.predict(transformed_input) #storing the predicted values in 'prediction'
    print(prediction)
    return prediction
    #expression_check(prediction) #calling the function to print the final sentiment.
    #prediction_col = np.vectorize(expression_check)(prediction)

In [30]:
# create a new column named 'predicted_sentiment'
df_pred['predicted_sentiment'] = sentiment_predictor(df_pred["user_review"])

# print the DataFrame to verify the new column is added
print(df_pred.head())


[1 1 1 1 1 1 3 1 3 1 1 2 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 3 1 2 1 3 1 3 1 1 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 3 1
 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 1 1 3 3 1
 1 1 3 1 1 1 1 1 3 3 1 1 2 1 1 1 1 1 3 1 3 1 1 3 1 1 2 1 1 1 1 1 1 1 1 1 1
 1 1 1 3 1 1 1 1 3 1 1 3 1 1 3 1 3 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1
 1 1 3 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 3 3 2 1 1 3 1 3 2 1 1 3 1 1 1 1
 1 1 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 2
 3 1 3 1 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1 1 3 1 3 2 1 1 1 1 1 1 1
 1 3 3 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 3 2 1
 1 1 1 1 1 1 1 3 3 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1
 1 2 1 1 1 3 1 3 3 1 3 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 3 1
 1 3 3 1 1 1 1 1 1 3 1 1 

In [32]:
df_pred.head(30)

Unnamed: 0,hotelId,user_review,creation_time,user_id,user_profile_url,user_name,business_id,rating,predicted_sentiment
0,AP1FGmgvrHsAeSyZNQ_dgg,I must have the best friends because they reco...,2021-10-12 14:10:30,qTqtyUbfaL2uZYrk2L0_gw,https://www.yelp.com/user_details?userid=qTqty...,Charles Y.,dMGT_S059U8hzMWxdf90SQ,5.0,1
1,QPu24FKRWPquScIFxBlkqw,Oh how I've missed this place! This is hands d...,2020-08-01 13:20:02,Hwz-EhpzkEw15zaJurBrWA,https://www.yelp.com/user_details?userid=Hwz-E...,Isabella L.,dMGT_S059U8hzMWxdf90SQ,5.0,1
2,Vc1D_i-bHfqPG5OC0WZp2A,Very good ice cream place. It is popular with ...,2021-07-04 08:06:00,PO7ZeO648Cpob9OOsxoD1Q,https://www.yelp.com/user_details?userid=PO7Ze...,Nathalie M.,dMGT_S059U8hzMWxdf90SQ,5.0,1
3,hVOC_Ff4VmNmjeuBTHglYQ,"Hotel is located in the theatre district, and ...",2021-12-20 03:41:13,jNXMV57BYA9B76HneQ9BRg,https://www.yelp.com/user_details?userid=jNXMV...,Denise L.,H3JWDqVp_TQGzZ0PgNCFOQ,4.0,1
4,v_7YTBN8FDmUMelIJGvJbg,As a couple who were visiting as tourists the ...,2017-11-13 20:03:43,7YeZFEDiHXv9iPtG1dtcbg,https://www.yelp.com/user_details?userid=7YeZF...,Helen B.,H3JWDqVp_TQGzZ0PgNCFOQ,3.0,1
5,G0m6r3R_dN2557924FKikw,I used to stay here 15 or 20 years ago and sto...,2017-10-26 08:22:20,kvJQl7yuce_Y9ekOv0BbIA,https://www.yelp.com/user_details?userid=kvJQl...,Joseph S.,H3JWDqVp_TQGzZ0PgNCFOQ,4.0,1
6,PAp7I50Wmy0Bu5df7rnOZw,BED BUGS! If you check other reviews they clea...,2022-03-02 04:45:38,RWJ0kGOJRI34IyZXNGLLGA,https://www.yelp.com/user_details?userid=RWJ0k...,Cassandra L.,tsG9J1AFvpIo5nZNlawWzA,1.0,3
7,xudh7jS7hDXSiFH4MWv_MA,"If I can give negative star I would, Let your ...",2019-09-15 16:56:40,cOiPG6aR4sXMD1NAfiHe4w,https://www.yelp.com/user_details?userid=cOiPG...,Jen A.,tsG9J1AFvpIo5nZNlawWzA,1.0,1
8,7L-I66lXFWOvsTFW_ScClA,do not book!\ndirty dirty dirty is what i can ...,2018-10-14 12:12:29,shFlDQN4FYVgshzlTf2TlA,https://www.yelp.com/user_details?userid=shFlD...,Diana Z.,tsG9J1AFvpIo5nZNlawWzA,1.0,3
9,_LcLzOpkBoNOYoxPVwFAnw,This is an iconic hotel of Toronto. I came to ...,2022-06-26 06:41:11,17kCC6LoSz43MVnnOXh0Cw,https://www.yelp.com/user_details?userid=17kCC...,Sara D.,41eBaz8g5r8n1uO9KCIUCQ,3.0,1


In [35]:
#Encoding with respect to our needs and for easy visualisation. 
df_pred['predicted_sentiment'].replace([ 1, 2, 3],['positive','neutral','negative'],inplace= True)

In [36]:
df_pred.to_csv(r"C:\Users\milly\AISC_1006\hotel_reviewsToronto_2023-03-21 00_03_14_pred.csv", index=False)