# Using the model
This notebook applies the previously trained models to hotel reviews scraped from Tripadvisor.

In [1]:
import os

In [2]:
import pandas as pd
import numpy as np

Importing the metrics used to assess the results: confusion matrix, classification report and accuracy score

In [3]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

Importing data preprocessor functions

In [4]:
from data_preprocessor import read_file, reviews_to_list, text_preprocessing, form_corpus

[nltk_data] Downloading package punkt to /Users/farhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/farhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/farhan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/farhan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Import data exporter

In [None]:
from export_data import export_data

Importing the models and the feature extractor

In [None]:
# Load the four trained classifiers and the fitted feature extractor from disk.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
(
    random_forest_classifier,
    logistic_regression_classifier,
    sgd_classifier,
    mnb_classifier,
    vectorizer,
) = map(
    pd.read_pickle,
    (
        'trained_models/RandomForest',
        'trained_models/LogisticRegression',
        'trained_models/SGD',
        'trained_models/MNB',
        'feature_extractor/vectorizer',
    ),
)

# Using the trained models on our own dataset
In this section, we will be using the dataset scraped from __Tripadvisor__ to determine the hotel review sentiment.

## Reading the files

In [None]:
# Each hotel has one CSV of good reviews and one CSV of bad reviews; read them pairwise.
dataset_mbs_good, dataset_mbs_bad = (
    read_file('processed_tripadvisor_data/tripadvisor_mbs_good.csv'),
    read_file('processed_tripadvisor_data/tripadvisor_mbs_bad.csv'),
)

dataset_shangrila_good, dataset_shangrila_bad = (
    read_file('processed_tripadvisor_data/tripadvisor_shangrila_good.csv'),
    read_file('processed_tripadvisor_data/tripadvisor_shangrila_bad.csv'),
)

dataset_fullerton_good, dataset_fullerton_bad = (
    read_file('processed_tripadvisor_data/tripadvisor_fullerton_good.csv'),
    read_file('processed_tripadvisor_data/tripadvisor_fullerton_bad.csv'),
)

Number of Good and Bad reviews for each hotel

In [None]:
# Report how many reviews (rows) each hotel has per polarity.
# len() is used instead of .shape so the message shows a count rather than a (rows, columns) tuple.
print(f'Number of good MBS reviews: {len(dataset_mbs_good)}')
print(f'Number of bad MBS reviews: {len(dataset_mbs_bad)}')

print(f'Number of good Shangri-La reviews: {len(dataset_shangrila_good)}')
print(f'Number of bad Shangri-La reviews: {len(dataset_shangrila_bad)}')

print(f'Number of good Fullerton reviews: {len(dataset_fullerton_good)}')
print(f'Number of bad Fullerton reviews: {len(dataset_fullerton_bad)}')


Combine the positive and negative datasets

In [None]:
# Stack the good and bad reviews of each hotel into one frame with a fresh 0..n-1 index.
# drop=True discards the old per-file index instead of inserting it as a spurious `index` column
# (which would also raise if the source data already had a column named `index`).
dataset_mbs = pd.concat([dataset_mbs_good, dataset_mbs_bad]).reset_index(drop=True)
dataset_shangrila = pd.concat([dataset_shangrila_good, dataset_shangrila_bad]).reset_index(drop=True)
dataset_fullerton = pd.concat([dataset_fullerton_good, dataset_fullerton_bad]).reset_index(drop=True)

In [None]:
# Preview the first five rows of the combined MBS dataset.
dataset_mbs.head(n=5)

Create a new column in the dataset, `reviews`, for preprocessing

In [None]:
# Copy the raw review text into a `reviews` column on every hotel dataset.
for hotel_dataset in (dataset_mbs, dataset_shangrila, dataset_fullerton):
    hotel_dataset["reviews"] = hotel_dataset["reviewContent"]

## Data Preprocessing

Convert the `reviews` column to a list

In [None]:
# Run reviews_to_list over each hotel dataset, in MBS / Shangri-La / Fullerton order.
mbs_reviews, shangrila_reviews, fullerton_reviews = map(
    reviews_to_list,
    (dataset_mbs, dataset_shangrila, dataset_fullerton),
)

In [None]:
# Apply text_preprocessing to every review, one hotel at a time.
mbs_reviews_processed = list(map(text_preprocessing, mbs_reviews))
shangrila_reviews_processed = list(map(text_preprocessing, shangrila_reviews))
fullerton_reviews_processed = list(map(text_preprocessing, fullerton_reviews))

Form review corpora

In [None]:
# Build one corpus per hotel from its preprocessed reviews.
mbs_corpus, shangrila_corpus, fullerton_corpus = (
    form_corpus(mbs_reviews_processed),
    form_corpus(shangrila_reviews_processed),
    form_corpus(fullerton_reviews_processed),
)

Extract features from reviews

In [None]:
def _extract_features(corpus):
    """Transform ``corpus`` with the loaded vectorizer and return the result of ``.toarray()``."""
    return vectorizer.transform(corpus).toarray()


mbs_features = _extract_features(mbs_corpus)
shangrila_features = _extract_features(shangrila_corpus)
fullerton_features = _extract_features(fullerton_corpus)

# Show the shape of each hotel's feature matrix.
for feature_matrix in (mbs_features, shangrila_features, fullerton_features):
    print(feature_matrix.shape)

Retrieve the ground-truth ("correct") labels from the `sentiment` column

In [None]:
# Pull the `sentiment` column of each hotel dataset out as a plain Python list.
mbs_labels, shangrila_labels, fullerton_labels = (
    labelled_dataset['sentiment'].tolist()
    for labelled_dataset in (dataset_mbs, dataset_shangrila, dataset_fullerton)
)

## Applying the processed data to our models

Random Forest Classifier

In [None]:
# Predict each hotel's reviews with the Random Forest model.
random_forest_mbs, random_forest_shangrila, random_forest_fullerton = map(
    random_forest_classifier.predict,
    (mbs_features, shangrila_features, fullerton_features),
)

# Score each hotel's predictions against its labels and report the accuracy.
rf_accuracy_mbs = accuracy_score(mbs_labels, random_forest_mbs)
print(f'Accuracy for MBS using Random Forest Classifier: {rf_accuracy_mbs}')
rf_accuracy_shangrila = accuracy_score(shangrila_labels, random_forest_shangrila)
print(f'Accuracy for Shangri-La using Random Forest Classifier: {rf_accuracy_shangrila}')
rf_accuracy_fullerton = accuracy_score(fullerton_labels, random_forest_fullerton)
print(f'Accuracy for Fullerton using Random Forest Classifier: {rf_accuracy_fullerton}')

In [None]:
# Confusion matrices for all three hotels (previously only MBS was shown),
# matching what is printed for the SGD classifier further down.
print(confusion_matrix(mbs_labels, random_forest_mbs))
print(confusion_matrix(shangrila_labels, random_forest_shangrila))
print(confusion_matrix(fullerton_labels, random_forest_fullerton))

In [None]:
# Classification report per hotel for the Random Forest predictions.
for true_labels, predicted_labels in (
    (mbs_labels, random_forest_mbs),
    (shangrila_labels, random_forest_shangrila),
    (fullerton_labels, random_forest_fullerton),
):
    print(classification_report(true_labels, predicted_labels))

Logistic Regression Classifier

In [None]:
# Predict each hotel's reviews with the Logistic Regression model.
logistic_regression_mbs, logistic_regression_shangrila, logistic_regression_fullerton = map(
    logistic_regression_classifier.predict,
    (mbs_features, shangrila_features, fullerton_features),
)

# Score each hotel's predictions against its labels and report the accuracy.
lr_accuracy_mbs = accuracy_score(mbs_labels, logistic_regression_mbs)
print(f'Accuracy for MBS using Logistic Regression Classifier: {lr_accuracy_mbs}')
lr_accuracy_shangrila = accuracy_score(shangrila_labels, logistic_regression_shangrila)
print(f'Accuracy for Shangri-La using Logistic Regression Classifier: {lr_accuracy_shangrila}')
lr_accuracy_fullerton = accuracy_score(fullerton_labels, logistic_regression_fullerton)
print(f'Accuracy for Fullerton using Logistic Regression Classifier: {lr_accuracy_fullerton}')

In [None]:
# Classification report per hotel for the Logistic Regression predictions.
for true_labels, predicted_labels in (
    (mbs_labels, logistic_regression_mbs),
    (shangrila_labels, logistic_regression_shangrila),
    (fullerton_labels, logistic_regression_fullerton),
):
    print(classification_report(true_labels, predicted_labels))

SGD Classifier

In [None]:
# Predict each hotel's reviews with the SGD model.
sgd_mbs, sgd_shangrila, sgd_fullerton = map(
    sgd_classifier.predict,
    (mbs_features, shangrila_features, fullerton_features),
)

# Score each hotel's predictions against its labels and report the accuracy.
sgd_accuracy_mbs = accuracy_score(mbs_labels, sgd_mbs)
print(f'Accuracy for MBS using SGD Classifier: {sgd_accuracy_mbs}')
sgd_accuracy_shangrila = accuracy_score(shangrila_labels, sgd_shangrila)
print(f'Accuracy for Shangri-La using SGD Classifier: {sgd_accuracy_shangrila}')
sgd_accuracy_fullerton = accuracy_score(fullerton_labels, sgd_fullerton)
print(f'Accuracy for Fullerton using SGD Classifier: {sgd_accuracy_fullerton}')

In [None]:
# Class-probability estimates from the SGD model for each hotel (used later by the export step).
sgd_mbs_proba, sgd_shangrila_proba, sgd_fullerton_proba = (
    sgd_classifier.predict_proba(feature_matrix)
    for feature_matrix in (mbs_features, shangrila_features, fullerton_features)
)

In [None]:
# Confusion matrix per hotel for the SGD predictions.
for true_labels, predicted_labels in (
    (mbs_labels, sgd_mbs),
    (shangrila_labels, sgd_shangrila),
    (fullerton_labels, sgd_fullerton),
):
    print(confusion_matrix(true_labels, predicted_labels))

In [None]:
# Classification report per hotel for the SGD predictions.
for true_labels, predicted_labels in (
    (mbs_labels, sgd_mbs),
    (shangrila_labels, sgd_shangrila),
    (fullerton_labels, sgd_fullerton),
):
    print(classification_report(true_labels, predicted_labels))

__MNB Classifier__

In [None]:
# Predict each hotel's reviews with the MNB model.
mnb_mbs, mnb_shangrila, mnb_fullerton = map(
    mnb_classifier.predict,
    (mbs_features, shangrila_features, fullerton_features),
)

# Score each hotel's predictions against its labels and report the accuracy.
mnb_accuracy_mbs = accuracy_score(mbs_labels, mnb_mbs)
print(f'Accuracy for MBS using MNB Classifier: {mnb_accuracy_mbs}')
mnb_accuracy_shangrila = accuracy_score(shangrila_labels, mnb_shangrila)
print(f'Accuracy for Shangri-La using MNB Classifier: {mnb_accuracy_shangrila}')
mnb_accuracy_fullerton = accuracy_score(fullerton_labels, mnb_fullerton)
print(f'Accuracy for Fullerton using MNB Classifier: {mnb_accuracy_fullerton}')

In [None]:
# Class-probability estimates from the MNB model for each hotel.
mnb_mbs_proba, mnb_shangrila_proba, mnb_fullerton_proba = (
    mnb_classifier.predict_proba(feature_matrix)
    for feature_matrix in (mbs_features, shangrila_features, fullerton_features)
)

In [None]:
# Classification report per hotel for the MNB predictions.
for true_labels, predicted_labels in (
    (mbs_labels, mnb_mbs),
    (shangrila_labels, mnb_shangrila),
    (fullerton_labels, mnb_fullerton),
):
    print(classification_report(true_labels, predicted_labels))

## Exporting the classified and labelled data

In [None]:
def lower(string: str) -> str:
    """Return a lowercase copy of ``string``.

    Applied to the predicted labels before export so they are easier for the
    web application to parse.
    """
    lowered = string.lower()
    return lowered

# Export directory: the `cleanedData` folder inside the parent of the current working directory.
# os.path.join is used instead of appending '/cleanedData' so the separator is right on every
# platform and no doubled separator appears when the parent is the filesystem root.
path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'cleanedData')

# Columns of the scraped data handed to the exporter for every hotel.
export_columns = ['reviewerName', 'reviewDate', 'reviewContent', 'reviewRating']

# Export each hotel's dataset with the SGD model's lower-cased predictions,
# its predicted probabilities and the hotel's corpus.
for export_name, hotel_dataset, hotel_predictions, hotel_probabilities, hotel_corpus in (
    ('tripadvisor_mbs', dataset_mbs, sgd_mbs, sgd_mbs_proba, mbs_corpus),
    ('tripadvisor_shangrila', dataset_shangrila, sgd_shangrila, sgd_shangrila_proba, shangrila_corpus),
    ('tripadvisor_fullerton', dataset_fullerton, sgd_fullerton, sgd_fullerton_proba, fullerton_corpus),
):
    export_data(
        path,
        export_name,
        hotel_dataset,
        list(export_columns),  # each call gets its own list
        list(map(lower, hotel_predictions)),
        hotel_probabilities,
        hotel_corpus,
    )