# DTSA 5510 Final Project

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from itertools import permutations

# old imports
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# latest imports
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB

from langdetect import detect

## Data import

In [None]:
# data import
reviews = pd.read_csv('data/reviews.csv')

# Basic data cleaning

In [None]:
# here is an initial view of a sample of rows in the dataset
reviews.sample(5)

In [None]:
# In the sample above, the 'NAME' column is misleadingly named, so it is
# renamed from 'NAME' to 'COUNTRY' (in place, on the same DataFrame).
reviews.rename(columns={'NAME': 'COUNTRY'}, inplace = True)

In [None]:
# there are 10,000 reviews in our initial dataset.
# The 'REVIEW_TITLE' column has one null entry which we will remove in a subsequent cell
reviews.info()

In [None]:
# Identify the row(s) with a null 'REVIEW_TITLE'. A bare expression is only
# echoed by the notebook when it is the last statement of a cell, so the rows
# are printed explicitly before they are dropped.
print(reviews[reviews['REVIEW_TITLE'].isnull()])

# 'REVIEW_TEXT' is the field we really care about, so this row could probably
# be left in, but it is removed just to be safe.
reviews.dropna(subset=['REVIEW_TITLE'], inplace=True)

In [None]:
# double checking that there are no more null values in the updated dataset
reviews.isnull().sum()

In [None]:
reviews.head()

In [None]:
# Two more adjustments are needed: the 'REVIEW_DATE' column is parsed into
# datetimes, and the 'ID' column is cast to the generic object dtype.
reviews['REVIEW_DATE'] = pd.to_datetime(reviews['REVIEW_DATE'])
reviews['ID'] = reviews['ID'].astype(object)

## Removing non-english reviews

There is another issue that will impact our results later on in the model testing phase. Even though these reviews are sourced from Amazon.com, some of them are written in languages other than English. A mask will be created to filter out all non-English reviews.

In [None]:
# The langdetect package can be used to detect the language of a body of text.
# the package can then be used in a basic function to return the particular language of the review.
def detect_language(text):
    """Return the language code that ``detect`` assigns to ``text``.

    Args:
        text: the review text to classify.

    Returns:
        The value returned by ``detect(text)``, or ``'unknown'`` when ``text``
        is not a non-blank string or when detection raises an error.
    """
    # Missing values (None / NaN) and blank strings cannot be classified;
    # answer directly instead of relying on the detector to raise.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. Catching ``Exception`` rather than using a bare
        # ``except`` lets KeyboardInterrupt/SystemExit propagate, so a long
        # ``.apply(detect_language)`` run can still be interrupted.
        return 'unknown'

In [None]:
# Running the 'REVIEW_TEXT' column of our dataset through the detect_language function from above
reviews['LANGUAGE'] = reviews['REVIEW_TEXT'].apply(detect_language)

In [None]:
# Results of the language detection: the number of reviews per detected language
reviews['LANGUAGE'].value_counts()

In [None]:
# Example of a portuguese review in the dataset
pd.set_option('display.max_colwidth', None)
reviews['REVIEW_TEXT'][reviews['LANGUAGE'] == 'pt'].head(1)

In [None]:
# Keep only the reviews whose detected language is English. The explicit
# .copy() makes english_reviews an independent DataFrame, so the columns that
# are assigned on it afterwards do not write to a slice of `reviews`
# (which raises pandas' SettingWithCopyWarning).
english_mask = reviews['LANGUAGE'] == 'en'
english_reviews = reviews[english_mask].copy()

In [None]:
# Word count per review: the number of whitespace-separated tokens in 'REVIEW_TEXT'
english_reviews['word_count'] = english_reviews['REVIEW_TEXT'].apply(lambda x: len(x.split()))

## Visualizations

In [None]:
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize = (25, 10))

# value_counts() orders by frequency while unique() orders by first appearance,
# so pairing the two attaches bar heights to the wrong ratings. Take both the
# x positions and the heights from the same Series, sorted by rating.
ratings = reviews['RATING'].value_counts().sort_index()
ind = ratings.index

ax.bar(ind, ratings.values, color = 'firebrick', align = 'center')

# Write each bar's count just above the bar
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2,
            p.get_height(),
            f'{p.get_height():.0f}',
            ha='center',
            va='bottom'
           )

plt.title('Count of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

In [None]:
# dataset is unbalanced, but that is ok in our context as we will not be exploring the relationship between variables, just the underlying text

In [None]:
reviews.head()

## Visualizations p2

In [None]:
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize = (25, 10))

# Histogram of review length with one set of dodged bars per rating.
# `legend` is a boolean switch in seaborn (the hue variable already names the
# legend), so pass True rather than a column name; draw on `ax` explicitly.
sns.histplot(data=english_reviews, x='word_count', hue='RATING', multiple='dodge',
             legend=True, kde=True, bins=50, ax=ax)

plt.xlabel('Word count per review')
plt.ylabel('Count')
plt.title('Number of words by review histogram')

In [None]:
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize = (25, 10))

# unique() returns ratings in order of first appearance; sort them so the boxes
# run in ascending rating order along the x axis. The same array drives both
# the data and the tick labels, which keeps them aligned.
rating_levels = np.sort(english_reviews['RATING'].unique())
wc_group = english_reviews.groupby('RATING')['word_count'].apply(list)
data = [wc_group[cat] for cat in rating_levels]

bp = plt.boxplot(data, labels = rating_levels, patch_artist=True)

# One fill colour per box (zip stops at the shorter of the two sequences)
colors = ['blue', 'black', 'green', 'yellow', 'purple']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

plt.title('Word count boxplot')
plt.xlabel('Review score')
plt.ylabel('Word count')

In [None]:
## graph of average score over time

In [None]:
pd.set_option('display.max_colwidth', None)
english_reviews['REVIEW_TEXT'].sample(1, random_state = 14)

# Count Vectorize

In [None]:
def remove_punctuation(text):
    """Return ``text`` with a fixed set of punctuation characters removed."""
    # Characters that are dropped; every other character is kept unchanged.
    dropped = {'?', '.', ';', ':', '!', '"', "'", '(', ')', '[', ']', '/', ',', '-'}
    kept = [ch for ch in text if ch not in dropped]
    return "".join(kept)

In [None]:
# Tokenizer whose pattern matches runs of ASCII letters and digits. The pattern
# is a raw string: the original f-string had no placeholders, and raw strings
# are the convention for regular expressions.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

In [None]:
test_token = 'This scanner is super easy to use and scans photos very quickly/accurately. The software comes on a CD, which I couldn’t use since I was using my laptop but the software is easy to find online anyway. The one thing to note is you will need a USB port to connect scanner to laptop. A lot of laptops are USBc only these days so that might be tough. Otherwise, a great product.'

In [None]:
token.tokenize(remove_punctuation(test_token))[:10]

In [None]:
# remove_punctuation works on ONE string. Called on the whole Series it would
# iterate over reviews (not characters), join every review into a single giant
# string and broadcast that string to every row. Apply it element-wise instead.
english_reviews['REVIEW_TEXT'] = english_reviews['REVIEW_TEXT'].apply(remove_punctuation)

In [None]:
# Bag-of-words vectorizer: unigrams produced by the custom tokenizer, English
# stop words removed, limited to 500 features.
cv = CountVectorizer(
    tokenizer=token.tokenize,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=500,
)

In [None]:
text_counts = cv.fit_transform(english_reviews['REVIEW_TEXT'])

# Train Test Split P2

In [None]:
# Features are the word counts and the target is the star rating. Using
# 'REVIEW_TEXT' as the target would ask the classifier to predict the review
# text itself instead of its rating.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts, english_reviews['RATING'], test_size=0.2, random_state=42
)

# Training the model 

In [None]:
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [None]:
predicted = MNB.predict(X_test)
# Keep the score under its own name: assigning it to `accuracy_score` would
# replace the imported sklearn function with a float and break every later
# call to it (label_compare relies on it). Arguments follow the documented
# (y_true, y_pred) order.
mnb_accuracy = accuracy_score(Y_test, predicted)
print(f'The accuracy score of the Multinomial Naive Bayes model is: {mnb_accuracy}')

# Train Test Split

In [None]:
# final sample of the english_reviews dataset that will be split into training and testing datasets
english_reviews.sample(5)

In [None]:
english_reviews_train, english_reviews_test = train_test_split(english_reviews, test_size=0.15, random_state=42)

In [None]:
print(f'The number of rows in the training dataset is: {len(english_reviews_train)}')

In [None]:
print(f'The number of rows in the testing dataset is: {len(english_reviews_test)}')

In [None]:
y = english_reviews_train['RATING'].tolist()
english_reviews_train_text = english_reviews_train['REVIEW_TEXT']

# Implementing TF-IDF

In [None]:
# TF-IDF over unigrams and bigrams of the training text, English stop words
# removed, limited to 1000 features; the result is also kept as a dense
# DataFrame with one column per term.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
)
features = tfidf.fit_transform(english_reviews_train_text)
df_tfidf = pd.DataFrame(
    data=features.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# original review
english_reviews_train.iloc[2]['REVIEW_TEXT']

In [None]:
df_tfidf_three  = df_tfidf.iloc[2]
sorted_doc_three_scores = df_tfidf_three.sort_values(ascending=False)

In [None]:
sorted_doc_three_scores[:20]

In [None]:
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(25, 10))

# Bar chart of the 20 highest TF-IDF scores for the selected review
sorted_doc_three_scores.head(20).plot.bar(ax=ax)
ax.set_title('TF-IDF score for review 3')
ax.set_ylabel('TF-IDF score')
ax.set_xlabel('Terms')

# Unsupervised model

In [None]:
# Five components, matching the NMF step of the grid-search pipeline in this
# notebook. Without n_components the factorisation is not reduced to five
# topics, and the argmax / label-permutation matching that follows would have
# to search over far more than five labels.
model_nmf = NMF(n_components = 5, init = 'random', beta_loss = 'frobenius',  verbose = 1, random_state = 42).fit(df_tfidf)
# max_iter = 100,

In [None]:
weights = model_nmf.transform(df_tfidf)
pd.DataFrame(weights).head()

In [None]:
pred = np.argmax(weights, axis = 1)
pred

In [None]:
# predictions are zero indexed, need to add one to match the y values
pred_adjusted = pred + 1

In [None]:
def label_compare(y, yp):
    """Return the best accuracy over all relabelings of the predicted labels.

    Predicted labels are arbitrary ids, so every permutation of the distinct
    values found in ``yp`` is tried as a relabeling and scored against ``y``.

    Args:
        y: true labels (1-D array-like).
        yp: predicted labels (1-D array-like of the same length as ``y``).

    Returns:
        The highest fraction of positions at which the relabeled ``yp`` equals
        ``y``; ``0.0`` for empty input.

    Raises:
        ValueError: if ``y`` and ``yp`` have different lengths.

    Side effects:
        Stores the winning permutation in the module-level ``best_ordering``.
        Its i-th entry is the label assigned to ``np.unique(yp)[i]``.
    """
    global best_ordering
    y_true = np.asarray(y)
    # `codes[i]` is the position of yp[i] inside `unique_labels`; computing it
    # once lets each permutation be applied with a single fancy-index lookup.
    unique_labels, codes = np.unique(np.asarray(yp), return_inverse=True)
    codes = codes.reshape(-1)
    if y_true.shape[0] != codes.shape[0]:
        raise ValueError('y and yp must have the same length')

    best_accuracy = 0.0
    # Start from the identity mapping so `best_ordering` is always defined,
    # even when no permutation scores above zero.
    best_ordering = tuple(unique_labels)
    if codes.size == 0:
        return best_accuracy

    for order in permutations(unique_labels):
        reordered_yp = np.asarray(order)[codes]
        # Plain accuracy: share of positions where the labels agree.
        current_score = float(np.mean(reordered_yp == y_true))
        if current_score > best_accuracy:
            best_accuracy = current_score
            best_ordering = order

    return best_accuracy

In [None]:
print(f'The best accuracy achieved with random hyperparameters for the NMF model is: {label_compare(y, pred_adjusted)}')

In [None]:
pred[:10]

In [None]:
y[:10]

In [None]:
# Translate every adjusted prediction through the best ordering found above:
# the i-th distinct predicted label maps to the i-th entry of best_ordering.
labels = np.unique(pred_adjusted)
label_map = {src: dst for src, dst in zip(labels, best_ordering)}
reorder_yp = np.array(list(map(label_map.__getitem__, pred_adjusted)))

In [None]:
labels

In [None]:
best_ordering

In [None]:
reorder_yp

In [None]:
label_names = english_reviews_train['RATING'].unique().tolist()
l = pd.Index(label_names)
# pd.crosstab reads a plain Python list as a *list of separate arrays*, not as
# one variable. Wrapping both inputs in 1-D Series (fresh default index, so the
# two align positionally) makes it cross-tabulate actual vs. predicted labels.
cm = pd.crosstab(
    pd.Series(np.asarray(y), name='Actual'),
    pd.Series(np.asarray(reorder_yp), name='Predicted'),
)
# cm = pd.crosstab(l[y], l[reorder_yp])

plt.figure(figsize=(10, 4))
sns.heatmap(cm, annot=True, square=True, cmap='Reds', fmt='.0f', cbar=False)
plt.title('NMF random model confusion matrix', fontsize = 12)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.xticks(rotation=20, fontsize=8)
plt.yticks(fontsize=8)

In [None]:
def custom_scoring(estimator, X, y):
    """Scoring callable for GridSearchCV.

    Args:
        estimator: fitted estimator exposing ``transform``.
        X: samples to transform.
        y: true labels for ``X``.

    Returns:
        The value of ``label_compare`` for ``y`` against the index (plus one)
        of the largest entry in each transformed row.
    """
    W = estimator.transform(X)
    # argmax yields zero-based indices; add one so the predictions match the
    # y values, exactly as is done for `pred_adjusted` in this notebook.
    predictions = np.argmax(W, axis=1) + 1
    return label_compare(y, predictions)

In [None]:
# TF-IDF followed by a 5-component NMF. NMF uses a random initialisation, so
# it is seeded (like every other randomised step in this notebook) to make the
# grid-search scores reproducible from run to run.
pipe = Pipeline([
    ('tfidfvectorizer', TfidfVectorizer(ngram_range =(1,2), stop_words = 'english', max_features = 1000)),
    ( 'nmf', NMF(n_components = 5, init = 'random', beta_loss = 'frobenius', random_state = 42))
])

# Hyperparameter grid: 3 x 3 x 3 = 27 combinations
params = [{
    # 'tfidfvectorizer__ngram_range': [(1, 2)],
    'tfidfvectorizer__min_df': [1, 2, 3],
    'nmf__alpha_W': [0, .01, .02],
    'nmf__alpha_H': [0, .01, .02]
}]

In [None]:
# Inputs for the search: raw review text and the matching ratings
x = english_reviews_train['REVIEW_TEXT']
y = english_reviews_train['RATING']

# 10-fold grid search over `params`, scored with the custom scoring function
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=custom_scoring,
    cv=10,
)
grid.fit(x, y)

In [None]:
# Show full cell contents so the 'params' dictionaries are not truncated
pd.set_option('display.max_colwidth', None)
cv_results = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score')
# First ten rows when ordered by rank, restricted to the columns of interest
cv_results.loc[
    :, ['mean_fit_time', 'params', 'mean_test_score', 'rank_test_score']
].sort_values(by='rank_test_score').head(10)