# **Mount Google Drive for Reading in Data**

In [1]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount point is /content/drive; the dataset paths in this notebook live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# **Install Required Dependencies with pip**

In [2]:
pip install svgling

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting svgling
  Downloading svgling-0.3.1-py3-none-any.whl (21 kB)
Collecting svgwrite (from svgling)
  Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.1/67.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: svgwrite, svgling
Successfully installed svgling-0.3.1 svgwrite-1.4.3


In [3]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from tabulate import tabulate
import pprint
import re
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
nltk.download('wordnet')
import math
from sklearn.metrics import accuracy_score

plt.style.use('ggplot')

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
# splitting the data set into training set and test set
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Set Paths for Reading datasets from Google Drive**

In [4]:

# Single place to change when the review CSVs live somewhere else.
DATA_DIR = '/content/drive/MyDrive'

# One reviews CSV per city, all stored directly under DATA_DIR.
barcelonaPath = f'{DATA_DIR}/Barcelona_reviews.csv'
londonPath = f'{DATA_DIR}/London_reviews.csv'
madridPath = f'{DATA_DIR}/Madrid_reviews.csv'
nyPath = f'{DATA_DIR}/New_York_reviews.csv'
parisPath = f'{DATA_DIR}/Paris_reviews.csv'
ndelhiPath = f'{DATA_DIR}/New_Delhi_reviews.csv'

# **Create Custom Function To Format/Clean Data**

In [5]:
def CleanUprDataFromPath(path, cityName, max_rows=2500):
  """Load one review CSV and reduce it to the two columns used for modelling.

  The CSV's ``sample`` column is renamed to ``sentiment``; only ``sentiment``
  and ``review_full`` are kept, and the frame is truncated to its first
  ``max_rows`` rows. Malformed CSV lines are skipped by the parser. Missing
  values are only counted and printed, never dropped.

  Args:
    path: Location of the CSV file to read.
    cityName: City label supplied by the caller; currently not used by the
      function body.
    max_rows: Number of leading rows to keep. Defaults to 2500, the
      down-sampling size this notebook has always used. Pass ``None`` to
      keep every row.

  Returns:
    A DataFrame with the columns ``sentiment`` and ``review_full``.

  Raises:
    FileNotFoundError: If ``path`` does not exist.
    KeyError: If the CSV has no ``review_full`` column, or neither a
      ``sample`` nor a ``sentiment`` column.
  """
  print("Generating Clean Working Dataframe from path: ", path)
  # Read in the data; rows the parser cannot handle are skipped.
  df = pd.read_csv(path, on_bad_lines='skip')
  # Rename 'sample' column to 'sentiment'.
  df = df.rename(columns={"sample": "sentiment"})

  # Trim down to the label and the review text.
  df = df[['sentiment', 'review_full']]

  # Down-sample to the leading rows unless the caller asked for everything.
  if max_rows is not None:
    df = df.head(max_rows)
  print(df.shape[0])

  print("\nChecked for NA's Count is:\n", df.isna().sum(), end="\n")
  # to_markdown() relies on the optional `tabulate` package.
  print("\n", df.head().to_markdown())

  return df



# **Run Cleaning and Formatting Function on Desired Datasets**

In [6]:
# Load the Barcelona, Madrid and New York review datasets.
barce_df = CleanUprDataFromPath(barcelonaPath, "Barcelona")
madrid_df = CleanUprDataFromPath(madridPath, "Madrid")
ny_df = CleanUprDataFromPath(nyPath, "New York City")

Generating Clean Working Dataframe from path:  /content/drive/MyDrive/Barcelona_reviews.csv


FileNotFoundError: ignored

In [None]:
#Combine the Df's
addRev = pd.concat([barce_df, madrid_df, ny_df])
print(addRev.head())

In [None]:
# Shared helpers: a WordNet lemmatizer and a tokenizer that splits on whitespace only.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()


def lemmatize_text(text):
    """Return ``text`` with every whitespace-separated token lemmatized.

    Each lemma is followed by one space, so a non-empty result always ends
    with a trailing space.
    """
    return "".join(
        lemmatizer.lemmatize(token) + " " for token in w_tokenizer.tokenize(text)
    )


# Replace the raw review text with its lemmatized form.
addRev['review_full'] = addRev['review_full'].apply(lemmatize_text)

In [None]:
# Review text as an array of values, and the sentiment column as a Series.
reviews = addRev['review_full'].values
labels = addRev['sentiment']


In [None]:
# Map the sentiment labels to integer class ids.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
# Stratifying keeps the class balance in both partitions; the fixed
# random_state makes the split, and every score derived from it, reproducible.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    reviews, encoded_labels, stratify=encoded_labels, random_state=42
)

In [None]:

# creating bag of words model
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names_out()
X = X.toarray()

# word_counts[label][word] = total occurrences of `word` over the training
# documents carrying `label`. The rows of each class are summed in one NumPy
# call instead of visiting every (document, word) pair in a Python loop.
train_label_arr = np.asarray(train_labels)
word_counts = {}
for l in range(2):  # the two encoded sentiment classes, 0 and 1
    class_totals = X[train_label_arr == l].sum(axis=0)
    # Still a defaultdict, so words outside the vocabulary read as 0.
    word_counts[l] = defaultdict(lambda: 0, zip(vocab, class_totals.tolist()))

In [None]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    """Return the add-one smoothed log term for ``word`` under ``text_label``.

    Computes ``log((count + 1) / (n_label_items[text_label] + len(vocab)))``
    where ``count`` is ``word_counts[text_label][word]``.

    NOTE(review): as populated by ``fit`` in this notebook, ``n_label_items``
    holds the number of documents per label rather than the number of tokens,
    so the ratio is not guaranteed to be <= 1 -- confirm this is intended.
    """
    smoothed_count = 1 + word_counts[text_label][word]
    normaliser = len(vocab) + n_label_items[text_label]
    return math.log(smoothed_count / normaliser)

In [None]:
def group_by_label(x, y, labels):
    """Partition ``x`` by class.

    Returns a dict that maps each entry of ``labels`` (in the given order) to
    the elements of ``x`` found at the positions where ``y`` equals that label.
    """
    return {label: x[np.where(y == label)] for label in labels}

In [None]:
def fit(x, y, labels):
    """Compute per-label item counts and log priors.

    Returns a pair ``(n_label_items, log_label_priors)``: the first maps each
    label to how many entries of ``x`` carry it, the second maps each label to
    ``log(count / len(x))``.
    """
    total = len(x)
    n_label_items = {
        label: len(items)
        for label, items in group_by_label(x, y, labels).items()
    }
    log_label_priors = {
        label: math.log(count / total)
        for label, count in n_label_items.items()
    }
    return n_label_items, log_label_priors

In [None]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    """Predict a label for every text in ``x``.

    For each text the score of a label starts at its log prior and gains one
    Laplace-smoothed log term per distinct in-vocabulary token; the label with
    the highest score is chosen.

    Args:
        n_label_items: Per-label counts, forwarded to ``laplace_smoothing``.
        vocab: Iterable of known words; tokens outside it are ignored.
        word_counts: ``word_counts[label][word]`` occurrence counts.
        log_label_priors: Mapping from label to log prior.
        labels: Candidate labels to score.
        x: Iterable of texts to classify.

    Returns:
        A list with one predicted label per text, in input order.
    """
    # Built once: hash lookups instead of scanning `vocab` for every token.
    known_words = set(vocab)
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        # set(): a token contributes once per text, however often it occurs.
        # NOTE(review): tokens are split on whitespace and not lower-cased here;
        # confirm this matches how `vocab` was produced.
        for word in set(w_tokenizer.tokenize(text)):
            if word not in known_words:
                continue
            for l in labels:
                label_scores[l] += laplace_smoothing(
                    n_label_items, vocab, word_counts, word, l
                )
        result.append(max(label_scores, key=label_scores.get))
    return result

In [None]:
# Class ids to score. This rebinds `labels`, which an earlier cell used for the sentiment column.
labels = [0,1]
# Per-label item counts and log priors from the training split.
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
# Classify the held-out reviews and compare against their encoded labels.
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

# **Naive Bayes with scikit-learn (MultinomialNB)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

In [None]:
# Preview the first rows of the combined review frame.
addRev.head()

In [None]:
# Store sentiment as a categorical and force the review text to plain strings.
addRev = addRev.astype({'sentiment': 'category', 'review_full': str})

addRev.dtypes

In [None]:
# Hold-out split of the combined frame; seeded so re-runs select the same rows.
train, test = train_test_split(addRev, test_size=0.2, random_state=42)

# Bag-of-words over every review: one row per document, one column per term.
count_vec = CountVectorizer()
bowArr = count_vec.fit_transform(addRev['review_full'].tolist())
# toarray() yields the dense ndarray directly, skipping the intermediate
# np.matrix that todense() builds and np.array() then has to copy.
bow = bowArr.toarray()

x = bow
labelss = addRev['sentiment']




In [None]:
# Show the category values of the sentiment column.
labelss.cat.categories

In [None]:
# Seeded 80/20 split so the reported metrics are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, labelss,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
# Fit multinomial Naive Bayes on the training split and predict the held-out split.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

In [None]:
# NOTE(review): these display names are hard-coded and assumed to line up with
# the class order confusion_matrix uses -- confirm against labelss.cat.categories.
# This also rebinds `labels`, which earlier cells use for the class ids [0, 1].
labels = ["Negative", "Positive"]
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot();  # trailing ';' hides the repr of the returned display object

In [None]:
def print_top20(vectorizer, clf, class_labels):
    """Print, for each class, the 20 features with the highest log-probability.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        clf: Fitted classifier exposing ``feature_log_prob_`` with one row
            per class.
        class_labels: Display names, in the same order as the rows of
            ``clf.feature_log_prob_``.

    One line is printed per class: the label followed by its features,
    strongest first.
    """
    feature_names = vectorizer.get_feature_names_out()
    for i, class_label in enumerate(class_labels):
        # Row i belongs to class i. argsort is ascending, so the last 20
        # indices hold the largest values; reverse to list the strongest first.
        top20 = np.argsort(clf.feature_log_prob_[i])[-20:][::-1]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top20)))

In [None]:
# Vocabulary terms of the fitted CountVectorizer.
cols = count_vec.get_feature_names_out()
# Sanity check: the model holds exactly one log-probability per vocabulary term.
assert len(model.feature_log_prob_[0]) == len(cols)


In [None]:
# Inspect the container type of the feature names.
type(cols)

In [None]:
# Inspect the type of the first row of the model's feature_log_prob_.
type(model.feature_log_prob_[0])

In [None]:
# Rebind feaDf to an empty frame, discarding whatever it held before.
feaDf = pd.DataFrame()



In [None]:
# Pair every vocabulary term with the first row of feature_log_prob_, highest value first.
# NOTE(review): the 'coefficient' column holds log-probabilities from
# feature_log_prob_[0], not linear-model coefficients.
feaDf = feaDf.assign(
    word=cols,
    coefficient=model.feature_log_prob_[0],
).sort_values(by='coefficient', ascending=False)

In [None]:
# Show the 20 highest-ranked rows.
feaDf.head(20)