In [1]:
import os
from pathlib import Path
# Check if the code is running on Google Colab
try:
    import google.colab
    IN_COLAB = True
    base_path = "/content/"
    if Path(f"{base_path}final_project").is_dir():
      %cd {base_path}final_project
      !git pull
      %cd {base_path}
    else:
      !git clone https://github.com/fernandaluft/final_project.git
except ImportError:
    IN_COLAB = False
    base_path = "/workspaces/"

Cloning into 'final_project'...
remote: Enumerating objects: 195, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 195 (delta 9), reused 15 (delta 6), pack-reused 173[K
Receiving objects: 100% (195/195), 139.78 MiB | 37.80 MiB/s, done.
Resolving deltas: 100% (96/96), done.
Updating files: 100% (36/36), done.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from final_project.src.scraping import Scraping
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import pickle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
scraping = Scraping(IN_COLAB, sentiment_ds = True)
scraping.kaggle_scrape()

In [4]:
class SentimentMLTrain():
  def __init__(self, dataset_limit):
    self.dataset_limit = dataset_limit
    self.stop_words = set(stopwords.words('english'))
    self.lemmatizer = WordNetLemmatizer()
    self.tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
    self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
    self.best_model = None

  def read_sentiment_dataset(self):
    !unzip -o -n /content/imdb-dataset-of-65k-movie-reviews-and-translation.zip -d {base_path}final_project/data
    os.system(f'rm -rf /content/imdb-dataset-of-65k-movie-reviews-and-translation.zip')
    self.sentiment_df = pd.read_csv(f"{base_path}final_project/data/IMDB-Dataset.csv").sample(5000)

  def preprocess_text(self, text):
    text = re.sub(r'\W|\d', ' ', str(text))  # Remove special characters and digits
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]  # Lemmatize and remove stopwords
    return ' '.join(lemmatized_tokens)

  def preprocess_data(self):
    self.sentiment_df['clean_text'] = self.sentiment_df['Reviews'].apply(self.preprocess_text)

  def split_data(self):
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.tfidf_vectorizer.fit_transform(self.sentiment_df['clean_text']),
        self.sentiment_df['Ratings'],
        test_size=0.2,
        random_state=42)
    with open('/content/final_project/models/tf_idf.pickle', 'wb') as f:
      pickle.dump(self.tfidf_vectorizer, f)

  def train_model(self):
    svm_classifier = SVC(kernel='linear')
    param_grid = {'C': [0.1, 1, 10, 100]}  # Hyperparameter grid for tuning
    grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
    grid_search.fit(self.X_train, self.y_train)
    self.best_model = grid_search.best_estimator_

  def evaluate_model(self):
    y_pred = self.best_model.predict(self.X_test)
    accuracy = accuracy_score(self.y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(self.y_test, y_pred))

  def save_model(self, model_path):
    with open(model_path, 'wb') as f:
      pickle.dump(self.best_model, f)

In [5]:
sentiment_ml_train = SentimentMLTrain(None)
sentiment_ml_train.read_sentiment_dataset()

caution:  both -n and -o specified; ignoring -o
Archive:  /content/imdb-dataset-of-65k-movie-reviews-and-translation.zip
  inflating: /content/final_project/data/IMDB-Dataset.csv  


In [6]:
sentiment_ml_train.sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 144037 to 130582
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Ratings   1000 non-null   float64
 1   Reviews   1000 non-null   object 
 2   Movies    999 non-null    object 
 3   Resenhas  1000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 39.1+ KB


In [7]:
sentiment_ml_train.preprocess_data()
sentiment_ml_train.split_data()
sentiment_ml_train.train_model()
sentiment_ml_train.evaluate_model()

Accuracy: 0.265
              precision    recall  f1-score   support

         1.0       0.52      0.52      0.52        25
         2.0       0.19      0.28      0.23        18
         3.0       0.29      0.32      0.30        19
         4.0       0.24      0.25      0.24        20
         5.0       0.12      0.27      0.17        15
         6.0       0.21      0.22      0.22        18
         7.0       0.13      0.09      0.11        22
         8.0       0.20      0.08      0.12        24
         9.0       0.39      0.44      0.41        16
        10.0       0.38      0.22      0.28        23

    accuracy                           0.27       200
   macro avg       0.27      0.27      0.26       200
weighted avg       0.28      0.27      0.26       200



In [8]:
sentiment_ml_train.save_model("/content/final_project/models/sentiment_model.pkl")

In [9]:
!unzip -o -n /content/final_project/preprocessed_data/xaa_books_reviews.zip -d {base_path}final_project/data

caution:  both -n and -o specified; ignoring -o
Archive:  /content/final_project/preprocessed_data/xaa_books_reviews.zip
  inflating: /content/final_project/data/content/final_project/data/books_reviews.csv  


In [12]:
import pickle

def calculate_sentiment_book(title):
  sentiment_ml = SentimentMLTrain(None)
  n_neg = 0
  n_pos = 0
  with open("/content/final_project/models/sentiment_model.pkl", "rb") as f:
    sentiment_model = pickle.load(f)

  with open("/content/final_project/models/tf_idf.pickle", "rb") as f:
    vec = pickle.load(f)
  books = pd.read_csv("/content/final_project/data/content/final_project/data/books_reviews.csv")


  books_subset = books[books.Title == title]['review/text']

  for rev, rev2 in books_subset.items():

    processed = sentiment_ml.preprocess_text(rev2)
    processed = vec.transform([processed])

    score = sentiment_model.predict(processed)
    if score >= 5:
      n_pos += 1
    else:
      n_neg += 1

  return [n_neg, n_pos]

In [11]:
print(calculate_sentiment_book('Run Baby Run'))

  (0, 4943)	0.14723072167214837
  (0, 4934)	0.12095300774538859
  (0, 4907)	0.12156474474597473
  (0, 4521)	0.12512474271979876
  (0, 4497)	0.05331095556624568
  (0, 4393)	0.13826479363361338
  (0, 4207)	0.11099748855323574
  (0, 4205)	0.1451889238594452
  (0, 3559)	0.3099455944161144
  (0, 3236)	0.13151126248020392
  (0, 3189)	0.10749064973343528
  (0, 3074)	0.27652958726722676
  (0, 3064)	0.08116472120279151
  (0, 3033)	0.19880198289466547
  (0, 2769)	0.15183333937650076
  (0, 2430)	0.04774393122814306
  (0, 2424)	0.08059269382847838
  (0, 2207)	0.13983960294176587
  (0, 2164)	0.16853338826413944
  (0, 2117)	0.1451889238594452
  (0, 2013)	0.11182137382825734
  (0, 1963)	0.1155842568290911
  (0, 1585)	0.1281333647209562
  (0, 1572)	0.06427267389239831
  (0, 1185)	0.17355665020279212
  (0, 1179)	0.12241255508989692
  (0, 932)	0.1432880555722661
  (0, 783)	0.1155842568290911
  (0, 722)	0.1606072689922812
  (0, 700)	0.2988720626258946
  (0, 618)	0.16428628081063737
  (0, 499)	0.164286280