# Set-up

Mount Google Drive so we have access to the dataset, the files defining our model, and so that we can save the results of training.

In [None]:
# Colab-only helper used to mount the user's Google Drive.
from google.colab import drive

# Mount Drive under /content/drive (Colab prompts for authorization).
drive.mount('/content/drive')

# sys is needed to extend the module search path below.
import sys

# Project folder, relative to "My Drive". NOTE(review): the value already
# starts and ends with '/', so the paths built below contain a double slash
# ("My Drive//WandB/week_1/").
foldername = '/WandB/week_1/'

# Make modules stored in the project folder importable.
# Presumably this is where params.py lives -- TODO confirm.
sys.path.append(f'/content/drive/My Drive/{foldername}')

# Work from the project folder so relative paths used later resolve against it.
%cd /content/drive/My\ Drive/$foldername

Install the dependencies that were used for training the Hugging Face transformer model on the Kaggle Goodreads dataset.

In [None]:
# Install required packages
!pip3 install wandb transformers[sentencepiece] datasets evaluate 

# Import necessary modules
import wandb, transformers, torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import params

# Ensure that plots are displayed inline
%matplotlib inline

# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Enable autoreload for code changes
%load_ext autoreload
%autoreload 2

# Import functions for loading datasets and metrics from Hugging Face
from datasets import load_from_disk, load_dataset, Dataset, load_metric

# Import required classes and functions from the Transformers library
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

# Import the evaluate module
import evaluate

# Import the EarlyStoppingCallback and TrainerCallback from transformers
from transformers.trainer_callback import (EarlyStoppingCallback, 
                                           TrainerCallback)

# Import the params module
import params

# Import notebook_login function from huggingface_hub
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub
notebook_login()

# EDA

In this section we will download and process the dataset of Goodreads reviews and then perform exploratory data analysis.

## Log raw data

In [None]:
# Open the W&B run that will own the raw-data artifact logged in the next cell.
# The project and job type come from params.py; entity=None is passed
# explicitly (presumably so W&B falls back to the default entity -- TODO confirm).
run = wandb.init(project=params.WANDB_PROJECT,
                 entity=None,
                 job_type=params.RAW_DATA_JOB_TYPE,
                 name='raw_data_2')

In [None]:
# Location of the raw Goodreads training CSV on the mounted Drive.
train_path = '/content/drive/My Drive/WandB/assignment_1/data/raw_data/goodreads_train.csv'

# Load the reviews into memory; the EDA cells below work on this frame.
df = pd.read_csv(train_path)

# Version the raw CSV as a W&B dataset artifact.
raw_data_art = wandb.Artifact(params.RAW_DATA_ARTIFACT, type=params.DATASET_TYPE)

# Add exactly the file that was loaded above. The previous version added
# './data/raw_data/goodreads_train.csv' relative to the working directory,
# which is a different location from `train_path`, so the logged artifact
# could differ from (or be missing next to) the data actually analysed.
raw_data_art.add_file(train_path)

# Upload the artifact and close the run.
run.log_artifact(raw_data_art)
run.finish()

## Downsample and Tokenize

In [None]:
# Open the W&B run that tracks down-sampling / processing of the raw data.
# NOTE(review): job_type is set from params.PROCESSED_DATA_ARTIFACT (an
# artifact name), whereas the other runs use a *_JOB_TYPE constant -- confirm
# this is intended.
run = wandb.init(project=params.WANDB_PROJECT,
                 entity=None,
                 job_type=params.PROCESSED_DATA_ARTIFACT,
                 name='processed_data')

# Declare the raw-data artifact as an input of this run (records lineage).
raw_data_at = run.use_artifact(f'{params.RAW_DATA_ARTIFACT}:latest')

# Reuse the in-memory frame when the raw-data cell has been run; otherwise
# fall back to the artifact. Checking for the name first avoids a NameError
# on `df.empty` when this cell is the entry point of a fresh kernel.
if 'df' not in globals() or df.empty:
    # Download the artifact contents and load the CSV it contains.
    path = raw_data_at.download()
    df = pd.read_csv(path + '/goodreads_train.csv')

Let's next count the number of reviews of a given rating.

In [None]:
# Number of reviews per star rating (displayed as the cell output).
df['rating'].value_counts()

In [None]:
# Histogram of ratings; the bins are centred on the integer ratings 0..5.
hist_plot = df['rating'].hist(bins=np.arange(7)-.5, rwidth=.5)

# Axis labels.
plt.xlabel('rating')
plt.ylabel('frequency')

# Avoid clipped labels.
plt.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown, pyplot starts a new
# empty current figure, so saving afterwards writes a blank image.
plt.savefig('Rating_Frequencies')

# Render the figure in the notebook.
plt.show()

# Log the histogram to W&B under the key "Rating Frequencies".
wandb.log({"Rating Frequencies": hist_plot})

Below we print the total number of reviews (90k), the number of unique books (25k), and the number of unique users (12k).

In [None]:
# (rows, columns) of the review table, then the distinct book and user counts.
print(df.shape)
for id_column in ('book_id', 'user_id'):
    print(df[id_column].nunique())

In [None]:
# Normalise review text: lowercase and collapse every run of whitespace into a
# single space (split()/join also trims leading and trailing whitespace).
df['review_text'] = df['review_text'].map(lambda text: ' '.join(text.lower().split()))

# The 100 most frequent review texts with their counts.
# rename_axis()/reset_index(name=...) name both columns explicitly. The old
# rename mapping relied on reset_index() producing an 'index' column, which is
# no longer the case from pandas 2.0 on (value_counts names the index after
# the column and the values 'count'), so the text ended up labelled 'Frequency'.
common_reviews = (
    df['review_text']
    .value_counts()
    .head(100)
    .rename_axis('review_text')
    .reset_index(name='Frequency')
)

# Log the common reviews as a wandb.Table with the key 'Common Reviews'.
wandb.log({'Common Reviews': wandb.Table(dataframe=common_reviews)})

# Drop repeated review texts, keeping the first occurrence, and report the
# size before and after.
print(f'Size of training set before removal of duplicate reviews: {len(df)}')
df = df.drop_duplicates(subset=['review_text'], keep='first')
print(f'Size of training set after removal of duplicate reviews: {len(df)}')

In [None]:
# Random under-sampling from imbalanced-learn.
import imblearn
from imblearn.under_sampling import RandomUnderSampler

# Seeded sampler so the same rows are selected on every run.
undersample = RandomUnderSampler(random_state=42)

# Balance the rating classes. fit_resample returns the resampled features
# and the matching labels separately.
df, y_bal = undersample.fit_resample(df.drop(columns=['rating']), df['rating'])

# Re-attach the labels, then drop the temporary name.
df['rating'] = y_bal
del y_bal

# Shuffle the rows and renumber the index. The shuffle is now seeded: the
# previous np.random.permutation call had no seed, so the row order (and every
# split derived from it) changed on each run even though the sampler and the
# later splitters are seeded.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Histogram of the ratings after down-sampling.
hist_plot_downsampled = df['rating'].hist(bins=np.arange(7)-.5, rwidth=.5)
plt.xlabel('rating')
plt.ylabel('frequency')
plt.tight_layout()
plt.show()

# Log the histogram as 'Frequency of Ratings After Downsampling' in the wandb project
wandb.log({'Frequency of Ratings After Downsampling': hist_plot_downsampled})

Count the number of unique books (23k) and user ids (11k) after undersampling. 

In [None]:
# Distinct books and users remaining after under-sampling, then the row count.
for id_column in ('book_id', 'user_id'):
    print(df[id_column].nunique())
print(len(df))

In [None]:
# Character count of every review.
df['full_length'] = df['review_text'].map(len)

# Average number of characters per whitespace-separated word in every review.
df['mean_word_length'] = df['review_text'].map(
    lambda review: np.mean([len(word) for word in review.split()])
)

# Version the processed-data folder as a W&B dataset artifact.
# NOTE(review): nothing in this cell writes `df` into
# params.PROCESSED_DATA_FOLDER -- confirm the folder is populated elsewhere
# before it is logged.
processed_data_art = wandb.Artifact(params.PROCESSED_DATA_ARTIFACT, type=params.DATASET_TYPE)
processed_data_art.add_dir(params.PROCESSED_DATA_FOLDER)

# Upload the artifact and close the run.
run.log_artifact(processed_data_art)
run.finish()

## Splitting Data

In [None]:
# Start a new wandb run for splitting data into train / validation / test.

run = wandb.init(project=params.WANDB_PROJECT, job_type=params.SPLIT_DATA_JOB_TYPE,name='split_data')
# Declare the processed-data artifact as an input of this run; the returned
# artifact handle is not stored, it is only shown as the cell output.
run.use_artifact(f'{params.PROCESSED_DATA_ARTIFACT}:latest')

In [None]:
# GroupShuffleSplit was used here without ever being imported in the notebook.
from sklearn.model_selection import GroupShuffleSplit

# Splitters that keep all reviews of a book on the same side of a split.
# Only the first split of each is consumed, so a single split is requested.
# 80% train+valid / 20% test:
gs_test = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)
# 75% train / 25% valid (of the remaining data):
gs_valid = GroupShuffleSplit(n_splits=1, train_size=.75, random_state=43)

# split() yields POSITIONAL indices, so select rows with .iloc; .loc only
# happened to work because the index had just been reset to 0..n-1.
train_idx, test_idx = next(gs_test.split(df, groups=df.book_id))
train_df = df.iloc[train_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

# Carve the validation set out of the training portion.
train_idx, valid_idx = next(gs_valid.split(train_df, groups=train_df.book_id))
valid_df = train_df.iloc[valid_idx].reset_index(drop=True)
train_df = train_df.iloc[train_idx].reset_index(drop=True)

# Sanity checks: no rows lost, and no book appears in more than one split.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)
assert set(train_df.book_id).isdisjoint(valid_df.book_id)
assert set(train_df.book_id).isdisjoint(test_df.book_id)
assert set(valid_df.book_id).isdisjoint(test_df.book_id)

In [None]:
# Share of the full (down-sampled) dataset that ended up in each split,
# computed as len(split) / len(df) and printed in train / test / valid order.
train_proportion, test_proportion, valid_proportion = (
    len(split_frame) / len(df) for split_frame in (train_df, test_df, valid_df)
)

for split_share in (train_proportion, test_proportion, valid_proportion):
    print(split_share)


In [None]:
# Tokenizer matching the pretrained checkpoint named in params.MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(params.MODEL_NAME)

def tokenize_batch(batch):
    """
    Tokenizes a batch of review texts.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here. Padding inside a batched ``Dataset.map`` pads every example
    to the longest review of its map batch, which inflates the saved dataset
    and the cost of every training step; the training cells already use
    ``DataCollatorWithPadding``, which pads each mini-batch dynamically.

    Parameters:
    -----------
        batch (dict): A batch of examples; the text is read from
            ``batch['review_text']``.

    Returns:
    --------
        tokenized_batch (dict): The tokenizer output for the batch.

    """
    tokenized_batch = tokenizer(batch['review_text'], truncation=True)
    return tokenized_batch

In [None]:
# Convert each split to a Hugging Face Dataset and tokenize it in batches.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_frame).map(tokenize_batch, batched=True)
    for split_frame in (train_df, valid_df, test_df)
)

In [None]:
# (dataset, on-disk folder, artifact name) for each split, in
# train / valid / test order.
split_exports = (
    (train_dataset, params.TRAIN_DATA_FOLDER, params.TRAIN_DATA_ARTIFACT),
    (valid_dataset, params.VALID_DATA_FOLDER, params.VALID_DATA_ARTIFACT),
    (test_dataset, params.TEST_DATA_FOLDER, params.TEST_DATA_ARTIFACT),
)

# Write every tokenized split to its folder.
for split_dataset, split_folder, split_artifact_name in split_exports:
    split_dataset.save_to_disk(split_folder)

# One W&B dataset artifact per split.
train_data_art, valid_data_art, test_data_art = (
    wandb.Artifact(split_artifact_name, type=params.DATASET_TYPE)
    for split_dataset, split_folder, split_artifact_name in split_exports
)
split_artifacts = (train_data_art, valid_data_art, test_data_art)

# Attach each saved folder to its artifact.
for split_artifact, split_export in zip(split_artifacts, split_exports):
    split_artifact.add_dir(split_export[1])

# Upload the artifacts, then close the run.
for split_artifact in split_artifacts:
    run.log_artifact(split_artifact)

run.finish()


# Baseline Model

In this section we will train two models, DistilBERT and BERT-tiny, and measure how well they classify the Goodreads dataset.

## Distilbert

In [None]:
# Open the W&B run that tracks training of the baseline model.
run = wandb.init(project=params.WANDB_PROJECT, job_type=params.MODEL_TRAINING_JOB_TYPE,name='baseline_training_batch')

In [None]:
# Tokenizer for the checkpoint in params.MODEL_NAME, plus a collator that pads
# each mini-batch with that tokenizer.
tokenizer = AutoTokenizer.from_pretrained(params.MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_batch(batch):
    """Tokenize ``batch['review_text']`` with padding and truncation enabled."""
    return tokenizer(batch['review_text'], padding=True, truncation=True)

# Tokenized splits written by the data-splitting section.
# NOTE(review): these paths are hard-coded while the splits are saved to
# params.TRAIN_DATA_FOLDER / params.VALID_DATA_FOLDER -- confirm they match.
train_dataset = load_from_disk('./data/train_data')
valid_dataset = load_from_disk('./data/valid_data')

# Keep only the token ids, the attention mask and the rating.
drop_cols = [
    col for col in train_dataset.features
    if col not in ('input_ids', 'attention_mask', 'rating')
]

# Drop the extra columns and expose the rating under the name 'labels',
# which is the column name the Trainer feeds to the model as the target.
train_dataset = train_dataset.remove_columns(drop_cols).rename_column('rating', 'labels')
valid_dataset = valid_dataset.remove_columns(drop_cols).rename_column('rating', 'labels')

# Return PyTorch tensors for the model inputs.
model_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format('torch', columns=model_columns)
valid_dataset.set_format('torch', columns=model_columns)

# Pretrained encoder with a classification head of params.NUM_CLASSES outputs.
model = AutoModelForSequenceClassification.from_pretrained(params.MODEL_NAME, num_labels=params.NUM_CLASSES)

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


In [None]:
# Metric objects from the `evaluate` library. Only accuracy is used by
# compute_metrics below; the other three are loaded but not referenced there.
acc_metric, f1_metric, recall_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'f1', 'recall', 'precision')
)

def compute_metrics(eval_pred):
    """
    Score a set of predictions with the accuracy metric.

    Args:
        eval_pred (tuple): Pair of (logits, labels).

    Returns:
        dict: Result of ``acc_metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    return acc_metric.compute(predictions=np.argmax(logits, axis=-1),
                              references=labels)

# Training configuration. Checkpoints and logs go under
# params.MODEL_DATA_FOLDER; metrics are reported to W&B. No eval_steps is set,
# so the evaluation interval is left to the library default for the 'steps'
# strategy.
training_args = TrainingArguments(
    output_dir=params.MODEL_DATA_FOLDER,
    num_train_epochs=3,
    per_device_train_batch_size=params.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=params.VALID_BATCH_SIZE,
    warmup_steps=params.WARMUP_STEPS,
    fp16=params.FP16,
    learning_rate=float(params.LEARNING_RATE),
    logging_dir=f"{params.MODEL_DATA_FOLDER}/logs",
    logging_steps=100,
    evaluation_strategy='steps',
    save_steps=2000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='wandb',
    push_to_hub=params.PUSH_TO_HUB,
    hub_strategy=params.HUB_STRATEGY,
    hub_model_id=params.HUB_MODEL_ID
)

# Trainer with early stopping: training stops after 3 evaluations without an
# improvement in the tracked metric.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics
)

# Train the model.
trainer.train()

# Evaluate on the validation set. This call was missing, so `outputs` below
# was either undefined (NameError on a fresh kernel) or a stale value left
# over from a different run.
outputs = trainer.evaluate()

# Save the trained model.
trainer.save_model("./"+params.MODEL_DATA_FOLDER)

# Log the validation metrics to W&B as a one-row table.
run.log({"Performance-data": wandb.Table(dataframe=pd.DataFrame(outputs,
                                                                index=["Performance"]))})

# Push the model and tokenizer to the Hugging Face Hub.
model.push_to_hub(params.HUB_MODEL_ID)
tokenizer.push_to_hub(params.HUB_MODEL_ID)

In [None]:
# Client for the Hugging Face Hub API.
from huggingface_hub import HfApi

# Look up the account that is currently logged in.
hf_api = HfApi()
user = hf_api.whoami()

# Hub repository id in "<user name>/<model id>" form.
hub_id = f"{user['name']}/{params.HUB_MODEL_ID}"

# The artifact carries no files; it only records where the model lives on the
# Hub through its metadata.
trained_model_art = wandb.Artifact(params.HUB_MODEL_ID, type=params.MODEL_TYPE)
trained_model_art.metadata = {"hub_id": hub_id}

# Log the artifact and close the run.
run.log_artifact(trained_model_art)
run.finish()

## Bert Tiny

In [None]:
# Open the W&B run that tracks training of the BERT-tiny baseline.
run = wandb.init(project=params.WANDB_PROJECT, job_type=params.MODEL_TRAINING_JOB_TYPE,name='baseline_bert_tiny')

In [None]:
# Tokenizer for 'prajjwal1/bert-tiny' and a collator that pads each mini-batch
# with it.
# NOTE(review): the datasets loaded below were tokenized with the
# params.MODEL_NAME tokenizer -- confirm both checkpoints share a vocabulary.
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_batch(batch):
    """
    Tokenize ``batch['review_text']`` with padding and truncation enabled.

    Args:
        batch (dict): A batch of examples holding the review texts.

    Returns:
        dict: The tokenizer output for the batch.
    """
    return tokenizer(batch['review_text'], padding=True,
                     truncation=True)

# Tokenized splits written by the data-splitting section.
train_dataset = load_from_disk('./data/train_data')
valid_dataset = load_from_disk('./data/valid_data')

# Keep only the token ids, the attention mask and the rating.
drop_cols = [
    col for col in train_dataset.features
    if col not in ('input_ids', 'attention_mask', 'rating')
]

# Drop the extra columns and expose the rating under the name 'labels'.
train_dataset = train_dataset.remove_columns(drop_cols).rename_column('rating', 'labels')
valid_dataset = valid_dataset.remove_columns(drop_cols).rename_column('rating', 'labels')

# Return PyTorch tensors for the model inputs.
model_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format('torch', columns=model_columns)
valid_dataset.set_format('torch', columns=model_columns)

# BERT-tiny encoder with a classification head of params.NUM_CLASSES outputs.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny',
                                                           num_labels=params.NUM_CLASSES)

# Move the model to the device selected earlier in the notebook.
model.to(device)

In [None]:
# Metric objects from the `evaluate` library. Only accuracy is used by
# compute_metrics below; the other three are loaded but not referenced there.
acc_metric, f1_metric, recall_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'f1', 'recall', 'precision')
)

def compute_metrics(eval_pred):
    """
    Score a set of predictions with the accuracy metric.

    Args:
        eval_pred (tuple): Pair of (logits, labels).

    Returns:
        dict: Result of ``acc_metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    return acc_metric.compute(predictions=np.argmax(logits, axis=-1),
                              references=labels)

# Local output folder and Hub repository used throughout this cell.
TINYBERT_OUTPUT_DIR = 'tinybert-goodreads-model'
TINYBERT_HUB_REPO = 'dhmeltzer/tinybert-goodreads-wandb'

# Training configuration for BERT-tiny: 6 epochs, metrics reported to W&B,
# best checkpoint (by accuracy) restored at the end. Unlike the DistilBERT
# run, nothing is pushed to the Hub by the Trainer itself.
training_args = TrainingArguments(
    output_dir=TINYBERT_OUTPUT_DIR,
    logging_dir=f"{TINYBERT_OUTPUT_DIR}/logs",
    num_train_epochs=6,
    per_device_train_batch_size=params.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=params.VALID_BATCH_SIZE,
    learning_rate=float(params.LEARNING_RATE),
    warmup_steps=params.WARMUP_STEPS,
    fp16=params.FP16,
    logging_steps=1000,
    evaluation_strategy='steps',
    save_steps=2000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='wandb',
)

# Trainer without early stopping.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train, then score the model on the validation set.
trainer.train()
outputs = trainer.evaluate()

# Persist the trained model locally.
trainer.save_model(f"./{TINYBERT_OUTPUT_DIR}")

# Log the validation metrics to W&B as a one-row table.
performance_table = wandb.Table(dataframe=pd.DataFrame(outputs, index=["Performance"]))
run.log({"Performance-data": performance_table})

# Publish the model and tokenizer to the Hugging Face Hub.
model.push_to_hub(TINYBERT_HUB_REPO)
tokenizer.push_to_hub(TINYBERT_HUB_REPO)

In [None]:
# Client for the Hugging Face Hub API.
from huggingface_hub import HfApi

# Look up the account that is currently logged in.
hf_api = HfApi()
user = hf_api.whoami()

# Name shared by the W&B artifact and the Hub repository of this model.
tinybert_model_name = 'tinybert-goodreads-wandb'

# Create a WandB artifact for the trained model.
trained_model_art = wandb.Artifact(tinybert_model_name, type=params.MODEL_TYPE)

# Hub id of the BERT-tiny model. The previous value pointed at
# "distilbert-goodreads-wandb" (copied from the DistilBERT section), so the
# artifact metadata referenced the wrong model.
hub_id = f"{user['name']}/{tinybert_model_name}"

# Record the Hub location in the artifact metadata.
trained_model_art.metadata = {"hub_id": hub_id}

# Log the artifact and close the run.
run.log_artifact(trained_model_art)
run.finish()

# Extra EDA

In this section we will continue to perform exploratory data analysis in order to get a better understanding of the dataset.

In [None]:
# Import necessary libraries
import re  # Regular expression module for string manipulation
import string  # Module for handling string operations
import os  # Module for interacting with the operating system
import seaborn as sns  # Data visualization library based on Matplotlib
from wordcloud import WordCloud, STOPWORDS  # Word cloud generation library
import nltk  # Natural Language Toolkit for NLP tasks
from nltk.tokenize import word_tokenize, sent_tokenize  # Tokenization functions
from nltk.corpus import stopwords, inaugural  # Stopwords corpus for text cleaning

from textblob import TextBlob  # Library for processing textual data

from nltk.corpus import wordnet  # Lexical database for English
from nltk import pos_tag  # Part-of-speech tagging
from nltk.stem import WordNetLemmatizer  # Word lemmatization
from nltk.stem import PorterStemmer  # Word stemming

nltk.download('stopwords')  # Download stopwords data
from nltk.stem import PorterStemmer  # Stemming module

from nltk.util import ngrams  # Function for creating n-grams
from sklearn.feature_extraction.text import CountVectorizer  # Text vectorization method

run = wandb.init(project=params.WANDB_PROJECT,entity=None, job_type='validating_split')

In [None]:
# Fetch the newest training-split artifact (this also records it as an input
# of the current run) and download its files locally.
latest_train_artifact = '{}:latest'.format(params.TRAIN_DATA_ARTIFACT)
train_data_art = run.use_artifact(latest_train_artifact)
train_path = train_data_art.download()

# Re-open the saved Hugging Face dataset and materialise it as a DataFrame
# for the analysis cells below.
train_dataset = load_from_disk(train_path)
df = pd.DataFrame(train_dataset)

In [None]:
# Rating histogram for the training split; the bins are centred on the
# integer ratings 0..5.
rating_bin_edges = np.arange(7) - .5
hist_plot = df['rating'].hist(bins=rating_bin_edges, rwidth=.5)

# Label the axes through the returned Axes object.
hist_plot.set_xlabel('rating')
hist_plot.set_ylabel('frequency')

# Tidy the layout and render the figure.
hist_plot.figure.tight_layout()
plt.show()

In [None]:
# Spearman rank correlation between rating, review length and mean word length.
corr_plot = df[['rating','full_length','mean_word_length']].corr(method='spearman')

# Log the matrix as an explicit wandb.Table. The row labels live in the
# DataFrame index, which a table built from the frame's columns does not
# carry, so they are moved into a regular 'feature' column first.
wandb.log({'Correlation Plot': wandb.Table(dataframe=corr_plot.rename_axis('feature').reset_index())})

# Display the correlation matrix as the cell output.
corr_plot

In [None]:
import matplotlib.pyplot as plt

# One line per rating: how many reviews have each character length.
fig, ax = plt.subplots(figsize=(8, 6))

for rating_value, rating_group in df.groupby('rating'):
    # Count reviews per length and order the counts by length.
    length_counts = rating_group['full_length'].value_counts().sort_index()
    ax.plot(length_counts, label=str(rating_value))

# Only show lengths between 0 and 400.
ax.set_xlim([0, 400])
ax.set_xlabel('Review Length')
ax.set_ylabel('Number of Reviews')
ax.legend()

plt.tight_layout()

# Log the current figure to W&B before it is shown.
wandb.log({'Number of Reviews vs Length': plt})

plt.show()

In [None]:
# Box plot of review length for every rating present in the data.
fig, ax = plt.subplots(figsize=(8, 6))

# Collect the lengths and the matching rating label group by group, so the
# tick labels always agree with the data (the labels used to be hard-coded to
# 0..5, which fails when a rating is absent from this split).
rating_labels = []
data = []
for label, DF in df.groupby('rating'):
    rating_labels.append(label)
    data.append(list(DF['full_length'].values))

# One box per rating; boxes are drawn at positions 1..n.
ax.boxplot(data, patch_artist=True)
ax.set_xticklabels(rating_labels)

# Set x and y labels
ax.set_xlabel('Rating')
ax.set_ylabel('Review Length')

# Save BEFORE plt.show(): saving afterwards writes an empty figure.
fig.savefig('box_plot_ratings')

# Show the figure
plt.show()

In [None]:
# WordNet-based lemmatizer instance. NOTE(review): `lem` is not referenced in
# the cells visible here -- confirm it is used further down.
lem = WordNetLemmatizer()

# Fetch the WordNet corpus through the NLTK downloader.
nltk.download('wordnet')

# Fetch the Open Multilingual Wordnet data through the NLTK downloader.
nltk.download('omw-1.4')

In [None]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have",
                     "couldna": "could not have"}

# Regular expression for finding contractions.
# - Keys are tried longest first: regex alternation takes the first
#   alternative that matches, so "can't" listed before "can't've" used to win
#   and leave a dangling "'ve" behind.
# - Keys are escaped so they are always matched literally.
# - Matching ignores case, so lowercased text ("i'm") is expanded as well.
contractions_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)
    ),
    flags=re.IGNORECASE,
)

def expand_contractions(text, contractions_dict=contractions_dict):
    """
    Expand contractions in the given text using the provided dictionary.

    Matching is case-insensitive; the replacement text is always the value
    stored in the dictionary. A match that has no entry in the supplied
    dictionary is left unchanged.

    Args:
        text (str): Input text containing contractions.
        contractions_dict (dict, optional): Dictionary mapping contractions to their expanded form. Default is contractions_dict.

    Returns:
        str: Text with expanded contractions.
    """
    # Case-insensitive view of the mapping, needed because the pattern
    # ignores case while the dictionary keys are case-sensitive.
    lookup = {key.lower(): value for key, value in contractions_dict.items()}

    def replace(match):
        found = match.group(0)
        return lookup.get(found.lower(), found)

    return contractions_re.sub(replace, text)

def clean_text(text):
    """
    Strip markup and punctuation from a piece of text.

    Steps, in order:
    - remove HTML tags
    - remove backslashes and double quotes
    - remove non-ASCII characters
    - remove URLs (any run of non-whitespace starting with "http")
    - replace punctuation, tabs and newlines with spaces
    - collapse repeated whitespace

    The case of the text is left unchanged.

    Args:
        text (str): Input text to be pre-processed.

    Returns:
        str: The cleaned text, with words separated by single spaces.
    """
    # Patterns whose matches are deleted outright; the order matters.
    removal_patterns = (
        r'<.*?>',           # HTML tags
        r"\\",              # backslashes
        r"\"",              # double quotes
        r'[^\x00-\x7f]',    # non-ASCII characters
        r"http\S+",         # URLs
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Every character listed here becomes a single space.
    punctuation = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    to_spaces = str.maketrans(punctuation, ' ' * len(punctuation))

    # split()/join collapses the resulting runs of whitespace.
    return ' '.join(text.translate(to_spaces).split())

# Words and symbols that carry no signal for the analysis: NLTK's English
# stop words, every punctuation character, and a few extra tokens.
useless_words = nltk.corpus.stopwords.words("english") + list(string.punctuation) + ['``',"''",'...','a',"--","***",'http',"**"]

def review_tokenize(text):
    """
    Remove 'useless' words from the text and stem the remaining words.

    The previous version iterated over the *characters* of the filtered
    string when stemming, so no word was ever stemmed and the function only
    returned the filtered text.

    NOTE(review): the output feeds TextBlob polarity scoring further down;
    confirm that scoring stemmed words is the intended behaviour.

    Args:
        text (str): Input text to be tokenized and filtered.

    Returns:
        str: Space-separated stems of the words that survived filtering.
    """
    stem = PorterStemmer()
    kept_words = [w for w in text.split() if w not in useless_words]
    return ' '.join(stem.stem(w) for w in kept_words)

def tokenize_filtered(text):
    """
    Run the complete review pre-processing pipeline on one value.

    The value is converted to `str`, runs of digits are removed, and the
    result is passed through `clean_text`, `expand_contractions` and
    `review_tokenize`, in that order.

    Args:
        text (str): Input text to be pre-processed.

    Returns:
        str: Cleaned and processed text.
    """
    # NOTE(review): clean_text replaces apostrophes with spaces before
    # expand_contractions runs, so contractions may no longer be matchable
    # at that point -- confirm the intended order against contractions_dict.
    processed = re.sub(r'\d+', '', str(text))
    for step in (clean_text, expand_contractions, review_tokenize):
        processed = step(processed)
    return processed


In [None]:
# Run the pre-processing pipeline over every entry of 'review_text' and store the result in 'tok_review'.
df['tok_review'] = df['review_text'].map(tokenize_filtered)

# Score each processed review with TextBlob's sentiment analysis.
# 'polarity' ranges from -1 (negative sentiment) to 1 (positive sentiment).
df['polarity'] = df['tok_review'].map(lambda review: TextBlob(review).sentiment.polarity)

In [None]:
# Box plot (seaborn) of the 'polarity' distribution within each 'rating' value.
sns.boxplot(x='rating', y='polarity', data=df)

# Write the current figure to the 'figures' directory under the name 'sns_plot_new'.
plt.savefig('figures/sns_plot_new')

In [None]:
# Number of reviews per book: one entry for each 'book_id' group.
counts = [len(book_reviews) for _, book_reviews in df.groupby(by='book_id')]

# Histogram of the per-book review counts: 20 bins over the range 0-40.
plt.hist(sorted(counts, reverse=True), bins=20, range=[0, 40])

# Axis labels.
plt.xlabel('Number of Reviews')
plt.ylabel('Frequency')

# Send the current pyplot figure to wandb (requires a configured, active run).
wandb.log({'Frequency of Number of Reviews': plt})

In [None]:
# Join every 'tok_review' value into one space-separated string.
r = ' '.join(df["tok_review"].tolist())

# Build the word cloud: 400x400 canvas, at most 200 words, "Dark2" colormap.
wordcloud = WordCloud(
    width=400,
    height=400,
    max_words=200,
    colormap="Dark2",
).generate(r)

# Render the cloud on a 10x8 figure with bilinear interpolation.
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')

# Hide the axes and tighten the layout.
plt.axis("off")
plt.tight_layout()

In [None]:
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer

# Map each 'rating' value to the list of processed reviews that received it.
corpus = {
    rating: list(rating_reviews['tok_review'])
    for rating, rating_reviews in df.groupby('rating')
}

def get_top_ngram(corpus, n=None, top_k=10):
    """
    Get the most frequent n-grams from a corpus of text.

    Args:
        corpus (list): List of text documents.
        n (int, optional): The order of n-grams to consider (e.g., 1 for
            unigrams, 2 for bigrams). None is treated as 1. Default is None.
        top_k (int, optional): Maximum number of n-grams to return; None
            returns all of them. Default is 10.

    Returns:
        list: (n-gram, frequency) tuples sorted by descending frequency,
        truncated to `top_k` entries.
    """
    # ngram_range needs integers; (None, None) is not a valid range, so fall
    # back to unigrams when no order is given.
    if n is None:
        n = 1

    # Create a vectorizer restricted to n-grams of exactly this order.
    vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)

    # Transform the corpus into a bag-of-words (document x n-gram) matrix.
    bag_of_words = vec.transform(corpus)

    # Sum over documents to get one total per n-gram (a 1 x vocabulary row).
    sum_words = bag_of_words.sum(axis=0)

    # Pair every n-gram with its total, looked up by its vocabulary column.
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]

    # Sort the list by frequency in descending order.
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    # Keep only the requested number of leading entries.
    return words_freq[:top_k]

In [None]:
# Loop through ratings from 0 to 5.
for i in range(0, 6):
    # Create a new figure and axis for each rating.
    fig, ax = plt.subplots()

    # Most frequent bigrams for the current rating. get_top_ngram currently
    # returns at most 10 entries, so the [:20] slice is only an upper bound.
    top_n_bigrams = get_top_ngram(corpus[i], 2)[:20]

    # Split the (n-gram, frequency) pairs into labels (x) and counts (y).
    x, y = map(list, zip(*top_n_bigrams))

    # Horizontal bar plot, drawn explicitly on this iteration's axis.
    sns.barplot(x=y, y=x, ax=ax)

    # Set labels and title for the plot.
    ax.set_xlabel('frequency')
    # ax.set_title(f'Top Bigrams at Rating = {i}')

    # Ensure plot layout is tight.
    fig.tight_layout()

    # Use one path for saving and for logging. The extension is spelled out
    # so the file written is exactly the file wandb.Image reads back.
    out_path = f'./bigrams_rating_{i}.png'
    fig.savefig(out_path)

    # Display the plot.
    plt.show()

    # Log the plot as an image to Weights and Biases.
    wandb.log({f'Bigrams at rating {i}': wandb.Image(out_path)})

    # Release the figure so figures do not pile up across iterations.
    plt.close(fig)

In [None]:
# Loop through ratings from 0 to 5.
for i in range(0, 6):
    # Create a new figure and axis for each rating.
    fig, ax = plt.subplots()

    # Most frequent trigrams for the current rating. get_top_ngram currently
    # returns at most 10 entries, so the [:20] slice is only an upper bound.
    top_n_trigrams = get_top_ngram(corpus[i], 3)[:20]

    # Split the (n-gram, frequency) pairs into labels (x) and counts (y).
    x, y = map(list, zip(*top_n_trigrams))

    # Horizontal bar plot, drawn explicitly on this iteration's axis.
    sns.barplot(x=y, y=x, ax=ax)

    # Set labels and title for the plot.
    ax.set_xlabel('frequency')
    # ax.set_title(f'Top Trigrams at Rating = {i}')

    # Ensure plot layout is tight.
    fig.tight_layout()

    # Use one path for saving and for logging. The extension is spelled out
    # so the file written is exactly the file wandb.Image reads back.
    out_path = f'./trigrams_rating_{i}.png'
    fig.savefig(out_path)

    # Display the plot.
    plt.show()

    # Log the plot as an image to Weights and Biases.
    wandb.log({f'trigrams at rating {i}': wandb.Image(out_path)})

    # Release the figure so figures do not pile up across iterations.
    plt.close(fig)

In [None]:
# End the Weights and Biases run held in `run`.
run.finish()

In [None]:
# Select the 'rating' of every review whose text contains the phrase
# 'in exchange for an honest review'. The match is a literal, case-sensitive
# substring test; na=False makes rows with missing review text count as
# "not containing" instead of raising on a non-string value.
honest_mask = df['review_text'].str.contains(
    'in exchange for an honest review', regex=False, na=False
)
honest_df = df.loc[honest_mask, 'rating']

# Create a histogram plot for the filtered ratings.
hist_plot_honest = honest_df.hist(bins=np.arange(7)-.5, rwidth=.5)

# Set labels for the y-axis and x-axis.
plt.ylabel('Frequency')
plt.xlabel('Rating')

# Ensure plot layout is tight.
plt.tight_layout()

# Display the plot.
plt.show()

# wandb.log needs an active run, and the preceding cell has already called
# run.finish(). Start a run in the same project when none is active.
if wandb.run is None:
    wandb.init(project=params.WANDB_PROJECT, entity=None)

# Log the plot to Weights and Biases with a specific name, then close the run.
wandb.log({"Appearance of 'exchange honest review'": hist_plot_honest})
wandb.finish()

In [None]:
# Open a Weights and Biases run for the 'validating_split' job in the project named by params.WANDB_PROJECT.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=None,
    job_type='validating_split',
)

# Declare the latest version of the processed-data artifact as an input of this run and download it locally.
processed_data_art = run.use_artifact(f'{params.PROCESSED_DATA_ARTIFACT}:latest')
processed_path = processed_data_art.download()

# Read the Hugging Face dataset stored at that path and turn it into the DataFrame 'df'.
processed_dataset = load_from_disk(processed_path)
df = pd.DataFrame(processed_dataset)

# Show the number of rows per user_id.
df['user_id'].value_counts()

In [None]:
# Summary statistics of the number of rows per user_id.
df['user_id'].value_counts().describe()

In [None]:
# Histogram of how many rows each user_id has (value_counts gives one count per user_id).
hist_plot = df['user_id'].value_counts().hist(bins=50)

# The x-axis holds the per-user review counts, not the ids themselves;
# the y-axis holds how many user_ids fall into each bin.
plt.xlabel('number of reviews per user_id')
plt.ylabel('frequency')

# Ensure plot layout is tight.
plt.tight_layout()

# Save through the figure that owns the axes, and do it before plt.show():
# once the figure has been shown, plt.savefig would write a new, empty figure.
hist_plot.figure.savefig('user_id_frequencies')

# Display the plot.
plt.show()

# Log the plot to Weights and Biases with a specific name.
wandb.log({"User ID Frequencies": hist_plot})

In [None]:
# Stratified group k-fold cross-validation splitter.
from sklearn.model_selection import StratifiedGroupKFold

# Number of rows per unique user_id.
user_id_vals = df['user_id'].value_counts()

# user_ids on either side of the five-occurrence threshold.
user_ids_more_than_five = user_id_vals[user_id_vals >= 5].index
user_ids_less_than_five = user_id_vals[user_id_vals < 5].index

# Print how many user_ids have 5 or more occurrences, then how many have fewer.
print(len(user_ids_more_than_five))
print(len(user_ids_less_than_five))

# Rows belonging to each of the two user_id groups.
df_more_than_five = df.loc[df['user_id'].isin(user_ids_more_than_five)]
df_less_than_five = df.loc[df['user_id'].isin(user_ids_less_than_five)]

# Five-fold splitter; book_id is the group key and user_id is the stratification target.
sgkf = StratifiedGroupKFold(n_splits=5)
groups = df_more_than_five['book_id'].to_numpy()
y = df_more_than_five['user_id'].to_numpy()

# Keep only the first train/test split the splitter produces.
train_idxs, test_idxs = next(
    iter(sgkf.split(np.arange(len(groups)), y, groups))
)

# Reload the raw training data from Drive; this replaces 'df'.
train_path = '/content/drive/My Drive/WandB/assignment_1/data/raw_data/goodreads_train.csv'
df = pd.read_csv(train_path)

# Reviews per user_id in the reloaded frame, and the user_ids with 5 or more.
user_review_counts = df['user_id'].value_counts()
more_than_five = user_review_counts[user_review_counts >= 5].index

# Rows written by those user_ids.
df_more_than_5 = df.loc[df['user_id'].isin(more_than_five)]

# Histogram of their ratings, one bar centred on each integer from 0 to 5.
hist_plot = df_more_than_5['rating'].hist(bins=np.arange(7) - .5, rwidth=.5)

# Axis labels, layout and display.
plt.xlabel('rating')
plt.ylabel('frequency')
plt.tight_layout()
plt.show()

In [None]:
# Count how often each value occurs in the 'rating' column of 'df_more_than_5'.
rating_counts = df_more_than_5['rating'].value_counts()

# End the cell with the Series itself so the notebook displays the frequency
# of each rating value; the assignment alone renders no output.
rating_counts

In [None]:
# End the Weights and Biases run held in `run` (the 'validating_split' run opened above).
run.finish()