In [None]:
# Mount Google Drive; the data and cache paths used later in this notebook live under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install emoji langdetect transformers[torch]



In [None]:
# %% Import libraries
import bz2
import numpy as np
import pandas as pd
import re
import nltk
import emoji
from nltk.corpus import stopwords
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import pickle
import os
import torch
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Pin langdetect's seed so language detection gives consistent results
DetectorFactory.seed = 0

# Fetch the NLTK stopword corpus needed by the text-cleaning step
nltk.download('stopwords')


Using device: cuda


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# %% Load and pre-process dataset
def _detect_language(text):
    """Return langdetect's language code for `text`, or "unknown" if detection raises."""
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


def _clean_text(text, stop_words):
    """Reduce one review to lowercase, letters-only, stopword-free text.

    Args:
        text: Raw review string.
        stop_words: Collection of lowercase words to drop.

    Returns:
        The cleaned review as a single space-separated string.
    """
    # Replace emojis with descriptive words
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs. The dot after "www" is escaped: an unescaped '.' matches any
    # character, so "awww great" used to lose both "www" and the following word.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Treat the typographic apostrophe like the ASCII one
    text = text.replace('\u2019', "'")
    # Remove special characters and digits, but keep apostrophes for now so
    # contractions can still be compared with the stopword list in their
    # apostrophe form (stripping them first leaves tokens such as "dont" or
    # "im" that no longer match).
    text = re.sub(r"[^a-zA-Z\s']", '', text)
    # Convert to lowercase
    text = text.lower()
    # Drop quote marks wrapped around a word before the stopword comparison
    candidates = (word.strip("'") for word in text.split())
    # Remove stopwords, then strip the remaining inner apostrophes so the
    # output contains letters and single spaces only.
    kept = [word.replace("'", '') for word in candidates if word and word not in stop_words]
    return ' '.join(kept)


def preprocess_data(sample_size, file_path=None):
    """Load a class-balanced sample of reviews, keep English ones, and clean the text.

    Each input line is split at its first space: the last character of the first
    token is read as the class digit and the remainder is the review text.
    Class 1 is collected as negative and class 2 as positive; reading stops once
    `sample_size // 2` reviews of each class have been gathered.

    Args:
        sample_size: Total number of reviews to collect before language filtering
            (half per class).
        file_path: Path to the bz2-compressed input file. Defaults to the
            notebook's Google Drive location when omitted.

    Returns:
        pandas.DataFrame with columns `label` (0 = negative, 1 = positive) and
        `text` (cleaned review). Rows keep the index they had before the
        non-English reviews were filtered out.
    """
    if file_path is None:
        file_path = r'/content/drive/MyDrive/Colab Notebooks/AmazonReviews/train.ft.txt.bz2'

    per_class = sample_size // 2
    positive_reviews = []
    negative_reviews = []

    with bz2.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            label_token, _, text = line.partition(' ')
            # Skip blank or malformed lines instead of failing on the unpack / int()
            if not text or not label_token[-1:].isdecimal():
                continue
            label = int(label_token[-1])
            if label == 1 and len(negative_reviews) < per_class:
                negative_reviews.append([label, text])
            elif label == 2 and len(positive_reviews) < per_class:
                positive_reviews.append([label, text])
            if len(positive_reviews) == per_class and len(negative_reviews) == per_class:
                break

    # Combine positive and negative reviews
    df = pd.DataFrame(positive_reviews + negative_reviews, columns=['label', 'text'])

    # Filter out non-English reviews; .copy() so the column assignments below
    # operate on an independent frame rather than a slice of the unfiltered one.
    is_english = df['text'].apply(_detect_language) == 'en'
    df = df.loc[is_english].copy()

    # Initialize stopwords
    stop_words = set(stopwords.words('english'))

    # Apply text cleaning to the 'text' column
    df['text'] = df['text'].apply(lambda review: _clean_text(review, stop_words))

    # Shift labels from {1, 2} to {0, 1}
    df['label'] = df['label'] - 1

    return df

SAMPLE_SIZE = 20_000
processed_data_file = r'/content/drive/MyDrive/Colab Notebooks/AmazonReviews/processed_data.pkl'

# Build the cleaned DataFrame once and pickle it; later runs read the pickle back.
# NOTE(review): the cache path does not encode SAMPLE_SIZE, so changing
# SAMPLE_SIZE will not refresh an existing pickle.
if not os.path.exists(processed_data_file):
    df = preprocess_data(SAMPLE_SIZE)
    with open(processed_data_file, 'wb') as cache_out:
        pickle.dump(df, cache_out)
else:
    with open(processed_data_file, 'rb') as cache_in:
        df = pickle.load(cache_in)

print(df.head())


   label                                               text
0      1  stuning even nongamer sound track beautiful pa...
1      1  best soundtrack ever anything im reading lot r...
2      1  amazing soundtrack favorite music time hands i...
3      1  excellent soundtrack truly like soundtrack enj...
4      1  remember pull jaw floor hearing youve played g...


In [None]:
# %% Tokenization with DistilBERT
tokenized_data_file = r'/content/drive/MyDrive/Colab Notebooks/AmazonReviews/tokenized_data.pkl'

# Tokenize and split only when no cached result exists; otherwise read the cache.
if not os.path.exists(tokenized_data_file):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def tokenize_texts(texts, max_length=512):
        """Tokenize a list of strings with padding and truncation, returning PyTorch tensors."""
        return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

    # Pull labels and texts out of the DataFrame
    labels = df['label'].values
    texts = df['text'].tolist()

    # 80/20 train/test split with a fixed random_state
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Encode each split separately
    train_encodings = tokenize_texts(X_train)
    test_encodings = tokenize_texts(X_test)

    # Persist encodings and labels together for later runs
    with open(tokenized_data_file, 'wb') as cache_out:
        pickle.dump((train_encodings, test_encodings, y_train, y_test), cache_out)
else:
    with open(tokenized_data_file, 'rb') as cache_in:
        train_encodings, test_encodings, y_train, y_test = pickle.load(cache_in)


In [None]:
# %% Custom Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per example (tensors or nested lists).
        labels: Indexable collection of labels, one per example.

    Indexing returns a dict holding one tensor per encoding field plus a
    `labels` tensor. Its length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor holding `value`."""
        # Calling torch.tensor() on an existing tensor emits a UserWarning on
        # every item fetch; clone().detach() is the copy torch recommends.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Wrap the train and test encodings, together with their labels, as datasets
train_dataset = CustomDataset(encodings=train_encodings, labels=y_train)
eval_dataset = CustomDataset(encodings=test_encodings, labels=y_test)


In [None]:
# %% Define and train DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)
model_state_file = r'/content/drive/MyDrive/Colab Notebooks/AmazonReviews/distilbert_model_state.pth'

# Train only when no saved fine-tuned weights exist yet
needs_training = not os.path.exists(model_state_file)

if not needs_training:
    print("Loading fine-tuned model from state file...")
    # map_location lets weights saved on a GPU runtime load on a CPU-only one
    model.load_state_dict(torch.load(model_state_file, map_location=device))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define Trainer. It is built on both paths so that `trainer` exists whether the
# weights were just trained or loaded from disk; the evaluation code that calls
# trainer.evaluate() / trainer.predict() would otherwise raise NameError after a
# cached load.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

if needs_training:
    # Train the model
    trainer.train()

    # Save the trained model
    torch.save(model.state_dict(), model_state_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.44
1000,0.3135
1500,0.2115
2000,0.1924
2500,0.0959


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
# %% Evaluate the fine-tuned DistilBERT model
# Run the evaluation pass and print what it returns; a bare call in the middle
# of a cell is not displayed, so its result would otherwise be thrown away.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Get predictions: take the highest-scoring class for each example
predictions = trainer.predict(eval_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Evaluation metrics
print("DistilBERT:")
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


DistilBERT:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1991
           1       0.91      0.90      0.91      2000

    accuracy                           0.91      3991
   macro avg       0.91      0.91      0.91      3991
weighted avg       0.91      0.91      0.91      3991

[[1816  175]
 [ 202 1798]]
