# Piotr Bielecki, Filip Kucia, Adam Czerwoński

# Second Part - more advanced techniques

## TextBlob

In [5]:
from datasets import load_dataset
from textblob import TextBlob
from sklearn.metrics import classification_report
import pandas as pd

# Load the IMDb dataset (train and test split)
train_ds = load_dataset("stanfordnlp/imdb", split='train')
test_ds = load_dataset("stanfordnlp/imdb", split='test')

# Convert to DataFrame for easier manipulation
test_df = pd.DataFrame(test_ds)

# Apply TextBlob sentiment analysis on the test dataset
def get_textblob_sentiment(text):
    blob = TextBlob(text)
    return 1 if blob.sentiment.polarity > 0 else 0

# TextBlob predictions on test data
test_df['textblob_sentiment'] = test_df['text'].apply(get_textblob_sentiment)

# Get the true labels
true_labels = test_df['label']

# Generate classification report for TextBlob
textblob_report = classification_report(true_labels, test_df['textblob_sentiment'], target_names=['Negative', 'Positive'])

# Output the results
print("TextBlob Classification Report:\n", textblob_report)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

TextBlob Classification Report:
               precision    recall  f1-score   support

    Negative       0.89      0.43      0.59     12500
    Positive       0.63      0.95      0.75     12500

    accuracy                           0.69     25000
   macro avg       0.76      0.69      0.67     25000
weighted avg       0.76      0.69      0.67     25000



In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:0

## TDIFD and Multinomial Naive Bayes

In [6]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import pandas as pd

# Convert to DataFrame for easier manipulation
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)

# Create a pipeline with TF-IDF vectorization and Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the Naive Bayes model
model.fit(train_df['text'], train_df['label'])

# Make predictions on the test set
test_predictions = model.predict(test_df['text'])

# Generate the classification report
nb_report = classification_report(test_df['label'], test_predictions, target_names=['Negative', 'Positive'])

# Output the results
print("Naive Bayes Classification Report:\n", nb_report)

Naive Bayes Classification Report:
               precision    recall  f1-score   support

    Negative       0.79      0.89      0.84     12500
    Positive       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



## Fasttext

In [10]:
from datasets import load_dataset
from sklearn.metrics import classification_report
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the IMDb dataset (train and test split)
train_ds = load_dataset("stanfordnlp/imdb", split='train')
test_ds = load_dataset("stanfordnlp/imdb", split='test')

# Convert to DataFrames
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)

# Split train data into training and validation sets
train_df, valid_df = train_test_split(
    train_df, test_size=0.1, random_state=42, stratify=train_df['label']
)

# Prepare data for FastText
def prepare_fasttext_data(df, path):
    df_fasttext = df[['text', 'label']].copy()
    df_fasttext['label'] = df_fasttext['label'].apply(lambda x: f'__label__{x}')
    df_fasttext[['label', 'text']].to_csv(
        path, index=False, header=False, sep=' ', quoting=3, escapechar='\\'
    )

# Save the training data
fasttext_train_data_path = "fasttext_train_data.txt"
prepare_fasttext_data(train_df, fasttext_train_data_path)

# Save the validation data
fasttext_valid_data_path = "fasttext_valid_data.txt"
prepare_fasttext_data(valid_df, fasttext_valid_data_path)

# Train the FastText model with autotune to prevent overfitting
fasttext_model = fasttext.train_supervised(
    input=fasttext_train_data_path,
    autotuneValidationFile=fasttext_valid_data_path,
    autotuneDuration=300  # Adjust the duration as needed
)

# FastText predictions on the test data
test_df['fasttext_sentiment'] = test_df['text'].apply(
    lambda x: int(fasttext_model.predict(x)[0][0].replace('__label__', ''))
)

# Get the true labels
true_labels = test_df['label']

# Generate classification report
fasttext_report = classification_report(
    true_labels, test_df['fasttext_sentiment'], target_names=['Negative', 'Positive']
)

# Output the results
print("\nFastText Classification Report:\n", fasttext_report)



FastText Classification Report:
               precision    recall  f1-score   support

    Negative       0.60      0.60      0.60     12500
    Positive       0.60      0.60      0.60     12500

    accuracy                           0.60     25000
   macro avg       0.60      0.60      0.60     25000
weighted avg       0.60      0.60      0.60     25000



# FastText cleansed

In [1]:
# Required imports
from datasets import load_dataset
from sklearn.metrics import classification_report
import fasttext
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import tempfile

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Text preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]

    # Rejoin tokens to create the cleaned text
    return ' '.join(tokens)

# Load the IMDb dataset (train and test split)
train_ds = load_dataset("stanfordnlp/imdb", split='train')
test_ds = load_dataset("stanfordnlp/imdb", split='test')

# Convert to DataFrames
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)

# Apply text preprocessing to train and test datasets
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# Prepare data for FastText format
def prepare_fasttext_format(df):
    return df[['text', 'label']].apply(
        lambda row: f"__label__{row['label']} {row['text']}", axis=1
    ).tolist()

# Prepare training data in FastText format
train_fasttext_data = prepare_fasttext_format(train_df)

# Write the training data to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_train_file:
    temp_train_file_name = temp_train_file.name
    with open(temp_train_file_name, 'w') as f:
        for line in train_fasttext_data:
            f.write(f"{line}\n")

# Train the FastText model using the training dataset
fasttext_model = fasttext.train_supervised(
    input=temp_train_file_name,  # Provide the temporary file path
    lr=0.1,                     # Learning rate
    epoch=25,                   # Number of epochs
    wordNgrams=2,               # Use bigrams
    dim=100,                    # Embedding size
    minCount=5                  # Ignore words that appear less than 5 times
)

# FastText predictions on the test data
test_df['fasttext_sentiment'] = test_df['text'].apply(
    lambda x: int(fasttext_model.predict(x)[0][0].replace('__label__', ''))
)

# Get the true labels
true_labels = test_df['label']

# Generate and print the classification report
fasttext_report = classification_report(
    true_labels, test_df['fasttext_sentiment'], target_names=['Negative', 'Positive']
)

print("\nFastText Classification Report:\n", fasttext_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



FastText Classification Report:
               precision    recall  f1-score   support

    Negative       0.92      0.76      0.83     12500
    Positive       0.79      0.93      0.86     12500

    accuracy                           0.84     25000
   macro avg       0.86      0.84      0.84     25000
weighted avg       0.86      0.84      0.84     25000



## Transformer Based