In [None]:
import os
import random
import sys

# Make the parent of the working directory importable (the notebook later
# imports from `src.*`). Assumes the notebook is launched from a direct
# sub-directory of the repository root — TODO confirm.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Guarded so that re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Fraction of each split that is sampled into the development subset.
DEV_SET_FRAC = 0.001
# When True, the train/test sets are replaced by their development subsets.
OVERWRITE_SETS_WITH_DEV = True
# Exported through the environment for Weights & Biases.
os.environ["WANDB_PROJECT"]="lang-based-yappers/amazon_sentiment_analysis"
# Seed consumed by `set_seed()`.
SEED = 1337

In [None]:
import random

import numpy as np
import torch

# With deterministic algorithms enforced, cuBLAS on CUDA >= 10.2 raises a
# RuntimeError for matrix multiplications unless this workspace setting is
# present. It has to be in place before the first CUDA call; `setdefault`
# keeps any value that was already exported.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
torch.use_deterministic_algorithms(True)

# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('Device:', device)

def set_seed(seed=None):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed to apply. When omitted, the notebook-level ``SEED``
            constant is used, so the original zero-argument call keeps working.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seeds every visible GPU, which includes the current device.
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed all RNGs once, before the sampling and model cells that follow.
set_seed()

In [None]:
from datasets import load_dataset

# Load the "amazon_polarity" dataset through `datasets.load_dataset`.
dataset = load_dataset("amazon_polarity")
# Bare expression so the notebook renders the loaded object.
dataset

In [None]:
# Pull the two splits out of the loaded dataset ...
train_ds = dataset['train']
test_ds = dataset['test']

# ... and mirror each of them as a pandas DataFrame.
train_df = train_ds.to_pandas()
test_df = test_ds.to_pandas()

In [None]:
# Draw a reproducible random development subset of each frame, using the
# notebook-wide seed rather than a second hard-coded copy of it.
train_df_dev = train_df.sample(frac=DEV_SET_FRAC, random_state=SEED)
test_df_dev = test_df.sample(frac=DEV_SET_FRAC, random_state=SEED)

if OVERWRITE_SETS_WITH_DEV:
    # The frames were produced by `to_pandas()` on the datasets, so row
    # positions line up. Select exactly the sampled rows (by position) so the
    # datasets keep matching the frames instead of holding the first N rows.
    train_ds = train_ds.select(train_df.index.get_indexer(train_df_dev.index).tolist())
    test_ds = test_ds.select(test_df.index.get_indexer(test_df_dev.index).tolist())

    train_df = train_df_dev
    test_df = test_df_dev

y_train, y_test = train_df['label'], test_df['label']
# Report the frames that downstream cells actually use.
print(f"Training data shape: {train_df.shape}, Testing data shape: {test_df.shape}")

# Embedding

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def fit_tf_idf_vectorizer(train_df, test_df):
    """Turn the ``content`` column of both frames into TF-IDF features.

    The vectorizer drops English stop words and keeps at most 10,000 terms.
    It is fitted on the training frame only and then applied to the test
    frame.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
    train_features = tfidf.fit_transform(train_df['content'])
    test_features = tfidf.transform(test_df['content'])
    return train_features, test_features


X_train_tfidf, X_test_tfidf = fit_tf_idf_vectorizer(train_df, test_df)

## BERT with Average Pooling

In [None]:
from transformers import AutoTokenizer, BertModel
import torch

# Reuse the local Hugging Face cache instead of forcing a full re-download of
# the tokenizer and weights on every run.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Bound to a dedicated name: a later cell rebinds the generic name `model` to
# a SentenceTransformer, which would break the helpers below if they relied
# on that global.
bert_model = BertModel.from_pretrained("google-bert/bert-base-uncased")
bert_model.to(device)


def get_bert_embedding(text):
    """Return BERT's last hidden states for ``text``.

    Args:
        text: Input passed to the tokenizer. Inputs longer than the
            tokenizer's maximum length are truncated; without truncation the
            model fails on sequences beyond its position-embedding limit.

    Returns:
        The ``last_hidden_state`` tensor of the model output, computed
        without gradient tracking on ``device``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        return bert_model(**inputs).last_hidden_state


def get_sentence_embedding(text):
    """Average the token embeddings of ``text`` into one vector.

    Returns:
        The mean of the last hidden states over the token axis (dim 1), with
        the leading batch dimension removed when it has size one.
    """
    # `get_bert_embedding` already runs under `torch.no_grad()`.
    last_hidden_states = get_bert_embedding(text)
    return torch.mean(last_hidden_states, dim=1).squeeze(0)


def fit_bert_vectorizer(train_df, test_df):
    """Embed the ``content`` column of both frames with mean-pooled BERT.

    Returns:
        A ``(train_embeddings, test_embeddings)`` tuple of stacked tensors,
        one row per text.
    """
    def _embed_column(frame):
        # One forward pass per text; the vectors are stacked into a 2-D tensor.
        return torch.stack([get_sentence_embedding(text) for text in frame['content']])

    return _embed_column(train_df), _embed_column(test_df)

#X_train_bert, X_test_bert = fit_bert_vectorizer(train_df, test_df)

## Sentence Transformers


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, placed on the device selected earlier.
model = SentenceTransformer('all-MiniLM-L6-v2')
model.to(device)


def generate_embeddings(texts):
    """Encode ``texts`` with the module-level SentenceTransformer ``model``.

    A progress bar is displayed while encoding; the result of
    ``model.encode`` is returned unchanged.
    """
    return model.encode(texts, show_progress_bar=True)


# Embed the review text of the current train frame, then the test frame.
X_train_sentence_transformer, X_test_sentence_transformer = (
    generate_embeddings(frame['content'].values) for frame in (train_df, test_df)
)


## KNeighbors Weak Labeling

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier (k=40), fitted on the
# sentence-transformer embeddings of the training set.
knn = KNeighborsClassifier(n_neighbors=40)
knn.fit(X_train_sentence_transformer, y_train)

In [None]:
# Predict labels for the test embeddings with the fitted baseline KNN.
y_pred = knn.predict(X_test_sentence_transformer)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Compare the KNN predictions with the true test labels.
print(classification_report(test_df['label'], y_pred))
print(f"Accuracy: {accuracy_score(test_df['label'], y_pred)}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the KNN search.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so it only repeated identical
# fits and added a third to the search time.
param_grid = {
    'n_neighbors': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Exhaustive search over the grid with GridSearchCV's default
# cross-validation and scoring, using all available cores.
grid_search = GridSearchCV(KNeighborsClassifier(),
                           param_grid,
                           n_jobs=-1,
                           verbose=3)

grid_search.fit(X_train_sentence_transformer, y_train)

In [None]:
import os
import pickle

# Folder (relative to the working directory) that receives fitted models.
MODELS_FOLDER = '../models'
os.makedirs(MODELS_FOLDER, exist_ok=True)

# Persist the whole fitted grid search, not just its best estimator.
with open(f'{MODELS_FOLDER}/wl_knn.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [None]:
# Read the pickled grid search back into `knn`, replacing the baseline
# classifier bound to that name. Unpickling executes arbitrary code, so only
# load files this notebook wrote itself.
with open(f'{MODELS_FOLDER}/wl_knn.pkl', 'rb') as model_file:
    knn = pickle.load(model_file)

In [None]:
# `knn` is the grid search reloaded from disk. Everything is read from that
# one object, so this cell also works in a session where the search cell was
# skipped and `grid_search` does not exist.
print('--- Best KNN Parameters ---')
print(f'Parameters: {knn.best_params_}')
print(f'Score: {knn.best_score_:.2f}')

knn_best = knn.best_estimator_
# Refit on the training embeddings of the current session before predicting.
knn_best.fit(X_train_sentence_transformer, y_train)
y_pred = knn_best.predict(X_test_sentence_transformer)

In [None]:
from src.px_utils import create_dataset, launch_px

# Build one dataset object per split from the frame, its sentence-transformer
# embeddings (converted to plain lists) and a label vector.
# NOTE(review): the test set receives the KNN predictions (`y_pred`) while the
# train set receives the true labels (`y_train`) — presumably intentional for
# inspecting the weak labels; confirm against `src.px_utils`.
# NOTE: this rebinds `test_ds` / `train_ds`, which earlier held the Hugging
# Face dataset splits.
test_ds = create_dataset("Test Dataset", test_df, X_test_sentence_transformer.tolist(), y_pred)
train_ds = create_dataset("Train Dataset", train_df, X_train_sentence_transformer.tolist(), y_train)

# Hand both datasets to `launch_px`, test set first.
launch_px(test_ds, train_ds)

## Logistic Regression Weak Labeling

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (iteration cap raised to 1000) fitted on the
# sentence-transformer embeddings of the training set.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_sentence_transformer, y_train)

In [None]:
# Predict labels for the test embeddings with the fitted logistic regression.
logreg_pred = logreg.predict(X_test_sentence_transformer)

# Compare the predictions with the true test labels.
print(classification_report(test_df['label'], logreg_pred))