In [None]:
#!pip install sentencepiece
#!pip install pyarrow==15.0.2
#!pip install datasets
#!pip install nlpaug
#!pip install evaluate
#!pip install optuna

# Naive Bayes Baseline

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import hashlib
from imblearn.over_sampling import RandomOverSampler

# Root folder (relative to the working directory) from which df_final.csv is read
# and into which the Naive Bayes report and confusion-matrix CSVs are written.
BASEPATH = "./drive/MyDrive/IsraelTransformer2"


def load_and_preprocess_data(random_state=42):
    """Build the train/validation/test splits used by the Naive Bayes baseline.

    Reads ManualCoding1..3.csv and ``{BASEPATH}/df_final.csv``, brings both onto
    the same integer label encoding, concatenates and shuffles them, performs a
    stratified 80/10/10 split and randomly oversamples the training split only.

    Label encoding of the manual files: stance '-1' -> 0, '0' -> 1, '1' -> 2.
    The labels of df_final.csv are shifted by +1 (presumably from -1/0/1 onto
    the same 0/1/2 scale -- confirm against that file).

    NOTE(review): the transformer cell of this notebook runs its own copy of
    this function with RSEED = 0, so with the default seed the two models are
    evaluated on different test splits. Pass ``random_state=0`` here for a
    like-for-like comparison.

    Args:
        random_state: Seed used for both shuffles, both splits and the
            oversampler. Defaults to 42, the value that was previously
            hard-coded.

    Returns:
        Tuple ``(train_df_resampled, val_df, test_df)``; every frame has the
        columns ``cid_entry``, ``text`` and ``labels``.
    """
    manual_frames = [
        pd.read_csv(f"./drive/MyDrive/IsraelTransformer/ManualCoding{i}.csv")
        for i in range(1, 4)
    ]
    df_prev = pd.concat(manual_frames, ignore_index=True)

    # NOTE(review): isin() with string values only matches string cells; if
    # pandas parses `stance` as a numeric column no row survives this filter.
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' chained-assignment warning.
    df_prev = df_prev[df_prev["stance"].isin(["-1", "1", "0"])].copy()
    label_map = {'1': 2, '-1': 0, '0': 1}
    df_prev["stance"] = df_prev["stance"].map(lambda x: label_map[str(x)])
    df_prev = df_prev.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # The manual files get an id derived from the text itself (MD5 hex digest).
    df_prev["cid_entry"] = df_prev["text"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    df_prev = df_prev.rename(columns={"stance": "labels"})

    df_new = pd.read_csv(f"{BASEPATH}/df_final.csv")
    df_new["labels"] = df_new["labels"] + 1

    df_prev = df_prev.loc[:, ["cid_entry", "text", "labels"]]
    df_new = df_new.loc[:, ["cid_entry", "text", "labels"]]
    df = pd.concat([df_prev, df_new], ignore_index=True)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Stratified split: 80% train, then the remaining 20% halved into val/test.
    train_df, temp_df = train_test_split(
        df, test_size=0.2, stratify=df['labels'], random_state=random_state
    )
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, stratify=temp_df['labels'], random_state=random_state
    )

    # Oversample the training split only; validation and test keep their
    # original class distribution.
    oversampler = RandomOverSampler(random_state=random_state)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(
        train_df[['cid_entry', 'text']], train_df['labels']
    )
    train_df_resampled = pd.DataFrame(X_train_resampled, columns=['cid_entry', 'text'])
    train_df_resampled['labels'] = y_train_resampled

    print("Original class distribution:")
    print(train_df['labels'].value_counts(normalize=True))
    print("\nResampled class distribution:")
    print(train_df_resampled['labels'].value_counts(normalize=True))

    return train_df_resampled, val_df, test_df

def train_and_evaluate_naive_bayes(df):
    """Train a TF-IDF + Multinomial Naive Bayes baseline and write its test reports.

    Prints the validation metrics and the test classification report, and
    writes the test classification report and confusion matrix as CSV files
    under ``BASEPATH``.

    Args:
        df: Despite the name, a 3-tuple ``(train_df, val_df, test_df)`` of
            DataFrames, each with a ``text`` and a ``labels`` column, as
            returned by ``load_and_preprocess_data``.

    Returns:
        None.
    """
    train_df, val_df, test_df = df
    # Label ids in the order used for the confusion-matrix rows/columns
    # (Palestine, Neutral, Israel).
    class_ids = [0, 1, 2]

    # The vocabulary and IDF weights are fitted on the training split only.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_df['text'])
    X_val = vectorizer.transform(val_df['text'])
    X_test = vectorizer.transform(test_df['text'])

    # Oversample the vectorized training data. When train_df comes from
    # load_and_preprocess_data it has already been oversampled, so this pass
    # is expected to leave the class balance unchanged.
    oversampler = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, train_df['labels'])

    print("Original class distribution:")
    print(train_df['labels'].value_counts(normalize=True))
    print("\nResampled class distribution:")
    print(pd.Series(y_train_resampled).value_counts(normalize=True))

    # Train the Naive Bayes classifier on the resampled data
    clf = MultinomialNB()
    clf.fit(X_train_resampled, y_train_resampled)

    # Evaluate on validation set
    val_preds = clf.predict(X_val)
    val_results = {
        'balanced_accuracy': balanced_accuracy_score(val_df['labels'], val_preds),
        'macro_f1': f1_score(val_df['labels'], val_preds, average='macro'),
        'weighted_f1': f1_score(val_df['labels'], val_preds, average='weighted')
    }

    print("Validation Results for Naive Bayes:")
    for key, value in val_results.items():
        print(f"{key}: {value:.4f}")

    # Final evaluation on the test set
    test_preds = clf.predict(X_test)

    print("Test Set Classification Report for Naive Bayes:")
    print(classification_report(test_df['labels'], test_preds, digits=3))

    # Save the classification report
    report = classification_report(test_df['labels'], test_preds, digits=3, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(f'{BASEPATH}/classification_report_naive_bayes_oversampled.csv', index=True)

    # Save the confusion matrix. labels= pins the matrix to 3x3 in a fixed
    # class order; without it a class missing from both the truth and the
    # predictions shrinks the matrix and the DataFrame construction below fails.
    cm = confusion_matrix(test_df['labels'], test_preds, labels=class_ids)
    cm_df = pd.DataFrame(cm, columns=['Predicted Palestine', 'Predicted Neutral', 'Predicted Israel'],
                         index=['Actual Palestine', 'Actual Neutral', 'Actual Israel'])
    cm_df.to_csv(f'{BASEPATH}/confusion_matrix_naive_bayes_oversampled.csv', index=True)

def main():
    """Run the Naive Bayes baseline end to end: build the splits, then train and report."""
    data_splits = load_and_preprocess_data()  # (train, validation, test) frames
    print("Training and evaluating Naive Bayes classifier with oversampling")
    train_and_evaluate_naive_bayes(data_splits)
    print("Training and evaluation complete for Naive Bayes baseline with oversampling.")


if __name__ == "__main__":
    main()

# Train Transformer

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate
import os
import torch
import matplotlib.pyplot as plt
import shutil
import hashlib
import optuna
from transformers import (TrainerCallback, AutoModelForSequenceClassification, AutoTokenizer,
                          TrainingArguments, Trainer, EarlyStoppingCallback, get_linear_schedule_with_warmup, AdamW, AutoConfig)
from imblearn.over_sampling import RandomOverSampler

# Root folder for df_final.csv and for every artifact this cell writes
# (best model, reports, learning curves, best hyperparameters).
BASEPATH = "./drive/MyDrive/IsraelTransformer2"
# Hugging Face checkpoint id used to load the config, the model and the tokenizer.
model_name = "xlm-roberta-large"
# Seed passed to the shuffles, the stratified splits and the oversampler in this cell.
RSEED = 0

class CustomCallback(TrainerCallback):
    """Trainer callback that records the logged training and evaluation losses.

    Attributes are plain lists that grow by one entry per matching log event:
    ``train_losses`` collects every logged ``loss`` value and ``val_losses``
    every logged ``eval_loss`` value, in the order they were logged.
    """

    def __init__(self):
        super().__init__()
        self.train_losses = []
        self.val_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append ``logs['loss']`` / ``logs['eval_loss']`` to the histories when present.

        ``logs`` defaults to None in the signature, so an absent or empty
        payload is ignored instead of raising TypeError on the ``in`` checks.
        """
        if not logs:
            return
        if 'loss' in logs:
            self.train_losses.append(logs['loss'])
        if 'eval_loss' in logs:
            self.val_losses.append(logs['eval_loss'])

def load_and_preprocess_data(random_state=None):
    """Build the train/validation/test splits used to fine-tune the transformer.

    Reads ManualCoding1..3.csv and ``{BASEPATH}/df_final.csv``, brings both onto
    the same integer label encoding, concatenates and shuffles them, performs a
    stratified 80/10/10 split and randomly oversamples the training split only.

    Label encoding of the manual files: stance '-1' -> 0, '0' -> 1, '1' -> 2.
    The labels of df_final.csv are shifted by +1 (presumably from -1/0/1 onto
    the same 0/1/2 scale -- confirm against that file).

    Args:
        random_state: Seed used for both shuffles, both splits and the
            oversampler. ``None`` (the default) means "use the module-level
            RSEED", which is the previous behaviour.

    Returns:
        Tuple ``(train_df_resampled, val_df, test_df)``; every frame has the
        columns ``cid_entry``, ``text`` and ``labels``.
    """
    seed = RSEED if random_state is None else random_state

    manual_frames = [
        pd.read_csv(f"./drive/MyDrive/IsraelTransformer/ManualCoding{i}.csv")
        for i in range(1, 4)
    ]
    df_prev = pd.concat(manual_frames, ignore_index=True)

    # NOTE(review): isin() with string values only matches string cells; if
    # pandas parses `stance` as a numeric column no row survives this filter.
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' chained-assignment warning.
    df_prev = df_prev[df_prev["stance"].isin(["-1", "1", "0"])].copy()
    label_map = {'1': 2, '-1': 0, '0': 1}
    df_prev["stance"] = df_prev["stance"].map(lambda x: label_map[str(x)])
    df_prev = df_prev.sample(frac=1, random_state=seed).reset_index(drop=True)

    # The manual files get an id derived from the text itself (MD5 hex digest).
    df_prev["cid_entry"] = df_prev["text"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    df_prev = df_prev.rename(columns={"stance": "labels"})

    df_new = pd.read_csv(f"{BASEPATH}/df_final.csv")
    df_new["labels"] = df_new["labels"] + 1

    df_prev = df_prev.loc[:, ["cid_entry", "text", "labels"]]
    df_new = df_new.loc[:, ["cid_entry", "text", "labels"]]
    df = pd.concat([df_prev, df_new], ignore_index=True)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Stratified split: 80% train, then the remaining 20% halved into val/test.
    train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['labels'], random_state=seed)

    # Oversample the training split only; validation and test keep their
    # original class distribution.
    oversampler = RandomOverSampler(random_state=seed)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(
        train_df[['cid_entry', 'text']], train_df['labels']
    )
    train_df_resampled = pd.DataFrame(X_train_resampled, columns=['cid_entry', 'text'])
    train_df_resampled['labels'] = y_train_resampled

    print("Original class distribution:")
    print(train_df['labels'].value_counts(normalize=True))
    print("\nResampled class distribution:")
    print(train_df_resampled['labels'].value_counts(normalize=True))

    return train_df_resampled, val_df, test_df

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair for the Trainer.

    The predicted class is the argmax over the last axis of the logits.

    Returns:
        Dict with the keys ``balanced_accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics['balanced_accuracy'] = balanced_accuracy_score(gold, predicted)
    metrics['macro_f1'] = f1_score(gold, predicted, average='macro')
    metrics['weighted_f1'] = f1_score(gold, predicted, average='weighted')
    return metrics

def prepare_datasets(train_df, val_df, test_df, tokenizer):
    """Convert the three DataFrames into tokenized, torch-formatted ``Dataset`` objects.

    Every text is truncated/padded to exactly 128 tokens. The ``cid_entry`` and
    ``text`` columns are dropped; ``input_ids``, ``attention_mask`` and
    ``labels`` are exposed as torch tensors.

    Args:
        train_df, val_df, test_df: DataFrames with the columns ``cid_entry``,
            ``text`` and ``labels``.
        tokenizer: Callable tokenizer applied to a list of texts.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``.
    """
    model_columns = ['input_ids', 'attention_mask', 'labels']

    def tokenize_and_format(examples):
        # Return plain per-example lists. The previous version requested
        # tensors and called .squeeze(), which collapses a (1, 128) batch to
        # (128,): whenever a split's size is a multiple of the map batch size
        # plus one, the final batch then returned columns of different lengths
        # and the map failed.
        tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
        return {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': examples['labels'],
        }

    def _to_dataset(frame):
        # Tokenize in batches of 8, drop the raw columns, expose torch tensors.
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.map(tokenize_and_format, batched=True, batch_size=8,
                              remove_columns=['cid_entry', 'text'])
        dataset.set_format(type='torch', columns=model_columns)
        return dataset

    train_dataset = _to_dataset(train_df)
    val_dataset = _to_dataset(val_df)
    test_dataset = _to_dataset(test_df)

    print("Train dataset size:", len(train_dataset))
    print("Validation dataset size:", len(val_dataset))
    print("Test dataset size:", len(test_dataset))

    return train_dataset, val_dataset, test_dataset
def objective(trial):
    """Optuna objective: fine-tune one configuration and return its validation macro F1.

    Samples dropout, learning rate, weight decay, warmup ratio and the number
    of frozen encoder layers, fine-tunes ``model_name`` on the module-level
    ``train_dataset`` with early stopping on validation macro F1, and scores
    the best checkpoint on ``val_dataset`` and ``test_dataset``.

    When the validation macro F1 beats the running best, the model is saved
    under ``BASEPATH`` (replacing the previously saved best model) and the
    module-level ``best_*`` globals are updated with its results.

    Changes relative to the previous version:
      * The optimizer and LR schedule are now created by the Trainer from
        ``training_args``. Previously the warmup schedule was attached to one
        AdamW instance while the Trainer stepped a second one, so neither the
        warmup/decay schedule nor the sampled ``weight_decay`` had any effect,
        and the step count ignored gradient accumulation.
      * ``suggest_loguniform`` / ``suggest_uniform`` (deprecated) are replaced
        by ``suggest_float``; parameter names and ranges are unchanged.
      * ``fp16`` is only enabled when CUDA is available.
      * At most two checkpoints are kept per trial, and GPU memory is released
        at the end of the trial.

    Args:
        trial: The ``optuna`` trial supplying the hyperparameter suggestions.

    Returns:
        Validation macro F1 of the trial's best checkpoint.
    """
    global best_f1, best_model_path, best_results, best_report, best_cm
    import gc

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = 3

    # Hyperparameters to tune
    config.hidden_dropout_prob = trial.suggest_float("hidden_dropout_prob", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-3, 1e-1, log=True)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    num_frozen_layers = trial.suggest_int("num_frozen_layers", 20, 24)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Freeze the embeddings and the first `num_frozen_layers` encoder layers.
    for layer in model.roberta.encoder.layer[:num_frozen_layers]:
        for param in layer.parameters():
            param.requires_grad = False
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = False

    training_args = TrainingArguments(
        output_dir=f'./results_{model_name.split("/")[-1]}_{trial.number}',
        num_train_epochs=20,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # The Trainer builds AdamW plus a linear warmup/decay schedule from
        # these three values, using the real number of optimizer steps.
        warmup_ratio=warmup_ratio,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_dir=f'./logs_{model_name.split("/")[-1]}_{trial.number}',
        logging_steps=5,
        evaluation_strategy='steps',
        eval_steps=100,
        save_strategy='steps',
        save_steps=100,
        # Bound disk usage per trial; the best checkpoint is still retained
        # because load_best_model_at_end is set.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='macro_f1',
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,
        max_grad_norm=1.0,
    )

    custom_callback = CustomCallback()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5), custom_callback],
    )

    trainer.train()

    # Evaluate on validation set (the best checkpoint is loaded at this point).
    val_results = trainer.evaluate()
    val_f1 = val_results['eval_macro_f1']

    # Also get test results for final reporting; model selection uses val_f1 only.
    test_results = trainer.predict(test_dataset)
    test_preds = np.argmax(test_results.predictions, axis=-1)
    test_f1 = f1_score(test_dataset['labels'], test_preds, average='macro')

    print(f"Trial {trial.number} - Validation Macro F1: {val_f1:.4f}, Test Macro F1: {test_f1:.4f}")

    if val_f1 > best_f1:
        print(f"New best model found! Validation F1: {val_f1:.4f}")
        best_f1 = val_f1

        # Save the new best model
        new_best_model_path = f'{BASEPATH}/best_model_{model_name.split("/")[-1]}_{trial.number}'
        trainer.save_model(new_best_model_path)

        # Delete the old best model if it exists
        if best_model_path and os.path.exists(best_model_path):
            shutil.rmtree(best_model_path)

        best_model_path = new_best_model_path

        # Save best results
        best_results = {
            'test_f1': test_f1,
            'val_results': val_results,
            'params': trial.params,
            'train_losses': custom_callback.train_losses,
            'val_losses': custom_callback.val_losses
        }

        # Classification report and confusion matrix of the best model on the test set
        best_report = classification_report(test_dataset['labels'], test_preds, digits=3, output_dict=True)
        best_cm = confusion_matrix(test_dataset['labels'], test_preds)

    # Clean up to save disk space
    if os.path.exists(training_args.output_dir):
        shutil.rmtree(training_args.output_dir)
    if os.path.exists(training_args.logging_dir):
        shutil.rmtree(training_args.logging_dir)

    # Release this trial's model before the next trial loads a fresh one.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return val_f1

def run_hyperparameter_tuning(n_trials=30):
    """Run the Optuna search over ``objective`` and return the best trial.

    Resets the module-level ``best_*`` globals that ``objective`` updates,
    maximizes the objective's return value and prints the best value and
    parameters.

    The sampler is now seeded with ``RSEED`` so the sequence of suggested
    hyperparameters is repeatable, in line with the seeded shuffles and splits
    elsewhere in this cell.

    Args:
        n_trials: Number of trials to run. Defaults to 30, the value that was
            previously hard-coded.

    Returns:
        ``study.best_trial``.
    """
    global best_f1, best_model_path, best_results, best_report, best_cm
    best_f1 = 0.0
    best_model_path = None
    best_results = None
    best_report = None
    best_cm = None

    sampler = optuna.samplers.TPESampler(seed=RSEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    return trial

def main():
    """Prepare the data, run the hyperparameter search and save the best model's results.

    Publishes the tokenized datasets as module-level globals (``objective``
    reads them), runs the search and then writes, under ``BASEPATH``: the test
    classification report, the test confusion matrix, the learning-curve plot
    and the best hyperparameters.

    Fixes relative to the previous version:
      * the learning-curve call passed the validation losses as x and the step
        numbers as y; both curves are now plotted as loss against step;
      * the line labelled "Best Test Macro F1" actually printed the validation
        macro F1; validation and test scores are now printed separately;
      * if no trial beat the initial best score, the function reports that and
        returns instead of failing on ``best_results`` being None.
    """
    global train_dataset, val_dataset, test_dataset
    train_df, val_df, test_df = load_and_preprocess_data()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset, val_dataset, test_dataset = prepare_datasets(train_df, val_df, test_df, tokenizer)

    print("Starting hyperparameter tuning...")
    best_trial = run_hyperparameter_tuning()

    if best_results is None:
        print("No trial achieved a validation macro F1 above 0.0; nothing to save.")
        return

    model_tag = model_name.split("/")[-1]

    print(f"Best model saved at: {best_model_path}")
    print(f"Best Validation Macro F1: {best_f1:.4f}")
    print(f"Test Macro F1 of best model: {best_results['test_f1']:.4f}")

    # Save the best results
    print("Saving best model results...")

    # Print validation results
    print(f"Validation Results for {model_name}:")
    for key, value in best_results['val_results'].items():
        print(f"{key}: {value:.4f}")

    # Save the classification report
    report_df = pd.DataFrame(best_report).transpose()
    report_df.to_csv(f'{BASEPATH}/classification_report_{model_tag}_best.csv', index=True)

    # Save the confusion matrix
    cm_df = pd.DataFrame(best_cm, columns=['Predicted Palestine', 'Predicted Neutral', 'Predicted Israel'],
                         index=['Actual Palestine', 'Actual Neutral', 'Actual Israel'])
    cm_df.to_csv(f'{BASEPATH}/confusion_matrix_{model_tag}_best.csv', index=True)

    # Plot and save learning curves. These intervals must mirror logging_steps
    # and eval_steps in objective()'s TrainingArguments.
    logging_steps, eval_steps = 5, 100
    train_losses = best_results['train_losses']
    val_losses = best_results['val_losses']
    train_steps = [logging_steps * (i + 1) for i in range(len(train_losses))]
    # NOTE(review): objective() calls trainer.evaluate() once more after
    # training, which presumably logs one extra eval_loss; if so, the last
    # validation point repeats the best checkpoint rather than a new step.
    val_steps = [eval_steps * (i + 1) for i in range(len(val_losses))]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(train_steps, train_losses, label='Training Loss')
    ax.plot(val_steps, val_losses, label='Validation Loss')
    ax.set_title(f'Learning Curves - {model_name} (Best Model)')
    ax.set_xlabel('Steps')
    ax.set_ylabel('Loss')
    ax.legend()
    fig.savefig(f'{BASEPATH}/learning_curves_{model_tag}_best.png')
    plt.close(fig)

    # Save best hyperparameters
    with open(f'{BASEPATH}/best_hyperparameters_{model_tag}.txt', 'w') as f:
        for key, value in best_trial.params.items():
            f.write(f"{key}: {value}\n")

    print("Training and evaluation complete. All results for the best model have been saved.")


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import glob
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Root folder that contains the fine-tuned checkpoint directory.
BASEPATH = "./drive/MyDrive/IsraelTransformer2"
# Base checkpoint id; in this cell it is only used to load the tokenizer.
model_name = "xlm-roberta-large"
# Directory of the fine-tuned model. The trailing "21" is hard-coded
# (presumably the number of the winning Optuna trial -- confirm before re-running).
best_model_path = f'{BASEPATH}/best_model_xlm-roberta-large_21'  # Path to the best model

# Create the local folder for the per-batch and final labelled files if it doesn't exist
labelling_folder = "./Labelling"
os.makedirs(labelling_folder, exist_ok=True)

# Load the fine-tuned model (config and weights) from the saved checkpoint directory
config = AutoConfig.from_pretrained(best_model_path)
model = AutoModelForSequenceClassification.from_pretrained(best_model_path, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer from the original model name

# Move model to GPU if available, then switch it to evaluation mode for inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def tokenize_and_encode(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` with padding and truncation, requesting PyTorch tensors.

    Args:
        texts: The texts handed to the tokenizer.
        tokenizer: Callable tokenizer.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        Whatever the tokenizer returns for these options.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

def predict_stance(model, encoded_texts, device):
    """Run the model on one encoded batch and return the predicted class index per row.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` and
            returning an object with a ``logits`` attribute.
        encoded_texts: Mapping with ``input_ids`` and ``attention_mask`` tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        NumPy array holding the argmax over dimension 1 of the logits.
    """
    token_ids, mask = (encoded_texts[key].to(device) for key in ('input_ids', 'attention_mask'))

    # Inference only: no gradients are tracked.
    with torch.no_grad():
        logits = model(token_ids, attention_mask=mask).logits
        predicted_ids = logits.argmax(dim=1)

    return predicted_ids.cpu().numpy()

def process_batch(batch_df, model, tokenizer, device, inference_batch_size=None):
    """Predict a stance for every row of ``batch_df`` and store it in ``predicted_stance``.

    Args:
        batch_df: DataFrame with a ``text`` column. It is modified in place
            (the ``predicted_stance`` column is added) and also returned.
        model: Sequence-classification model passed on to ``predict_stance``.
        tokenizer: Tokenizer passed on to ``tokenize_and_encode``.
        device: Device used for the forward pass.
        inference_batch_size: Optional number of texts per forward pass.
            ``None`` (the default) sends the whole frame through the model in
            one pass, as before; a smaller value bounds peak memory for large
            frames.

    Returns:
        ``batch_df`` with ``predicted_stance`` set to 'Palestine', 'Neutral'
        or 'Israel' for class ids 0, 1 and 2 respectively.

    Raises:
        ValueError: If ``inference_batch_size`` is given but not positive.
    """
    if inference_batch_size is not None and inference_batch_size <= 0:
        raise ValueError("inference_batch_size must be a positive integer or None")

    texts = batch_df['text'].tolist()
    if not texts:
        # Nothing to predict; keep the output schema consistent.
        batch_df['predicted_stance'] = []
        return batch_df

    step = len(texts) if inference_batch_size is None else inference_batch_size
    predictions = np.concatenate([
        predict_stance(model, tokenize_and_encode(texts[start:start + step], tokenizer), device)
        for start in range(0, len(texts), step)
    ])

    # Map predictions to labels
    label_map = {0: 'Palestine', 1: 'Neutral', 2: 'Israel'}
    batch_df['predicted_stance'] = [label_map[pred] for pred in predictions]

    return batch_df

def main():
    """Label every post in matching_posts.csv.gz with the fine-tuned model.

    The posts are processed in chunks of 1,000 rows. Each labelled chunk is
    written to ``{labelling_folder}/labelled_batch_<n>.csv.gz``; chunks whose
    file already exists are read back instead of being recomputed, so an
    interrupted run can be resumed. Finally all chunks are concatenated into
    ``{labelling_folder}/all_labelled_data.csv.gz``.

    Changes relative to the previous version:
      * each chunk is written to a temporary file and then renamed, so an
        interrupted write cannot leave a truncated file that a later run
        would treat as a finished chunk;
      * an empty input no longer fails in ``pd.concat``;
      * the input path is built from ``BASEPATH`` (same location as before).
    """
    # Load the data to label
    data_to_label = pd.read_csv(f"{BASEPATH}/matching_posts.csv.gz")
    data_to_label = data_to_label.loc[:, ["cid_entry", "text"]]

    if data_to_label.empty:
        print("No rows to label.")
        return

    batch_size = 1_000
    # Ceiling division: a trailing partial chunk counts as one more batch.
    num_batches = (len(data_to_label) + batch_size - 1) // batch_size

    all_labelled_dfs = []

    for i in tqdm(range(num_batches), desc="Processing batches"):
        batch_filename = f"{labelling_folder}/labelled_batch_{i+1}.csv.gz"

        # Check if this batch has already been labelled
        if os.path.exists(batch_filename):
            print(f"Batch {i+1} already labelled. Skipping...")
            all_labelled_dfs.append(pd.read_csv(batch_filename, compression='gzip'))
            continue

        start_idx = i * batch_size
        # .copy() because process_batch adds a column in place.
        batch_df = data_to_label.iloc[start_idx:start_idx + batch_size].copy()

        # Process the batch
        labelled_batch = process_batch(batch_df, model, tokenizer, device)

        # Save the labelled batch: write to a temporary name first and rename
        # afterwards, so the final filename only ever refers to a complete file.
        tmp_filename = f"{batch_filename}.tmp"
        labelled_batch.to_csv(tmp_filename, index=False, compression='gzip')
        os.replace(tmp_filename, batch_filename)
        all_labelled_dfs.append(labelled_batch)

    # Concatenate all labelled dataframes
    final_df = pd.concat(all_labelled_dfs, ignore_index=True)

    # Export the final concatenated dataframe
    final_df.to_csv(f"{labelling_folder}/all_labelled_data.csv.gz", index=False, compression='gzip')

    print("Labelling complete. All data has been processed and exported.")


if __name__ == "__main__":
    main()