# Installs and Imports

In [None]:
!pip install pandas transformers datasets scikit-learn torch transformers[torch]

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification
from tqdm import tqdm

# Drive Mounting

In [None]:
# Mount Google Drive at /content/drive; every dataset, model and output path
# used in this notebook lives under that mount point.
# NOTE(review): presumably this only works inside a Colab runtime - confirm
# before running the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Dataset Loading

In [None]:
# Workbook of already-annotated rows; its texts are excluded from the cleaned
# data further down so they are not pseudolabeled again.
annotated_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/INITIAL_ANNOTATED_SAMPLE.xlsx'
# Workbook with the cleaned dataset from which the pseudolabeling sample is drawn.
cleaned_dataset_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_04.xlsx'

In [None]:
def _read_category_sheets(path, sheet_names=('ND', 'DN')):
    """Read several sheets of one Excel workbook in a single pass.

    Passing a list to ``sheet_name`` makes ``pd.read_excel`` return a
    ``{sheet name: DataFrame}`` dict, so the workbook is opened and parsed
    once instead of once per sheet.

    Args:
        path: Location of the Excel workbook.
        sheet_names: Names of the sheets to load.

    Returns:
        A tuple of DataFrames, one per entry of ``sheet_names``, in that order.
    """
    sheets = pd.read_excel(path, sheet_name=list(sheet_names))
    return tuple(sheets[name] for name in sheet_names)


df_annotated_nd, df_annotated_dn = _read_category_sheets(annotated_path)
df_cleaned_nd, df_cleaned_dn = _read_category_sheets(cleaned_dataset_path)

In [None]:
# Gather the texts that already carry a manual annotation, one set per category.
annotated_texts_nd, annotated_texts_dn = (
    set(annotated_frame['text'].tolist())
    for annotated_frame in (df_annotated_nd, df_annotated_dn)
)
# Keep only the cleaned rows whose text is not in the matching annotated set.
df_cleaned_nd = df_cleaned_nd.loc[~df_cleaned_nd['text'].isin(annotated_texts_nd)]
df_cleaned_dn = df_cleaned_dn.loc[~df_cleaned_dn['text'].isin(annotated_texts_dn)]

In [None]:
sample_size = 1000  # Adjust the sample size as needed
# DataFrame.sample (without replacement) raises ValueError when n is larger
# than the number of rows, which can happen once the annotated texts have been
# filtered out. Cap the request at the available row count; when enough rows
# exist the sample is identical to an uncapped call with the same seed.
df_cleaned_sample_nd = df_cleaned_nd.sample(
    n=min(sample_size, len(df_cleaned_nd)), random_state=42
).reset_index(drop=True)
df_cleaned_sample_dn = df_cleaned_dn.sample(
    n=min(sample_size, len(df_cleaned_dn)), random_state=42
).reset_index(drop=True)

In [None]:
def preprocess_data(texts, tokenizer, max_length=512):
    """Run `tokenizer` over `texts` with this notebook's fixed encoding options.

    Args:
        texts: Input passed to the tokenizer as its first positional argument.
        tokenizer: Callable tokenizer; it is invoked with truncation and
            padding switched on and with ``return_tensors="pt"``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever the tokenizer call returns, unchanged.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenizer loaded from the "xlm-roberta-base" checkpoint; the same instance is
# passed to the pseudolabeling call for both the ND and the DN model.
tokenizer_xlm = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
# Saved model directory for each category. The keys ('ND', 'DN') are the same
# identifiers used as sheet names when the workbooks are read and written.
model_paths = {
    'ND': '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/Updated2_Model_ND',
    'DN': '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/Updated2_Model_DN'
}

In [None]:
# Use the GPU when the runtime has one and fall back to the CPU otherwise, so
# loading the models does not fail on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One sequence-classification model per category, keyed like `model_paths`.
models = {
    category: XLMRobertaForSequenceClassification.from_pretrained(path).to(device)
    for category, path in model_paths.items()
}

# Pseudolabel Generation

In [None]:
def generate_pseudolabels_and_uncertainty(df, model, tokenizer, batch_size=32):
    """Predict a class index and an uncertainty score for every row of `df`.

    The 'text' column is processed in consecutive slices of `batch_size` rows.
    Each slice is tokenized with `preprocess_data`, moved to the device that
    holds the model's parameters, and run through the model with gradient
    tracking disabled.

    Args:
        df: DataFrame with a 'text' column.
        model: Model whose call result exposes `.logits`.
            NOTE(review): assumes logits are shaped (batch, num_classes) - confirm.
        tokenizer: Tokenizer forwarded to `preprocess_data`.
        batch_size: Number of rows per forward pass.

    Returns:
        A `(pseudolabels, uncertainties)` tuple of two lists with one entry per
        row, in row order. A pseudolabel is the argmax over the logits; the
        uncertainty is ``1 - max(softmax(logits))`` for the same row.
    """
    # Follow the model's own device instead of assuming 'cuda', so inference
    # also works for a model kept on the CPU or on a specific GPU index.
    device = next(model.parameters()).device
    pseudolabels = []
    uncertainties = []
    model.eval()
    for start in tqdm(range(0, len(df), batch_size), desc="Generating pseudolabels"):
        batch_texts = df['text'].iloc[start:start + batch_size].tolist()
        inputs = preprocess_data(batch_texts, tokenizer)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        confidence_scores = torch.softmax(logits, dim=1).cpu().numpy()
        # Uncertainty is the probability mass not assigned to the top class.
        uncertainty_scores = 1 - confidence_scores.max(axis=1)
        pseudolabels.extend(predictions)
        uncertainties.extend(uncertainty_scores)
    return pseudolabels, uncertainties

In [None]:
# Score each sample with the model of its own category. The two returned lists
# are stored as the 'pseudolabel' (predicted class index) and 'uncertainty'
# columns of the corresponding sample frame.
df_cleaned_sample_nd['pseudolabel'], df_cleaned_sample_nd['uncertainty'] = generate_pseudolabels_and_uncertainty(df_cleaned_sample_nd, models['ND'], tokenizer_xlm)
df_cleaned_sample_dn['pseudolabel'], df_cleaned_sample_dn['uncertainty'] = generate_pseudolabels_and_uncertainty(df_cleaned_sample_dn, models['DN'], tokenizer_xlm)

In [None]:
# Class index -> human-readable sentiment label.
label_mapping = {0: 'Very negative', 1: 'Negative', 2: 'Slightly negative', 3: 'Neutral', 4: 'Slightly positive', 5: 'Positive', 6: 'Very positive'}
# The predicted class indices are stored in the 'pseudolabel' column; no
# 'numerical_pseudolabel' column is ever created, so mapping from that name
# raised a KeyError.
df_cleaned_sample_nd['categorical_pseudolabel'] = df_cleaned_sample_nd['pseudolabel'].map(label_mapping)
df_cleaned_sample_dn['categorical_pseudolabel'] = df_cleaned_sample_dn['pseudolabel'].map(label_mapping)

In [None]:
# Write both pseudolabeled samples to one workbook, one sheet per category,
# without the DataFrame index column.
output_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/PSEUDOLABELED_SAMPLE_WITH_UNCERTAINTY.xlsx'
# The context manager closes the writer when the block exits.
with pd.ExcelWriter(output_path) as writer:
    df_cleaned_sample_nd.to_excel(writer, sheet_name='ND', index=False)
    df_cleaned_sample_dn.to_excel(writer, sheet_name='DN', index=False)
