In [None]:
# =============================================================================
# 1. IMPORT LIBRARIES
# =============================================================================
import os
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from transformers import RobertaModel, RobertaTokenizer
import sys

# Make the directory two levels above the current working directory importable,
# so that the project-local `RoBERTa_classifier` module can be found.
# NOTE: this depends on the working directory the notebook/script is started from.
current_dir = os.getcwd()
two_steps_back = os.path.dirname(os.path.dirname(current_dir))
sys.path.append(two_steps_back)
# NOTE(review): this import re-binds `RobertaTokenizer`, which was already
# imported from `transformers` above; from here on the name refers to whatever
# `RoBERTa_classifier` exports — confirm that both are the same class.
from RoBERTa_classifier import RobertaTrainer, RobertaTokenizer

# Only show messages of severity ERROR and above from the root logger.
logging.basicConfig(level=logging.ERROR)

# =============================================================================
# 2. CONSTANTS & VARIABLES
# =============================================================================
# Device configuration: use the GPU when CUDA is available, else the CPU.
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Data paths, all derived from the current working directory:
#   BASE_DIR         -> parent of the working directory
#   TWO_FOLDERS_BACK -> grandparent of the working directory
BASE_DIR = os.path.dirname(os.getcwd())
TWO_FOLDERS_BACK = os.path.dirname(BASE_DIR)
DATA_FOLDER = os.path.join(TWO_FOLDERS_BACK, 'data')
DATA_PATH = os.path.join(TWO_FOLDERS_BACK, 'data', 'MMCoVaR_News_Dataset.csv')

# Hyperparameters and settings (forwarded to RobertaTrainer further down).
MAX_LEN = 512
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4
TEST_SIZE = 0.2
EPOCHS = 4
LEARNING_RATE = 1e-5
PADDING = 'max_length'
RANDOM_STATE = 200

# Names of the text and label columns used in the training DataFrame.
TEXT_COLUMN = 'clean_text'
TARGET_COLUMN = 'target'

# =============================================================================
# 3. DATA UPLOAD & PREPARATION
# =============================================================================
# ____________________________________________________________
# Load the vaccination/recovery news dataset

two_steps_back = os.path.dirname(os.path.dirname(os.getcwd()))
data_path = os.path.join(two_steps_back, 'data')
news_csv_path = os.path.join(data_path, 'vaccination-recovery-news-data.csv')
# Original (non-synthetic) articles are flagged with synthetic=False.
vaccination_recovery_news = pd.read_csv(news_csv_path).assign(synthetic=False)

# Keep only the article body and its reliability label, relabelled to the
# shared text/target column names, and flag the rows as non-synthetic.
vaccination_recovery_news_train = (
    vaccination_recovery_news[['body_text', 'reliability']]
    .set_axis([TEXT_COLUMN, TARGET_COLUMN], axis=1)
    .assign(synthetic=False)
)

# Report the relative frequency of each target class.
class_ratio = vaccination_recovery_news_train[TARGET_COLUMN].value_counts(normalize=True)
print("Ratio between reliable and unreliable data: ", class_ratio)


#____________________________________________________________
# Load synthetic data

path = os.path.join(two_steps_back, 'data', 'vaccination_synthetic_articles_combined.csv')
df_new = pd.read_csv(path).assign(synthetic=True)

# Build the synthetic training rows: every non-missing changed article becomes
# a text sample with target 0, flagged as synthetic.
changed_articles = df_new[['Changed_article']].dropna()
df_changed = changed_articles.assign(
    **{
        TEXT_COLUMN: changed_articles['Changed_article'],
        TARGET_COLUMN: 0,
        'synthetic': True,
    }
)

# Keep only the columns shared with the original-news training frame.
df_synthetic_train = df_changed.loc[:, [TEXT_COLUMN, TARGET_COLUMN, 'synthetic']]



# Stack the synthetic rows on top of the original news rows and renumber the
# index from 0.
df = pd.concat(
    [df_synthetic_train, vaccination_recovery_news_train],
    axis=0,
    ignore_index=True,
).reset_index(drop=True)

# =============================================================================
# 4. RUN ROBERTA
# =============================================================================

# Step 1: Load the pretrained 'distilroberta-base' tokenizer and set up the
# trainer with the combined DataFrame and the settings defined above.
tokenizer = RobertaTokenizer.from_pretrained(
    'distilroberta-base',
    truncation=True,
    do_lower_case=True,
)

trainer_settings = dict(
    df=df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    train_batch_size=TRAIN_BATCH_SIZE,
    valid_batch_size=VALID_BATCH_SIZE,
    test_size=TEST_SIZE,
    device=DEVICE,
    padding=PADDING,
    text_column=TEXT_COLUMN,
    target_column=TARGET_COLUMN,
    random_state=RANDOM_STATE,
)
trainer = RobertaTrainer(**trainer_settings)

# Step 2: Tokenize the data and create DataLoaders.
trainer.tokenize_data()

# Step 3: Build the model.
trainer.build_model()

# Step 4: Train the model.
trainer.train_model(epochs=EPOCHS, learning_rate=LEARNING_RATE)

# Step 5: Evaluate the model.
# NOTE(review): accuracy is printed as returned while precision/recall are
# scaled by 100 — presumably evaluate_model() already returns accuracy as a
# percentage; confirm against RobertaTrainer.
evaluation = trainer.evaluate_model()
acc, prec, rec = evaluation
print(f"Test Accuracy: {acc:.2f}%")
print(f"Test Precision: {prec * 100:.2f}%")
print(f"Test Recall: {rec * 100:.2f}%")

# Save the model and tokenizer.
output_model_file = 'pytorch_roberta_vaccination_and_synthetic.bin'
output_tokenizer_file = 'pytorch_roberta_vaccination_and_synthetic_tokenizer.bin'
output_vocab_file = 'results_ROBERTA'  # not used below; kept so the name stays defined

# All artifacts go into ./saved_model; create it up front because nothing
# earlier in this script does.
save_directory = os.path.join(os.getcwd(), "saved_model")
os.makedirs(save_directory, exist_ok=True)
directory_to_save = os.path.join(save_directory, output_model_file)
directory_to_save_tokenizer = os.path.join(save_directory, output_tokenizer_file)

trainer.save_model(directory_to_save, os.getcwd())

# Hugging Face's `save_vocabulary(save_directory, filename_prefix)` expects an
# existing *directory* plus a short file-name prefix. The previous call passed
# the model file path as the directory and a full path as the prefix, so the
# vocabulary files were not written where intended.
# NOTE(review): assumes `tokenizer` follows the Hugging Face tokenizer API.
tokenizer_prefix = os.path.splitext(output_tokenizer_file)[0]
tokenizer.save_vocabulary(save_directory, tokenizer_prefix)
print('Model and tokenizer saved.')

Ratio between reliable and unreliable data:  target
1    0.652361
0    0.347639
Name: proportion, dtype: float64


KeyError: ('clean_text', 'target', 'synthetic')