In [4]:
# =============================================================================
# 1. IMPORT LIBRARIES
# =============================================================================
import os
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from transformers import RobertaModel, RobertaTokenizer
import sys

current_dir = os.getcwd()
two_steps_back = os.path.dirname(os.path.dirname(current_dir))
sys.path.append(two_steps_back)
from RoBERTa_classifier import RobertaTrainer, RobertaTokenizer

logging.basicConfig(level=logging.ERROR)

# =============================================================================
# 2. CONSTANTS & VARIABLES
# =============================================================================
# Device configuration
DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Data paths
BASE_DIR = os.path.dirname(os.getcwd())
TWO_FOLDERS_BACK = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_FOLDER = os.path.join(TWO_FOLDERS_BACK, 'data')
DATA_PATH = os.path.join(DATA_FOLDER, 'MMCoVaR_News_Dataset.csv')

# Hyperparameters and settings
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
TEST_SIZE = 0.2
EPOCHS = 4
LEARNING_RATE = 1e-05
PADDING = "max_length"
RANDOM_STATE = 200

# Columns for text and target
TEXT_COLUMN = "clean_text"  # Make sure your DataFrame has this column; otherwise adjust accordingly.
TARGET_COLUMN = "target"

# =============================================================================
# 3. DATA UPLOAD & PREPARATION
# =============================================================================
# ____________________________________________________________
# Load the  News Dataset

two_steps_back = os.path.dirname(os.path.dirname(os.getcwd()))
data_path = os.path.join(two_steps_back, 'data')
vaccination_recovery_news = pd.read_csv(os.path.join(data_path, 'vaccination-recovery-news-data.csv'))
vaccination_recovery_news['synthetic'] = False

# Create a DataFrame for original articles with label 1
vaccination_recovery_news_train = vaccination_recovery_news[['body_text', 'reliability']].copy()
vaccination_recovery_news_train.columns = [TEXT_COLUMN, TARGET_COLUMN]
vaccination_recovery_news_train['synthetic'] = False

df = vaccination_recovery_news_train

# Print the ratio between the classes

print("Ratio of target value:\n",df['target'].value_counts(normalize = True))

# =============================================================================
# 3. RUN ROBERTA
# =============================================================================



# Initialize the Roberta tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base', truncation=True, do_lower_case=True)


# Create an instance of our trainer.
trainer = RobertaTrainer(
    df=df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    train_batch_size=TRAIN_BATCH_SIZE,
    valid_batch_size=VALID_BATCH_SIZE,
    test_size=TEST_SIZE,
    device=DEVICE,
    padding=PADDING,
    text_column=TEXT_COLUMN,
    target_column=TARGET_COLUMN,
    random_state=RANDOM_STATE,


)

# Step 2: Tokenize the data and create DataLoaders.
trainer.tokenize_data()

# Step 3: Build the model.
trainer.build_model()

# Step 4: Train the model.
trainer.train_model(epochs=EPOCHS, learning_rate=LEARNING_RATE)

# Step 5: Evaluate the model.
acc, prec, rec = trainer.evaluate_model()
print(f"Test Accuracy: {acc:.2f}%")
print(f"Test Precision: {prec * 100:.2f}%")
print(f"Test Recall: {rec * 100:.2f}%")

# Save the model and tokenizer.
output_model_file = 'pytorch_roberta_vaccination.bin'
output_tokenizer_file = 'pytorch_roberta_vaccination_tokenizer.bin'
output_vocab_file = 'results_ROBERTA'
directory_to_save = os.path.join(os.getcwd(), "saved_model", output_model_file)
directory_to_save_tokenizer = os.path.join(os.getcwd(), "saved_model", output_tokenizer_file)

trainer.save_model(os.path.join(os.getcwd(), "saved_model", output_model_file), os.getcwd())
tokenizer.save_vocabulary(directory_to_save, directory_to_save_tokenizer)
print('Model and tokenizer saved.')

Ratio of target value:
 target
1    0.652361
0    0.347639
Name: proportion, dtype: float64
TRAIN Dataset shape: (559,)
TEST Dataset shape: (140,)


Training:   0%|          | 0/70 [00:00<?, ?it/s]

Step 0: Training Loss: 0.6764, Training Accuracy: 62.50%


Training:  71%|███████▏  | 50/70 [06:17<02:33,  7.66s/it]

Step 50: Training Loss: 0.6183, Training Accuracy: 66.18%


Training: 100%|██████████| 70/70 [08:50<00:00,  7.58s/it]


Epoch 1: Training Loss: 0.5913, Accuracy: 66.01%


Training:   0%|          | 0/70 [00:00<?, ?it/s]

Step 0: Training Loss: 0.2701, Training Accuracy: 87.50%


Training:  71%|███████▏  | 50/70 [07:28<03:41, 11.10s/it]

Step 50: Training Loss: 0.3496, Training Accuracy: 88.48%


Training: 100%|██████████| 70/70 [11:21<00:00,  9.74s/it]


Epoch 2: Training Loss: 0.3400, Accuracy: 88.73%


Training:   0%|          | 0/70 [00:00<?, ?it/s]

Step 0: Training Loss: 0.0974, Training Accuracy: 100.00%


Training:  71%|███████▏  | 50/70 [09:37<03:59, 11.96s/it]

Step 50: Training Loss: 0.2341, Training Accuracy: 92.16%


Training: 100%|██████████| 70/70 [13:26<00:00, 11.52s/it]


Epoch 3: Training Loss: 0.2265, Accuracy: 92.49%


Training:   0%|          | 0/70 [00:00<?, ?it/s]

Step 0: Training Loss: 0.0723, Training Accuracy: 100.00%


Training:  71%|███████▏  | 50/70 [09:40<03:53, 11.68s/it]

Step 50: Training Loss: 0.1249, Training Accuracy: 96.57%


Training: 100%|██████████| 70/70 [13:35<00:00, 11.64s/it]


Epoch 4: Training Loss: 0.1566, Accuracy: 95.53%


Validation: 100%|██████████| 35/35 [00:57<00:00,  1.64s/it]


Validation Loss: 0.1527, Accuracy: 94.29%
Precision: 0.96, Recall: 0.96
Test Accuracy: 94.29%
Test Precision: 95.60%
Test Recall: 95.60%
Model state_dict saved to c:\Users\Lukag\OneDrive\Documents\git\IJS\text_clasification_2\news_classifier_model\RoBERTa_classification\saved_model\pytorch_roberta_vaccination.bin


Vocabulary path (c:\Users\Lukag\OneDrive\Documents\git\IJS\text_clasification_2\news_classifier_model\RoBERTa_classification\saved_model\pytorch_roberta_vaccination.bin) should be a directory


Tokenizer vocabulary saved to c:\Users\Lukag\OneDrive\Documents\git\IJS\text_clasification_2\news_classifier_model\RoBERTa_classification
Model and tokenizer saved.
