In [None]:
# Mount Google Drive so the project files on the shared drive are reachable.
from google.colab import drive
drive.mount('/content/drive')
drive_folder = '/content/drive/Shareddrives/682_Drive'
# Adjust this line to point at the project folder inside your Google Drive
notebook_folder = drive_folder + '/682-Project'
%cd {notebook_folder}

In [None]:
pip install pandas torch transformers datasets

In [3]:
import pandas as pd
import torch
import re
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoConfig


## Data Preparation for AG News Dataset
In this section, we will process the AG News dataset, sourced from Hugging Face's datasets library, to create an imbalanced dataset suitable for analysis. The AG News dataset contains four classes of news articles:

World (label 0)
Sports (label 1)
Business (label 2)
Sci/Tech (label 3)
Process Overview
Data Loading:
The AG News dataset will be loaded and converted into a Pandas DataFrame to enable easier manipulation and preprocessing.

Text Preprocessing:
Each article's text will be cleaned by removing special characters, extra spaces, and converting all text to lowercase for uniformity.

Class Separation and Downsampling:

We will separate the dataset by class.
Each class will be downsampled to a specific number of samples to create class imbalance:
World: 500 samples
Sports: 7,000 samples
Business: 3,000 samples
Sci/Tech: 2,000 samples
Dataset Reconstruction and Shuffling:
The downsampled data from all classes will be combined, and the resulting dataset will be shuffled to ensure the samples are randomized.

Class Distribution Check:
A summary of the new class distribution will be printed to confirm the imbalance.

Saving the Dataset:
The imbalanced dataset will be saved as a CSV file in Google Drive for future use in experiments, such as training baseline models or data augmentation studies.

This process lets us simulate a real-world scenario in which some classes are underrepresented.

In [None]:
# Load the training split of the AG News dataset from Hugging Face's `datasets` library
dataset = load_dataset('ag_news', split='train')

# Convert to a pandas DataFrame for easier manipulation
# (the code below relies on its 'text' and 'label' columns)
data = dataset.to_pandas()

# Text cleaning function
def clean_text(text):
    """Normalise raw article text.

    Drops every character that is not an ASCII letter, a digit, or
    whitespace, squeezes each run of whitespace down to a single space, and
    returns the result lowercased with leading/trailing whitespace removed.

    Args:
        text (str): Raw article text.

    Returns:
        str: The cleaned, lowercase text.
    """
    alnum_only = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.lower().strip()

# Apply cleaning to the dataset (overwrites the 'text' column in place)
data['text'] = data['text'].apply(clean_text)

# Separate classes
world_data = data[data['label'] == 0]      # Class 0: World
sports_data = data[data['label'] == 1]     # Class 1: Sports
business_data = data[data['label'] == 2]   # Class 2: Business
sci_tech_data = data[data['label'] == 3]   # Class 3: Sci/Tech

# Downsample each class to different levels to create imbalance.
# The fixed random_state keeps the drawn rows reproducible.
world_data_downsampled = world_data.sample(n=500, random_state=42)    # 500 samples
sports_data_downsampled = sports_data.sample(n=7000, random_state=42)  # 7000 samples
business_data_downsampled = business_data.sample(n=3000, random_state=42)  # 3000 samples
sci_tech_data_downsampled = sci_tech_data.sample(n=2000, random_state=42)  # 2000 samples

# Combine the datasets back to create the imbalanced dataset (12,500 rows in total)
imbalanced_data = pd.concat([
    world_data_downsampled,
    sports_data_downsampled,
    business_data_downsampled,
    sci_tech_data_downsampled
])

# Shuffle the dataset and renumber the rows 0..N-1
imbalanced_data = imbalanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution
print("Class distribution in the imbalanced dataset:")
print(imbalanced_data['label'].value_counts())

# Save the imbalanced dataset to a CSV file (the later cells read it back from this path)
imbalanced_data.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv', index=False)


In [None]:
def load_news_dataset(path):
    """Load a news dataset from a CSV file and print a short preview.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The full contents of the CSV file.
    """
    # Read from the `path` argument. The previous body referenced an undefined
    # name (`file_path`), so every call raised NameError.
    data = pd.read_csv(path)

    # Display the first few rows of the dataframe
    print(data.head())
    return data

NEW CODE

In [None]:
# Generate articles one by one using API
def generate_single_article(prompt, max_new_tokens=200):
    """Request one completion via ``call_hf_api`` and keep its first article.

    The raw output is cut at the first ``<|endofarticle|>`` delimiter and the
    leading piece is stripped of surrounding whitespace.

    Args:
        prompt: Prompt forwarded to ``call_hf_api``.
        max_new_tokens: Forwarded to ``call_hf_api`` under the same keyword.

    Returns:
        The stripped first article, or ``None`` when the API returned a falsy
        value or the first piece is empty after stripping.
    """
    raw_output = call_hf_api(prompt, max_new_tokens=max_new_tokens)
    if not raw_output:
        return None

    first_piece = raw_output.split("<|endofarticle|>")[0].strip()
    return first_piece or None

# Generate multiple augmented articles
def generate_augmented_articles(prompt, total_samples):
    """Call ``generate_single_article`` ``total_samples`` times and collect the hits.

    Attempts that come back falsy (``None``) are dropped, so the returned list
    can be shorter than ``total_samples``.

    Args:
        prompt: Prompt passed to every generation attempt.
        total_samples: Number of generation attempts to make.

    Returns:
        list: The successfully generated articles, in generation order.
    """
    attempts = (
        generate_single_article(prompt)
        for _ in tqdm(range(total_samples), desc="Generating articles")
    )
    return [article for article in attempts if article]

# Generate the required number of augmented samples.
# NOTE(review): `base_prompt` and `c` (the number of attempts) are not defined in
# the visible part of this notebook — confirm where they come from before re-running.
final_articles = generate_augmented_articles(base_prompt, c)


Generating articles: 100%|██████████| 200/200 [00:42<00:00,  4.75it/s]


In [None]:
import pandas as pd
import random

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Keep only the 'world' category (label 0 corresponds to 'world')
world_articles = df[df['label'] == 0]

# Randomly sample 100 articles (fixed seed so the same rows are drawn every run)
sampled_articles = world_articles.sample(n=100, random_state=42)

# Save the sampled articles to a new CSV file
sampled_articles.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/world_articles_sampled.csv', index=False)


# Text Augmentation - Novel : LLM (formal to informal)

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

# Initialize the Hugging Face Inference Client.
# NOTE: "<your-hf-token>" is a placeholder; supply a real token at run time and
# avoid committing it with the notebook.
client = InferenceClient(api_key="<your-hf-token>")

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Filter the 'world' category (category '0' corresponds to 'world')
world_articles = df[df['label'] == 0]

# Randomly sample 100 articles (fixed seed so the same rows are drawn every run)
sampled_articles = world_articles.sample(n=100, random_state=42)

# Function to generate variations using the Hugging Face API
def generate_variations(article, batch_size=15):
    """Ask the chat model for informal-tone rewrites of one article.

    Args:
        article (str): Text of the news article to rewrite.
        batch_size (int): Number of variations requested in the prompt.

    Returns:
        str: The model's complete reply, concatenated from the streamed
        chunks. The prompt asks for a numbered list, but the reply is returned
        unparsed.
    """
    messages = [
        {
            "role": "user",
            "content": f"Please provide {batch_size} different variations of the News Text Article with an informal tone. \n"
                       f"Output the full sentences. Output in format \"1. sentence 1, 2. sentence 2, ... , {batch_size}. sentence {batch_size}\". \\n\n"
                       f"Article: \"{article}\""
        }
    ]

    # Create the request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Stream the response and gather all text pieces. A streamed chunk may have
    # no choices or a delta whose content is None; appending that to a str
    # would raise TypeError, so such chunks are skipped.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)

# One {'text', 'label'} record is collected per extracted variation
augmented_rows = []

# Request rewrites for every sampled article and parse the model's reply
for _, row in sampled_articles.iterrows():
    article = row['text']  # the article text lives in the 'text' column
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # The reply is processed one line at a time
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Lines without ". " cannot hold a numbered item, so skip them
        if '. ' not in variation:
            continue
        # Keep everything after the first ". " (drops a leading "1. "-style prefix)
        variation_text = variation.split('. ', 1)[1]
        augmented_rows.append({'text': variation_text.strip(), 'label': row['label']})

# Write all collected variations to a CSV file
augmented_df = pd.DataFrame(augmented_rows)
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_informal.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 8644...
Generating variations for article 1634...
Generating variations for article 9047...
Generating variations for article 3692...
Generating variations for article 2409...
Generating variations for article 9510...
Generating variations for article 9135...
Generating variations for article 2802...
Generating variations for article 1437...
Generating variations for article 10952...
Generating variations for article 294...
Generating variations for article 4623...
Generating variations for article 9767...
Generating variations for article 1856...
Generating variations for article 8889...
Generating variations for article 9378...
Generating variations for article 12379...
Generating variations for article 595...
Generating variations for article 7551...
Generating variations for article 9865...
Generating variations for article 12180...
Generating variations for article 12212...
Generating variations for article 6599...
Generating variations for articl

# Classification - Baseline Model using LLM

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Load the original and augmented datasets.
# NOTE(review): no cell visible in this notebook writes augmented_articles.csv —
# confirm which augmentation step produces it.
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles.csv')

# Combine datasets (augmented rows are appended after the original rows)
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(df):
    """Split a labelled frame 80/20 and wrap both parts as Hugging Face datasets.

    The split is stratified on the 'label' column and uses a fixed seed.

    Args:
        df: DataFrame with a 'text' column and a 'label' column castable to int.

    Returns:
        tuple: ``(train_data, test_data)``, each a ``Dataset`` with 'text' and
        'label' columns.
    """
    labels = df['label'].astype(int)
    split = train_test_split(
        df['text'], labels, test_size=0.2, random_state=42, stratify=df['label']
    )
    train_texts, test_texts, train_labels, test_labels = split
    return (
        Dataset.from_dict({'text': train_texts, 'label': train_labels}),
        Dataset.from_dict({'text': test_texts, 'label': test_labels}),
    )

# Prepare data for both datasets (each frame gets its own stratified 80/20 split).
# NOTE(review): the combined test split is drawn from original + augmented rows,
# so it can contain augmented samples.
train_original, test_original = prepare_data(original_df)
train_combined, test_combined = prepare_data(combined_df)

# Initialize the RoBERTa tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with max-length padding and truncation enabled."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize datasets (batched=True hands the function a batch of rows at a time)
train_original = train_original.map(tokenize_function, batched=True)
test_original = test_original.map(tokenize_function, batched=True)
train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Define training arguments: evaluate and checkpoint once per epoch for 3 epochs
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Disable all loggers like wandb
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report per-class test metrics.

    Args:
        train_data: Tokenized dataset passed to the Trainer as ``train_dataset``.
        test_data: Tokenized dataset used both as the Trainer's ``eval_dataset``
            and for the final predictions; must expose a 'label' column.

    Returns:
        dict: Maps "Class 0" .. "Class 3" to a dict with the keys "precision",
        "recall" and "f1-score".
    """
    class_names = [f"Class {i}" for i in range(4)]

    # Load the pretrained checkpoint anew so separate runs never share weights
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)
    runner = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    runner.train()

    # The arg-max over axis 1 of the prediction array is the predicted class id
    predicted_labels = runner.predict(test_data).predictions.argmax(axis=1)

    report = classification_report(
        test_data['label'], predicted_labels, target_names=class_names, output_dict=True
    )

    # Keep only the three per-class scores from the full report
    wanted = ("precision", "recall", "f1-score")
    return {name: {metric: report[name][metric] for metric in wanted} for name in class_names}



# Train and evaluate on the combined (original + augmented) data only;
# train_original / test_original are prepared above but not trained on in this cell.
metrics_augmented = train_and_get_metrics(train_combined, test_combined)


# Print the per-class metrics dictionary
print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)


# Classification - Baseline Model using LLM (new test set)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Load the original and augmented datasets.
# NOTE(review): no cell visible in this notebook writes augmented_articles.csv —
# confirm which augmentation step produces it.
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles.csv')

# Combine datasets (augmented rows are appended after the original rows)
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(df):
    """Build a training set from ``df`` and a test set from the shared test CSV.

    ``df`` is split 80/20 (stratified on 'label', fixed seed) and only the 80%
    training part is kept; the 20% part is discarded. The test set is always
    read from ``test_data.csv`` on the shared drive.

    Args:
        df: DataFrame with a 'text' column and a 'label' column castable to int.

    Returns:
        tuple: ``(train_data, test_data)``, each a ``Dataset`` with 'text' and
        'label' columns.
    """
    # Only the training portion of the split is used below
    train_texts, _, train_labels, _ = train_test_split(
        df['text'], df['label'].astype(int), test_size=0.2, random_state=42, stratify=df['label']
    )
    train_data = Dataset.from_dict({'text': train_texts, 'label': train_labels})

    # The evaluation rows come from the fixed CSV rather than from the split
    held_out = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/test_data.csv')
    test_data = Dataset.from_dict(
        {'text': held_out['text'], 'label': held_out['label'].astype(int)}
    )
    return train_data, test_data

# Prepare data for both datasets; both calls return the same test set because
# prepare_data reads it from the fixed test_data.csv file.
train_original, test_original = prepare_data(original_df)
train_combined, test_combined = prepare_data(combined_df)

# Initialize the RoBERTa tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with max-length padding and truncation enabled."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize datasets (batched=True hands the function a batch of rows at a time)
train_original = train_original.map(tokenize_function, batched=True)
test_original = test_original.map(tokenize_function, batched=True)
train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Define training arguments: evaluate and checkpoint once per epoch for 3 epochs
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Disable all loggers like wandb
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report test metrics.

    Args:
        train_data: Tokenized dataset passed to the Trainer as ``train_dataset``.
        test_data: Tokenized dataset used both as the Trainer's ``eval_dataset``
            and for the final predictions; must expose a 'label' column.

    Returns:
        dict: Maps "Class 0" .. "Class 3" to a dict with the keys "precision",
        "recall" and "f1-score", plus an "accuracy" entry holding the overall
        accuracy from the classification report.
    """
    class_names = [f"Class {i}" for i in range(4)]

    # Load the pretrained checkpoint anew so separate runs never share weights
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)
    runner = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    runner.train()

    # The arg-max over axis 1 of the prediction array is the predicted class id
    predicted_labels = runner.predict(test_data).predictions.argmax(axis=1)

    report = classification_report(
        test_data['label'], predicted_labels, target_names=class_names, output_dict=True
    )

    # Per-class scores first, then the overall accuracy under its own key
    wanted = ("precision", "recall", "f1-score")
    metrics_per_class = {
        name: {metric: report[name][metric] for metric in wanted} for name in class_names
    }
    metrics_per_class['accuracy'] = report['accuracy']
    return metrics_per_class



# Train on the combined (original + augmented) training data and evaluate on
# the fixed test set; train_original / test_original are not used in this cell.
metrics_augmented = train_and_get_metrics(train_combined, test_combined)


# Print the per-class metrics and overall accuracy
print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)


# Classification - Novel : LLM (formal to informal)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Load the original dataset and the informal-tone augmented rows
# (augmented_articles_informal.csv is written by the informal augmentation cell)
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_informal.csv')

# Combine datasets (augmented rows are appended after the original rows)
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(df):
    """Split a labelled frame 80/20 and wrap both parts as Hugging Face datasets.

    The split is stratified on the 'label' column and uses a fixed seed.

    Args:
        df: DataFrame with a 'text' column and a 'label' column castable to int.

    Returns:
        tuple: ``(train_data, test_data)``, each a ``Dataset`` with 'text' and
        'label' columns.
    """
    labels = df['label'].astype(int)
    split = train_test_split(
        df['text'], labels, test_size=0.2, random_state=42, stratify=df['label']
    )
    train_texts, test_texts, train_labels, test_labels = split
    return (
        Dataset.from_dict({'text': train_texts, 'label': train_labels}),
        Dataset.from_dict({'text': test_texts, 'label': test_labels}),
    )

# Prepare data for both datasets (each frame gets its own stratified 80/20 split).
# NOTE(review): the combined test split is drawn from original + augmented rows,
# so the two runs below are scored on different test sets.
train_original, test_original = prepare_data(original_df)
train_combined, test_combined = prepare_data(combined_df)

# Initialize the RoBERTa tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with max-length padding and truncation enabled."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize datasets (batched=True hands the function a batch of rows at a time)
train_original = train_original.map(tokenize_function, batched=True)
test_original = test_original.map(tokenize_function, batched=True)
train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Define training arguments: evaluate and checkpoint once per epoch for 3 epochs
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Disable all loggers like wandb
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report per-class test metrics.

    Args:
        train_data: Tokenized dataset passed to the Trainer as ``train_dataset``.
        test_data: Tokenized dataset used both as the Trainer's ``eval_dataset``
            and for the final predictions; must expose a 'label' column.

    Returns:
        dict: Maps "Class 0" .. "Class 3" to a dict with the keys "precision",
        "recall" and "f1-score".
    """
    class_names = [f"Class {i}" for i in range(4)]

    # Load the pretrained checkpoint anew so separate runs never share weights
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)
    runner = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    runner.train()

    # The arg-max over axis 1 of the prediction array is the predicted class id
    predicted_labels = runner.predict(test_data).predictions.argmax(axis=1)

    report = classification_report(
        test_data['label'], predicted_labels, target_names=class_names, output_dict=True
    )

    # Keep only the three per-class scores from the full report
    wanted = ("precision", "recall", "f1-score")
    return {name: {metric: report[name][metric] for metric in wanted} for name in class_names}

# Train and evaluate both datasets; each call fine-tunes a freshly loaded model,
# so the two runs are independent of each other.
metrics_non_augmented = train_and_get_metrics(train_original, test_original)
metrics_augmented = train_and_get_metrics(train_combined, test_combined)

# Print the per-class metrics of both runs
print("Metrics on Non-Augmented Dataset:")
print(metrics_non_augmented)

print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)


In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login for this notebook session.
# NOTE(review): the augmentation cells pass `api_key` to InferenceClient explicitly —
# confirm whether this login is still needed.
notebook_login()

# Text Augmentation - Novel: LLM (investigative, editorial and breaking news)

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

# Initialize the Hugging Face Inference Client.
# NOTE: "<your-hf-token>" is a placeholder; supply a real token at run time and
# avoid committing it with the notebook.
client = InferenceClient(api_key="<your-hf-token>")

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Filter the 'world' category (category '0' corresponds to 'world')
world_articles = df[df['label'] == 0]

# Randomly sample 100 articles (fixed seed so the same rows are drawn every run)
sampled_articles = world_articles.sample(n=100, random_state=42)

# Function to generate variations using the Hugging Face API
def generate_variations(article, batch_size=15):
    """Ask the chat model for rewrites of one article in three journalistic styles.

    The prompt requests 15 variations: 5 investigative, 5 editorial and
    5 breaking-news.

    Args:
        article (str): Text of the news article to rewrite.
        batch_size (int): Kept for interface compatibility; the prompt
            hard-codes 15 variations, so this value is currently not used.

    Returns:
        str: The model's complete reply, concatenated from the streamed
        chunks and returned unparsed.
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please generate 15 variations of the provided News Text Article, dividing them equally into three distinct journalistic styles: "
                f"1. Investigative (in-depth analysis with detailed evidence and complex arguments), "
                f"2. Editorial (opinionated and persuasive tone reflecting the writer's viewpoint), "
                f"3. Breaking News (concise, urgent, and fact-driven content). \n\n"
                f"For each style, generate 5 variations in full sentences. Output in the following format: \n"
                f"\"Investigative: 1. sentence 1, 2. sentence 2, ..., 5. sentence 5\n"
                f"Editorial: 1. sentence 1, 2. sentence 2, ..., 5. sentence 5\n"
                f"Breaking News: 1. sentence 1, 2. sentence 2, ..., 5. sentence 5\"\n\n"
                f"Article: \"{article}\""
            )
        }
    ]

    # Create the request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Stream the response and gather all text pieces. A streamed chunk may have
    # no choices or a delta whose content is None; appending that to a str
    # would raise TypeError, so such chunks are skipped.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)


# One {'text', 'label'} record is collected per extracted variation
augmented_rows = []

# Request rewrites for every sampled article and parse the model's reply
for _, row in sampled_articles.iterrows():
    article = row['text']  # the article text lives in the 'text' column
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # The reply is processed one line at a time
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Lines without ". " cannot hold a numbered item, so skip them
        if '. ' not in variation:
            continue
        # Keep everything after the first ". " (drops a leading "1. "-style prefix)
        variation_text = variation.split('. ', 1)[1]
        augmented_rows.append({'text': variation_text.strip(), 'label': row['label']})


# Write all collected variations to a CSV file
augmented_df = pd.DataFrame(augmented_rows)
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world_train.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 8644...
Generating variations for article 1634...
Generating variations for article 9047...
Generating variations for article 3692...
Generating variations for article 2409...
Generating variations for article 9510...
Generating variations for article 9135...
Generating variations for article 2802...
Generating variations for article 1437...
Generating variations for article 10952...
Generating variations for article 294...
Generating variations for article 4623...
Generating variations for article 9767...
Generating variations for article 1856...
Generating variations for article 8889...
Generating variations for article 9378...
Generating variations for article 12379...
Generating variations for article 595...
Generating variations for article 7551...
Generating variations for article 9865...
Generating variations for article 12180...
Generating variations for article 12212...
Generating variations for article 6599...
Generating variations for articl

# Classification - Novel: LLM (investigative, editorial and breaking news)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Load the original dataset and the multi-tone augmented rows
# (augmented_articles_diff_tones_world_train.csv is written by the tones augmentation cell)
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world_train.csv')

# Combine datasets (augmented rows are appended after the original rows)
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(train_df):
    """Wrap ``train_df`` and the shared test CSV as Hugging Face datasets.

    Every row of ``train_df`` is used for training (no split is made); the
    test set is always read from ``test_data.csv`` on the shared drive.

    Args:
        train_df: DataFrame with a 'text' column and a 'label' column castable
            to int.

    Returns:
        tuple: ``(train_data, test_data)``, each a ``Dataset`` with 'text' and
        'label' columns.
    """
    train_data = Dataset.from_dict(
        {'text': train_df['text'], 'label': train_df['label'].astype(int)}
    )

    # The evaluation rows come from the fixed CSV
    held_out = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/test_data.csv')
    test_data = Dataset.from_dict(
        {'text': held_out['text'], 'label': held_out['label'].astype(int)}
    )
    return train_data, test_data

# Prepare the combined dataset (all rows train; the test set comes from test_data.csv)
train_combined, test_combined = prepare_data(combined_df)

# Initialize the RoBERTa tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with max-length padding and truncation enabled."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize datasets (batched=True hands the function a batch of rows at a time)

train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Define training arguments: evaluate and checkpoint once per epoch for 3 epochs
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Disable all loggers like wandb
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report test metrics.

    Args:
        train_data: Tokenized dataset passed to the Trainer as ``train_dataset``.
        test_data: Tokenized dataset used both as the Trainer's ``eval_dataset``
            and for the final predictions; must expose a 'label' column.

    Returns:
        dict: Maps "Class 0" .. "Class 3" to a dict with the keys "precision",
        "recall" and "f1-score", plus an "accuracy" entry holding the overall
        accuracy from the classification report.
    """
    class_names = [f"Class {i}" for i in range(4)]

    # Load the pretrained checkpoint anew so separate runs never share weights
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)
    runner = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    runner.train()

    # The arg-max over axis 1 of the prediction array is the predicted class id
    predicted_labels = runner.predict(test_data).predictions.argmax(axis=1)

    report = classification_report(
        test_data['label'], predicted_labels, target_names=class_names, output_dict=True
    )

    # Per-class scores first, then the overall accuracy under its own key
    wanted = ("precision", "recall", "f1-score")
    metrics_per_class = {
        name: {metric: report[name][metric] for metric in wanted} for name in class_names
    }
    metrics_per_class['accuracy'] = report['accuracy']
    return metrics_per_class



# Train on the combined (original + multi-tone augmented) data and evaluate on the fixed test set
metrics_augmented = train_and_get_metrics(train_combined, test_combined)


# Print the per-class metrics and overall accuracy
print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)


# Ignore - Testing : creating testdata from baseline

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Load the original and augmented datasets.
# NOTE(review): no cell visible in this notebook writes augmented_articles.csv —
# confirm which augmentation step produces it.
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles.csv')

# Combine datasets (augmented rows are appended after the original rows)
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(df, save_test_path):
    """Split a labelled frame 80/20 and also save the test part as a CSV.

    The split is stratified on the 'label' column and uses a fixed seed.

    Args:
        df: DataFrame with a 'text' column and a 'label' column castable to int.
        save_test_path: Destination of the CSV holding the test split
            (written without the index column).

    Returns:
        tuple: ``(train_data, test_data)``, each a ``Dataset`` with 'text' and
        'label' columns.
    """
    labels = df['label'].astype(int)
    split = train_test_split(
        df['text'], labels, test_size=0.2, random_state=42, stratify=df['label']
    )
    train_texts, test_texts, train_labels, test_labels = split

    train_data = Dataset.from_dict({'text': train_texts, 'label': train_labels})
    test_data = Dataset.from_dict({'text': test_texts, 'label': test_labels})

    # Persist the held-out rows so they can be reused outside this cell
    test_data.to_pandas().to_csv(save_test_path, index=False)
    return train_data, test_data

# NOTE(review): the classification cells read `test_data.csv`, whereas this cell
# writes `test_data_from_baseline.csv` — confirm how the two files relate.
test_data_path = '/content/drive/Shareddrives/682_Drive/682-Project/test_data_from_baseline.csv'
# Split the combined dataset and save its test part to test_data_path
train_combined, test_combined = prepare_data(combined_df, test_data_path)

# Creating Business Test Set

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

# Initialize the Hugging Face Inference Client.
# NOTE: "<your-hf-token>" is a placeholder; supply a real token at run time and
# avoid committing it with the notebook.
client = InferenceClient(api_key="<your-hf-token>")

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Filter the 'sports' category (label 1 corresponds to 'sports').
# NOTE(review): the section title mentions a Business test set, but Business is
# label 2 — confirm which class is intended here.
sports_articles = df[df['label'] == 1]

# Randomly sample 100 articles
sampled_articles = sports_articles.sample(n=100, random_state=42)

# Function to generate variations using the Hugging Face API
def generate_variations(article):
    """Ask the hosted Llama chat model for six stylistic rewrites of one article.

    Sends a single user message that embeds ``article`` through the
    module-level ``client`` and streams the completion back.

    Args:
        article: Text of the news article to rewrite.

    Returns:
        str: The raw, unparsed model output (all streamed pieces joined).
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please generate 6 variations of the provided News Text Article, dividing them equally into three distinct journalistic styles: "
                f"1. Investigative (in-depth analysis with detailed evidence and complex arguments), "
                f"2. Editorial (opinionated and persuasive tone reflecting the writer's viewpoint), "
                f"3. Breaking News (concise, urgent, and fact-driven content). \n\n"
                f"For each style, generate 2 variations in full sentences. Output in the following format: \n"
                f"\"Investigative: 1. sentence 1, 2. sentence 2 \n"
                f"Editorial: 1. sentence 1, 2. sentence 2 \n"
                f"Breaking News: 1. sentence 1, 2. sentence 2 \"\n\n"
                f"Article: \"{article}\""
            ),
        }
    ]

    # Create the streaming request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Gather the streamed text. A chunk may carry no choices, or a delta whose
    # content is None; concatenating None onto a str would raise TypeError.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)


# Collect one record per parsed variation
augmented_rows = []

# Process the sampled articles and generate variations
for _, row in sampled_articles.iterrows():
    article = row['text']  # Assuming the column with the article text is named 'text'
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # Split the model output into lines
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Only lines containing ". " are kept; the text after the first ". " is the variation
        if '. ' not in variation:
            continue
        _, variation_text = variation.split('. ', 1)
        variation_text = variation_text.strip()
        # Skip lines with nothing after the marker: an empty string is written as a
        # blank CSV field, which pandas reads back as a missing value.
        if variation_text:
            augmented_rows.append({'text': variation_text, 'label': row['label']})


# Save the augmented data to a CSV file; explicit columns keep the header even with no rows
augmented_df = pd.DataFrame(augmented_rows, columns=['text', 'label'])
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_sports.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 11635...
Generating variations for article 5260...
Generating variations for article 3562...
Generating variations for article 468...
Generating variations for article 7765...
Generating variations for article 6098...
Generating variations for article 12061...
Generating variations for article 11134...
Generating variations for article 11378...
Generating variations for article 9973...
Generating variations for article 227...
Generating variations for article 11467...
Generating variations for article 7622...
Generating variations for article 4252...
Generating variations for article 10205...
Generating variations for article 5800...
Generating variations for article 10788...
Generating variations for article 4001...
Generating variations for article 3420...
Generating variations for article 1344...
Generating variations for article 5654...
Generating variations for article 5423...
Generating variations for article 5458...
Generating variations for art

# Creating Business Test Set

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

import os

# Initialize the Hugging Face Inference Client.
# The token is read from the environment so it is never stored in the notebook.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face API token.")
client = InferenceClient(api_key=hf_token)

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Keep only the Business articles (label 2)
business_articles = df[df['label'] == 2]

# Randomly sample 100 articles; the fixed seed keeps the sample reproducible
sampled_articles = business_articles.sample(n=100, random_state=42)

# Function to generate variations using the Hugging Face API
def generate_variations(article):
    """Ask the hosted Llama chat model for six stylistic rewrites of one article.

    Sends a single user message that embeds ``article`` through the
    module-level ``client`` and streams the completion back.

    Args:
        article: Text of the news article to rewrite.

    Returns:
        str: The raw, unparsed model output (all streamed pieces joined).
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please generate 6 variations of the provided News Text Article, dividing them equally into three distinct journalistic styles: "
                f"1. Investigative (in-depth analysis with detailed evidence and complex arguments), "
                f"2. Editorial (opinionated and persuasive tone reflecting the writer's viewpoint), "
                f"3. Breaking News (concise, urgent, and fact-driven content). \n\n"
                f"For each style, generate 2 variations in full sentences. Output in the following format: \n"
                f"\"Investigative: 1. sentence 1, 2. sentence 2 \n"
                f"Editorial: 1. sentence 1, 2. sentence 2 \n"
                f"Breaking News: 1. sentence 1, 2. sentence 2 \"\n\n"
                f"Article: \"{article}\""
            ),
        }
    ]

    # Create the streaming request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Gather the streamed text. A chunk may carry no choices, or a delta whose
    # content is None; concatenating None onto a str would raise TypeError.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)


# Collect one record per parsed variation
augmented_rows = []

# Process the sampled articles and generate variations
for _, row in sampled_articles.iterrows():
    article = row['text']  # Assuming the column with the article text is named 'text'
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # Split the model output into lines
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Only lines containing ". " are kept; the text after the first ". " is the variation
        if '. ' not in variation:
            continue
        _, variation_text = variation.split('. ', 1)
        variation_text = variation_text.strip()
        # Skip lines with nothing after the marker: an empty string is written as a
        # blank CSV field, which pandas reads back as a missing value.
        if variation_text:
            augmented_rows.append({'text': variation_text, 'label': row['label']})


# Save the augmented data to a CSV file; explicit columns keep the header even with no rows
augmented_df = pd.DataFrame(augmented_rows, columns=['text', 'label'])
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_business.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 7475...
Generating variations for article 4984...
Generating variations for article 7515...
Generating variations for article 1033...
Generating variations for article 10382...
Generating variations for article 4711...
Generating variations for article 5949...
Generating variations for article 8737...
Generating variations for article 1621...
Generating variations for article 10659...
Generating variations for article 9351...
Generating variations for article 236...
Generating variations for article 2706...
Generating variations for article 7341...
Generating variations for article 11492...
Generating variations for article 3806...
Generating variations for article 4495...
Generating variations for article 1913...
Generating variations for article 11339...
Generating variations for article 10544...
Generating variations for article 205...
Generating variations for article 1203...
Generating variations for article 3851...
Generating variations for artic

# Creating SciTech Test Set

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

import os

# Initialize the Hugging Face Inference Client.
# The token is read from the environment so it is never stored in the notebook.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face API token.")
client = InferenceClient(api_key=hf_token)

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Keep only the Sci/Tech articles (label 3)
scitech_articles = df[df['label'] == 3]

# Randomly sample 100 articles; the fixed seed keeps the sample reproducible
sampled_articles = scitech_articles.sample(n=100, random_state=42)

# Function to generate variations using the Hugging Face API
def generate_variations(article):
    """Ask the hosted Llama chat model for six stylistic rewrites of one article.

    Sends a single user message that embeds ``article`` through the
    module-level ``client`` and streams the completion back.

    Args:
        article: Text of the news article to rewrite.

    Returns:
        str: The raw, unparsed model output (all streamed pieces joined).
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please generate 6 variations of the provided News Text Article, dividing them equally into three distinct journalistic styles: "
                f"1. Investigative (in-depth analysis with detailed evidence and complex arguments), "
                f"2. Editorial (opinionated and persuasive tone reflecting the writer's viewpoint), "
                f"3. Breaking News (concise, urgent, and fact-driven content). \n\n"
                f"For each style, generate 2 variations in full sentences. Output in the following format: \n"
                f"\"Investigative: 1. sentence 1, 2. sentence 2 \n"
                f"Editorial: 1. sentence 1, 2. sentence 2 \n"
                f"Breaking News: 1. sentence 1, 2. sentence 2 \"\n\n"
                f"Article: \"{article}\""
            ),
        }
    ]

    # Create the streaming request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Gather the streamed text. A chunk may carry no choices, or a delta whose
    # content is None; concatenating None onto a str would raise TypeError.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)


# Collect one record per parsed variation
augmented_rows = []

# Process the sampled articles and generate variations
for _, row in sampled_articles.iterrows():
    article = row['text']  # Assuming the column with the article text is named 'text'
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # Split the model output into lines
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Only lines containing ". " are kept; the text after the first ". " is the variation
        if '. ' not in variation:
            continue
        _, variation_text = variation.split('. ', 1)
        variation_text = variation_text.strip()
        # Skip lines with nothing after the marker: an empty string is written as a
        # blank CSV field, which pandas reads back as a missing value.
        if variation_text:
            augmented_rows.append({'text': variation_text, 'label': row['label']})


# Save the augmented data to a CSV file; explicit columns keep the header even with no rows
augmented_df = pd.DataFrame(augmented_rows, columns=['text', 'label'])
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_scitech.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 11735...
Generating variations for article 2365...
Generating variations for article 8617...
Generating variations for article 5751...
Generating variations for article 8347...
Generating variations for article 8223...
Generating variations for article 5964...
Generating variations for article 10992...
Generating variations for article 461...
Generating variations for article 8545...
Generating variations for article 391...
Generating variations for article 8359...
Generating variations for article 7180...
Generating variations for article 3784...
Generating variations for article 2492...
Generating variations for article 1859...
Generating variations for article 4743...
Generating variations for article 931...
Generating variations for article 10399...
Generating variations for article 11681...
Generating variations for article 4338...
Generating variations for article 10508...
Generating variations for article 12410...
Generating variations for artic

# Creating World Test Set

In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
import random

import os

# Initialize the Hugging Face Inference Client.
# The token is read from the environment so it is never stored in the notebook.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face API token.")
client = InferenceClient(api_key=hf_token)

# Load the imbalanced dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Keep only the World articles (label 0)
world_articles = df[df['label'] == 0]

# Randomly sample 100 articles (note: this cell uses seed 1, not 42)
sampled_articles = world_articles.sample(n=100, random_state=1)

# Function to generate variations using the Hugging Face API
def generate_variations(article):
    """Ask the hosted Llama chat model for six stylistic rewrites of one article.

    Sends a single user message that embeds ``article`` through the
    module-level ``client`` and streams the completion back.

    Args:
        article: Text of the news article to rewrite.

    Returns:
        str: The raw, unparsed model output (all streamed pieces joined).
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please generate 6 variations of the provided News Text Article, dividing them equally into three distinct journalistic styles: "
                f"1. Investigative (in-depth analysis with detailed evidence and complex arguments), "
                f"2. Editorial (opinionated and persuasive tone reflecting the writer's viewpoint), "
                f"3. Breaking News (concise, urgent, and fact-driven content). \n\n"
                f"For each style, generate 2 variations in full sentences. Output in the following format: \n"
                f"\"Investigative: 1. sentence 1, 2. sentence 2 \n"
                f"Editorial: 1. sentence 1, 2. sentence 2 \n"
                f"Breaking News: 1. sentence 1, 2. sentence 2 \"\n\n"
                f"Article: \"{article}\""
            ),
        }
    ]

    # Create the streaming request to the API
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        temperature=0.6,
        max_tokens=1792,
        top_p=0.7,
        stream=True
    )

    # Gather the streamed text. A chunk may carry no choices, or a delta whose
    # content is None; concatenating None onto a str would raise TypeError.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)


# Collect one record per parsed variation
augmented_rows = []

# Process the sampled articles and generate variations
for _, row in sampled_articles.iterrows():
    article = row['text']  # Assuming the column with the article text is named 'text'
    print(f"Generating variations for article {row.name + 1}...")
    variations = generate_variations(article)

    # Split the model output into lines
    split_variations = variations.split('\n')
    for variation in split_variations:
        # Only lines containing ". " are kept; the text after the first ". " is the variation
        if '. ' not in variation:
            continue
        _, variation_text = variation.split('. ', 1)
        variation_text = variation_text.strip()
        # Skip lines with nothing after the marker: an empty string is written as a
        # blank CSV field, which pandas reads back as a missing value.
        if variation_text:
            augmented_rows.append({'text': variation_text, 'label': row['label']})


# Save the augmented data to a CSV file; explicit columns keep the header even with no rows
augmented_df = pd.DataFrame(augmented_rows, columns=['text', 'label'])
augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world.csv', index=False)

# # Combine the original dataset with augmented data if needed
# augmented_df['original_index'] = sampled_articles.index.repeat(len(split_variations))  # Track the original indices
# augmented_df.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_with_original_indices.csv', index=False)


Generating variations for article 7079...
Generating variations for article 8231...
Generating variations for article 986...
Generating variations for article 1432...
Generating variations for article 11702...
Generating variations for article 12038...
Generating variations for article 7232...
Generating variations for article 605...
Generating variations for article 5756...
Generating variations for article 2087...
Generating variations for article 7731...
Generating variations for article 3911...
Generating variations for article 2718...
Generating variations for article 1428...
Generating variations for article 7118...
Generating variations for article 4398...
Generating variations for article 10576...
Generating variations for article 6839...
Generating variations for article 3282...
Generating variations for article 3480...
Generating variations for article 6943...
Generating variations for article 7337...
Generating variations for article 7283...
Generating variations for article

In [None]:
import pandas as pd

# Per-class CSVs of generated articles
files = [
    "/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world.csv",
    "/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_business.csv",
    "/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_sports.csv",
    "/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_scitech.csv",
]

# Rows drawn from each file, and the single seed used by every sampling step
samples_per_file = 600
random_state = 42

# List to store sampled dataframes
sampled_dfs = []

for file in files:
    df = pd.read_csv(file)
    # Fail with the offending file name instead of pandas' generic sampling error
    if len(df) < samples_per_file:
        raise ValueError(f"{file} has only {len(df)} rows; need at least {samples_per_file}.")
    sampled_df = df.sample(n=samples_per_file, random_state=random_state)
    sampled_dfs.append(sampled_df)

# Combine all sampled DataFrames, then shuffle so classes are interleaved
test_set = pd.concat(sampled_dfs, ignore_index=True)
test_set = test_set.sample(frac=1, random_state=random_state).reset_index(drop=True)


# Save the combined test set to a CSV file
output_path = "/content/drive/Shareddrives/682_Drive/682-Project/test_data.csv"
test_set.to_csv(output_path, index=False)

print(f"Test set created with {len(test_set)} articles and saved to {output_path}.")


Test set created with 2400 articles and saved to /content/drive/Shareddrives/682_Drive/682-Project/test_data.csv.


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Turn off Weights & Biases reporting through the WANDB_DISABLED environment variable.
os.environ["WANDB_DISABLED"] = "true"

# Read the training split (train_data_new.csv) and the augmented articles from the shared Drive folder.
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/train_data_new.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles.csv')

# Stack both frames row-wise; ignore_index=True renumbers the rows of the result.
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(train_df, test_path='/content/drive/Shareddrives/682_Drive/682-Project/test_data_new.csv'):
    """Build Hugging Face ``Dataset`` objects for training and evaluation.

    Args:
        train_df: DataFrame with a ``text`` column and a ``label`` column
            castable to int; used in full as the training set.
        test_path: CSV with the same two columns, loaded as the evaluation
            set. Defaults to the shared ``test_data_new.csv``.

    Returns:
        tuple: ``(train_data, test_data)`` datasets with ``text`` and ``label`` columns.
    """
    def _to_dataset(frame):
        # Labels are cast to int so both datasets share one label dtype.
        return Dataset.from_dict({'text': frame['text'], 'label': frame['label'].astype(int)})

    train_data = _to_dataset(train_df)
    test_data = _to_dataset(pd.read_csv(test_path))
    return train_data, test_data

# Build (train, test) datasets for the original-only frame and for the original + augmented frame.
# prepare_data loads the same test CSV on each call, so both runs are evaluated on identical data.
train_original, test_original = prepare_data(original_df)
train_combined, test_combined = prepare_data(combined_df)

# Load the tokenizer matching the roberta-base checkpoint that is fine-tuned below.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize all four datasets in batches
train_original = train_original.map(tokenize_function, batched=True)
test_original = test_original.map(tokenize_function, batched=True)
train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Training configuration shared by every run: evaluate and checkpoint once per epoch.
# NOTE(review): load_best_model_at_end=True picks the checkpoint using the Trainer's
# eval_dataset, and train_and_get_metrics passes the test set as eval_dataset, so
# checkpoint selection sees the test data. Consider a separate validation split.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Empty list: report to no external logger (e.g. wandb)
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report per-class test metrics.

    Args:
        train_data: Tokenized dataset used for training.
        test_data: Tokenized dataset used as the Trainer's eval set and for
            the final predictions; must expose a ``label`` column.

    Returns:
        dict: ``"Class 0"`` .. ``"Class 3"`` each mapped to its precision,
        recall and f1-score, plus an ``"accuracy"`` entry.
    """
    class_names = [f"Class {i}" for i in range(4)]

    # A new model per call, so successive runs never share fine-tuned weights.
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Highest-scoring class per example
    scores = trainer.predict(test_data).predictions
    preds = scores.argmax(axis=1)

    report = classification_report(
        test_data['label'], preds, target_names=class_names, output_dict=True
    )

    # Keep only precision / recall / F1 for each class
    metrics_per_class = {}
    for name in class_names:
        entry = report[name]
        metrics_per_class[name] = {
            "precision": entry['precision'],
            "recall": entry['recall'],
            "f1-score": entry['f1-score'],
        }

    # Overall accuracy sits next to the per-class entries
    metrics_per_class['accuracy'] = report['accuracy']
    return metrics_per_class


# Fine-tune one model per training set (original only, then original + augmented);
# each call starts from a fresh roberta-base checkpoint.
metrics_non_augmented = train_and_get_metrics(train_original, test_original)
metrics_augmented = train_and_get_metrics(train_combined, test_combined)

# Print the per-class metrics and accuracy of each run
print("Metrics on Non-Augmented Dataset:")
print(metrics_non_augmented)

print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)

# Classification- Style Transfer on new test data - Final!!

In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Turn off Weights & Biases reporting through the WANDB_DISABLED environment variable.
os.environ["WANDB_DISABLED"] = "true"

# Read the training split and the style-varied World articles generated for training.
original_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/train_data_new.csv')
augmented_df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world_train.csv')

# Stack both frames row-wise; ignore_index=True renumbers the rows of the result.
combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

def prepare_data(train_df, test_path='/content/drive/Shareddrives/682_Drive/682-Project/test_data_new.csv'):
    """Build Hugging Face ``Dataset`` objects for training and evaluation.

    Args:
        train_df: DataFrame with a ``text`` column and a ``label`` column
            castable to int; used in full as the training set.
        test_path: CSV with the same two columns, loaded as the evaluation
            set. Defaults to the shared ``test_data_new.csv``.

    Returns:
        tuple: ``(train_data, test_data)`` datasets with ``text`` and ``label`` columns.
    """
    def _to_dataset(frame):
        # Labels are cast to int so both datasets share one label dtype.
        return Dataset.from_dict({'text': frame['text'], 'label': frame['label'].astype(int)})

    train_data = _to_dataset(train_df)
    test_data = _to_dataset(pd.read_csv(test_path))
    return train_data, test_data

# Build the (train, test) datasets for the original + augmented frame; only this one run is prepared here.
train_combined, test_combined = prepare_data(combined_df)

# Load the tokenizer matching the roberta-base checkpoint that is fine-tuned below.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize the training and test datasets in batches

train_combined = train_combined.map(tokenize_function, batched=True)
test_combined = test_combined.map(tokenize_function, batched=True)

# Training configuration: evaluate and checkpoint once per epoch.
# NOTE(review): load_best_model_at_end=True picks the checkpoint using the Trainer's
# eval_dataset, and train_and_get_metrics passes the test set as eval_dataset, so
# checkpoint selection sees the test data. Consider a separate validation split.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[],  # Empty list: report to no external logger (e.g. wandb)
)

def train_and_get_metrics(train_data, test_data):
    """Fine-tune a fresh RoBERTa classifier and report per-class test metrics.

    Args:
        train_data: Tokenized dataset used for training.
        test_data: Tokenized dataset used as the Trainer's eval set and for
            the final predictions; must expose a ``label`` column.

    Returns:
        dict: ``"Class 0"`` .. ``"Class 3"`` each mapped to its precision,
        recall and f1-score, plus an ``"accuracy"`` entry.
    """
    class_names = [f"Class {i}" for i in range(4)]

    # A new model per call, so successive runs never share fine-tuned weights.
    classifier = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)

    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Highest-scoring class per example
    scores = trainer.predict(test_data).predictions
    preds = scores.argmax(axis=1)

    report = classification_report(
        test_data['label'], preds, target_names=class_names, output_dict=True
    )

    # Keep only precision / recall / F1 for each class
    metrics_per_class = {}
    for name in class_names:
        entry = report[name]
        metrics_per_class[name] = {
            "precision": entry['precision'],
            "recall": entry['recall'],
            "f1-score": entry['f1-score'],
        }

    # Overall accuracy sits next to the per-class entries
    metrics_per_class['accuracy'] = report['accuracy']
    return metrics_per_class



# Fine-tune on the original + augmented training set and evaluate on the test set
metrics_augmented = train_and_get_metrics(train_combined, test_combined)


# Print the per-class metrics and overall accuracy
print("\nMetrics on Augmented Dataset:")
print(metrics_augmented)


Map:   0%|          | 0/12671 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.287,0.233326
2,0.1772,0.259287
3,0.1182,0.235201



Metrics on Augmented Dataset:
{'Class 0': {'precision': 0.9787581699346405, 'recall': 0.9983333333333333, 'f1-score': 0.9884488448844885}, 'Class 1': {'precision': 0.9720853858784894, 'recall': 0.9866666666666667, 'f1-score': 0.9793217535153019}, 'Class 2': {'precision': 0.8254437869822485, 'recall': 0.93, 'f1-score': 0.8746081504702194}, 'Class 3': {'precision': 0.9423459244532804, 'recall': 0.79, 'f1-score': 0.8594741613780599}, 'accuracy': 0.92625}


In [7]:
import pandas as pd

# NOTE: this cell writes train_data_new.csv and test_data_new.csv, the files the
# RoBERTa training cells load, so it has to be run before them.

# Load the dataset
df = pd.read_csv('/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced_test1.csv')

# Target labels and the number of test samples to take per class
labels = [0, 1, 2, 3]
test_samples_per_class = 600

# Per-class pieces are collected here and concatenated once after the loop
test_parts = []
train_parts = []

for label in labels:
    # Filter articles of the current label
    label_data = df[df['label'] == label]
    if label == 0:
        # World test articles come from the generated-variations file; every
        # original World row stays in the training set.
        # NOTE(review): those variations were generated from rows of this same CSV,
        # so World test items may be rewrites of training articles. Confirm that
        # this overlap is intended.
        world_df = pd.read_csv("/content/drive/Shareddrives/682_Drive/682-Project/augmented_articles_diff_tones_world.csv")
        test_subset = world_df.sample(n=test_samples_per_class, random_state=42)
        train_subset = label_data
    else:
        # Randomly sample the test articles for this class
        test_subset = label_data.sample(n=test_samples_per_class, random_state=42)

        # The remaining articles of this class go to training
        train_subset = label_data.drop(test_subset.index)

    test_parts.append(test_subset)
    train_parts.append(train_subset)

test_data = pd.concat(test_parts, ignore_index=True)
train_data = pd.concat(train_parts, ignore_index=True)

# Shuffle the test set so classes are interleaved, then save both splits
test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)
test_data.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/test_data_new.csv', index=False)
train_data.to_csv('/content/drive/Shareddrives/682_Drive/682-Project/train_data_new.csv', index=False)

print("Test and train datasets created successfully!")


Test and train datasets created successfully!
