In [None]:
# Mount Google Drive so the shared project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
drive_folder = '/content/drive/Shareddrives/682_Drive'
# Adjust this line to point at the project folder inside the shared drive.
notebook_folder = drive_folder + '/682-Project'
# Make the project folder the working directory for the rest of the notebook.
%cd {notebook_folder}

In [None]:
# NOTE(review): this cell repeats the import and mount call from the first cell
# and looks redundant — consider removing it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import os
import random
import re

import pandas as pd
import requests
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)
print(f"Using device: {device}")

In [None]:
# Install bitsandbytes and upgrade transformers/accelerate in this runtime,
# then import the classes used by the model-loading cells.
# NOTE(review): transformers is already imported in an earlier cell, so the upgrade
# below presumably only takes effect after a runtime restart — confirm.
!pip install bitsandbytes
# !pip install -U bitsandbytes
!pip install -U transformers accelerate
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoConfig

In [None]:
# Quantize the weights to 8-bit at load time (use load_in_4bit=True for 4-bit instead).
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Read the Hugging Face access token from the HF_TOKEN environment variable so the
# secret is never saved inside the notebook. The placeholder is kept only as a
# fallback so the cell behaves as before when the variable is not set.
HF_token = os.environ.get("HF_TOKEN", '<your-HF-token>')

# Llama checkpoint to load; change this to the Llama variant you have access to.
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    token=HF_token,
)

# Load the configuration
config = AutoConfig.from_pretrained(model_name, token=HF_token)

print("Model loaded successfully!")

In [None]:
# Load the FLAN-T5 base checkpoint; this rebinds `tokenizer` and `model`.
flan_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(flan_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_checkpoint)

In [None]:
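# NOTE: the commented-out code below is an earlier draft of generate_augmented_samples;
# the active definition is in the next cell.
# import torch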

# def generate_augmented_samples(seed_text, model, tokenizer, num_samples=1):
#     prompt = (
#     """Generate 5 unique and original news articles for the 'World News' category.
#     Each article should:
#     - Be 3-5 sentences long.
#     - Focus on global events similar to the provided example.
#     - Avoid repeating content or phrases from the example."""

#     "Output in the format:"
#     "Article 1: <text> Article 2: <text> Article 3: <text> Article 4: <text> Article 5: <text>\n \n"
#         f"Example Text: \"{seed_text}\""
#     )
#     # Tokenize the input prompt
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     all_generated_texts = []

#     # Generate text one sample at a time
#     for _ in range(num_samples):
#         output = model.generate(
#             **inputs,
#             max_new_tokens=300,  # Adjust max tokens per article
#             temperature=0.8,
#             top_k=50,
#             top_p=0.9,
#             do_sample=True,  # Enable sampling for diversity
#         )

#         # Decode generated tokens and append to results
#         generated_text = tokenizer.decode(output[0], skip_special_tokens=True)  # Decode single output
#         print(generated_text)  # Optional: Print the output
#         all_generated_texts.append(generated_text)

#     return all_generated_texts


In [None]:

# Prompt slot text and header pattern shared by the prompt builder and the parser,
# so both always agree on the "Article N:" format.
_ARTICLE_PLACEHOLDER = "[Start writing here...]"
_ARTICLE_HEADER_RE = re.compile(r"Article\s*\d+\s*:")


def _build_article_prompt(seed_text, num_articles):
    """Build the instruction prompt with one numbered slot per requested article."""
    slots = "\n\n".join(
        f"Article {idx}:\n{_ARTICLE_PLACEHOLDER}" for idx in range(1, num_articles + 1)
    )
    return (
        f"Generate {num_articles} original news articles for the 'World News' category, "
        "using the following example as inspiration. \n\n"
        f"Example: {seed_text}\n\n"
        "Now, write the new articles below:\n\n"
        f"{slots}\n"
    )


def _extract_articles(generated_text, num_articles):
    """Split model output on 'Article N:' headers and return the cleaned, non-empty bodies."""
    # Everything before the first header is preamble, hence the [1:].
    bodies = (piece.strip() for piece in _ARTICLE_HEADER_RE.split(generated_text)[1:])
    # Drop empty bodies and slots the model merely echoed back unfilled.
    articles = [body for body in bodies if body and body != _ARTICLE_PLACEHOLDER]
    return articles[:num_articles]


def generate_augmented_samples(seed_text, model, tokenizer, num_articles=5,
                               max_new_tokens=300, verbose=False):
    """Generate synthetic 'World News' articles inspired by a seed example.

    Args:
        seed_text: Example article inserted into the prompt as inspiration.
        model: Hugging Face generation model (causal or seq2seq) exposing
            ``generate``, ``device`` and ``config``.
        tokenizer: Tokenizer matching ``model``. If it has no pad token, one is
            assigned here (this mutates the tokenizer, and the model embeddings
            when a brand-new ``[PAD]`` token has to be added).
        num_articles: Number of articles to request and the maximum number returned.
        max_new_tokens: Generation budget passed to ``model.generate``.
        verbose: When True, print the raw generated text and the parsed articles.

    Returns:
        A list of at most ``num_articles`` article strings. The list may be shorter
        (or empty) when the model output does not follow the ``Article N:`` format.
    """
    if num_articles <= 0:
        return []

    # padding=True below needs a pad token; reuse EOS when the tokenizer lacks one.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))  # Adjust model embeddings for the new token

    prompt = _build_article_prompt(seed_text, num_articles)

    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_k=40,
            top_p=0.9,
        )

    generated_ids = outputs[0]
    # Decoder-only models return prompt + continuation. Strip the prompt tokens so
    # the "Article N:" slots from the prompt are not parsed as generated articles.
    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    if not is_encoder_decoder:
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = generated_ids[prompt_length:]

    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    articles = _extract_articles(generated_text, num_articles)

    if verbose:
        print(generated_text)
        print("Printing articles ----")
        print(articles)

    return articles


In [None]:
import os
import pandas as pd
from tqdm import tqdm



# File paths — every input and output lives in the shared project folder.
PROJECT_DIR = '/content/drive/Shareddrives/682_Drive/682-Project'
input_file_path = os.path.join(PROJECT_DIR, 'ag_news_train_imbalanced.csv')
random_samples_path = os.path.join(PROJECT_DIR, 'random_samples_new_approach.csv')
augmented_samples_path = os.path.join(PROJECT_DIR, 'augmented_samples_new_approach.csv')
output_file_path = os.path.join(PROJECT_DIR, 'ag_news_with_LLM_augmented_world_new_approach.csv')

# Tunable settings for this run.
WORLD_LABEL = 0          # label id used for the "World News" category
NUM_SEED_SAMPLES = 1     # earlier comments/messages said 10; raise this for a full run
ARTICLES_PER_SEED = 5    # articles requested from the model per seed text

# Load the dataset
imbalanced_dataset = pd.read_csv(input_file_path)
world_data = imbalanced_dataset[imbalanced_dataset['label'] == WORLD_LABEL]

# Step 1: Randomly select seed samples from the "World News" category
random_samples = world_data.sample(n=NUM_SEED_SAMPLES, random_state=42)
random_samples.to_csv(random_samples_path, index=False)  # Save selected samples to CSV
print(f"Randomly selected {len(random_samples)} samples saved to {random_samples_path}.")

# Initialize variables
all_augmented_texts = []

# Step 2: Generate articles for each seed sample
for seed_text in tqdm(random_samples['text'], desc="Generating augmented articles"):
    augmented_texts = generate_augmented_samples(
        seed_text, model, tokenizer, num_articles=ARTICLES_PER_SEED
    )
    all_augmented_texts.extend(augmented_texts)

# Step 3: Save augmented articles to a CSV file
augmented_df = pd.DataFrame({'text': all_augmented_texts, 'label': WORLD_LABEL})
augmented_df.to_csv(augmented_samples_path, index=False)
print(f"Augmented articles saved to {augmented_samples_path}.")
if augmented_df.empty:
    # Make a silent failure visible: the model output did not parse into any article.
    print("Warning: no augmented articles were generated; the dataset is unchanged.")

# Step 4: Combine augmented data with the original "World News" rows.
# (Kept in memory only; the output file is written once, in step 5, instead of
# being written here and then immediately overwritten.)
combined_world_data = pd.concat([world_data, augmented_df]).reset_index(drop=True)

# Step 5: Merge with other categories and shuffle the dataset
sports_data = imbalanced_dataset[imbalanced_dataset['label'] == 1]
business_data = imbalanced_dataset[imbalanced_dataset['label'] == 2]
sci_tech_data = imbalanced_dataset[imbalanced_dataset['label'] == 3]

final_data = (
    pd.concat([combined_world_data, sports_data, business_data, sci_tech_data])
    .sample(frac=1, random_state=42)  # fixed seed keeps the shuffle reproducible
    .reset_index(drop=True)
)
final_data.to_csv(output_file_path, index=False)
print(f"Combined dataset with augmented data saved to {output_file_path}.")

# Display class distribution
print("Class distribution in the augmented dataset:")
print(final_data['label'].value_counts())
