In [None]:
# Mount Google Drive in the Colab runtime and switch into the project folder.
from google.colab import drive
drive.mount('/content/drive')
drive_folder = '/content/drive/Shareddrives/682_Drive'
# Adjust this line to point at the project folder inside your Google Drive
notebook_folder = drive_folder + '/682-Project'
# IPython magic: change the working directory to the project folder.
%cd {notebook_folder}

In [None]:
# Mount Google Drive at /content/drive (the same call the first cell makes);
# lets this part of the notebook be run without executing the first cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
import requests
from tqdm import tqdm

# Hugging Face API key and model.
# The key is read from the HF_API_KEY environment variable when it is set, so a
# real token does not have to be stored in the notebook; otherwise the
# placeholder below is used and must be replaced by hand.
HF_API_KEY = os.environ.get('HF_API_KEY', '<your_hf_api_key>')
MODEL_ID = "mistralai/Mistral-7B-v0.1"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

# File paths
input_file_path = '/content/drive/Shareddrives/682_Drive/682-Project/ag_news_train_imbalanced.csv'
output_file_path = '/content/drive/Shareddrives/682_Drive/682-Project/ag_news_with_LLM_augmented_world_new_approach.csv'
random_samples_path = '/content/drive/Shareddrives/682_Drive/682-Project/random_samples_new_approach.csv'
augmented_samples_path = '/content/drive/Shareddrives/682_Drive/682-Project/augmented_samples_new_approach.csv'

# Load the dataset and keep the rows of the "World News" class (label 0)
imbalanced_dataset = pd.read_csv(input_file_path)
world_data = imbalanced_dataset[imbalanced_dataset['label'] == 0]

# Set the number of augmented samples to generate
total_augmented_samples = 20
batch_size = 10
# Round up so that a total which is not a multiple of batch_size still gets
# enough batches (floor division would silently drop the remainder).
num_batches = -(-total_augmented_samples // batch_size)

# Step 1: Randomly select samples from the "World News" class.
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
num_prompt_examples = min(5, len(world_data))
random_samples = world_data.sample(n=num_prompt_examples, random_state=42)['text'].tolist()


# Step 2: Build the few-shot prompt: one instruction sentence followed by the
# sampled articles, each labelled "Example N:" and terminated by a blank line.
prompt_instruction = (
    "Generate 10 new detailed, unique, and high-quality news articles for the 'World News' category which are similar to the below samples. "
)
prompt_examples = [
    f"Example {position}: {article_text}\n\n"
    for position, article_text in enumerate(random_samples, start=1)
]
base_prompt = prompt_instruction + "".join(prompt_examples)

print(base_prompt)
# NOTE: an earlier draft appended an "Output:" cue plus ten "Article N:"
# placeholders to the prompt; that variant is currently disabled.

# Processing Generated Output
def split_articles(generated_texts):
    """Break generated texts into candidate articles.

    Every text is cut at each occurrence of the literal word "Example" and the
    resulting pieces are stripped of surrounding whitespace. A piece is kept
    only if it is non-empty and contains no ":" character, so pieces that
    still carry a "N: ..." label (or any other colon) are discarded.

    Args:
        generated_texts: Iterable of strings to split.

    Returns:
        List of kept pieces, in order of appearance.
    """
    stripped_pieces = (
        piece.strip()
        for generated in generated_texts
        for piece in generated.split("Example")
    )
    return [piece for piece in stripped_pieces if piece and ":" not in piece]

#print(base_prompt)
# Function to call the Hugging Face API for text generation
def generate_augmented_samples_with_flan_t5(prompt, num_samples, batch_size):
    """Send one text-generation request to the Hugging Face Inference API.

    Despite the function name, the request goes to whatever model API_URL
    points at. Failures are reported with a printed message and an empty
    result instead of an exception, so one bad request does not abort the
    caller's batch loop.

    Args:
        prompt: Prompt text sent as the request's "inputs".
        num_samples: Unused; kept so existing callers keep working.
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        A list holding the single generated text on success, or an empty
        list if the request failed or the response had an unexpected shape.
    """
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 300,  # Adjust max tokens per article
            "temperature": 0.8,
            "top_k": 50,
            "top_p": 0.9,
        },
    }

    try:
        # (connect, read) timeouts: without them requests waits indefinitely
        # on a stalled connection. Generation can be slow, hence the long read.
        response = requests.post(API_URL, json=payload, headers=headers, timeout=(10, 300))
    except requests.RequestException as exc:
        print(f"Error: request failed, {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return []

    try:
        output_text = response.json()[0]['generated_text']
    except (ValueError, LookupError, TypeError) as exc:
        # Body was not JSON, or not a non-empty list of {"generated_text": ...}.
        print(f"Error: unexpected response format ({exc!r}), {response.text}")
        return []

    print(output_text)
    return [output_text]

# Step 3: Request one generation per batch and collect every returned text
all_augmented_texts = []
batch_progress = tqdm(range(num_batches), desc=f"Generating batches with {MODEL_ID}")
for _ in batch_progress:
    all_augmented_texts.extend(
        generate_augmented_samples_with_flan_t5(
            base_prompt,
            num_samples=batch_size,
            batch_size=batch_size,
        )
    )


# Step 4: Save the seed samples that were used as prompt examples
pd.DataFrame({'text': random_samples}).to_csv(random_samples_path, index=False)

# Split the generated text into individual articles
augmented_articles = split_articles(all_augmented_texts)
# Keep at most the requested number of augmented samples
kept_articles = augmented_articles[:total_augmented_samples]

# Step 5: Save augmented samples in CSV
augmented_df = pd.DataFrame({'text': kept_articles, 'label': 0})
augmented_df.to_csv(augmented_samples_path, index=False)

# Step 6: Append the augmented texts to the original "World News" rows.
# This is kept in memory rather than appended to output_file_path: that file is
# overwritten with the full dataset below, so appending to it and reading it
# back would pull the previous run's complete dataset in as "world" rows and
# duplicate data on every re-run.
combined_world_data = pd.concat([world_data, augmented_df]).reset_index(drop=True)

print(f"Successfully augmented {len(kept_articles)} texts for the 'World News' class.")

# Step 7: Combine the augmented "World News" data with the other classes
augmented_world_data = combined_world_data
sports_data = imbalanced_dataset[imbalanced_dataset['label'] == 1]
business_data = imbalanced_dataset[imbalanced_dataset['label'] == 2]
sci_tech_data = imbalanced_dataset[imbalanced_dataset['label'] == 3]

# Combine and shuffle the dataset, then save to CSV (single write, replaces any old file)
final_data = pd.concat([augmented_world_data, sports_data, business_data, sci_tech_data]).sample(frac=1, random_state=42).reset_index(drop=True)
final_data.to_csv(output_file_path, index=False)

print("Class distribution in the augmented dataset:")
print(final_data['label'].value_counts())
