<a href="https://www.kaggle.com/code/emrekaany/data-generating-with-gemini-for-sentiment-model?scriptVersionId=236212353" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
pip install google-generativeai pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
# -*- coding: utf-8 -*-
"""
Generates a financial sentiment dataset using the Gemini API
and merges it with an existing dataset.

This script prompts a generative model (like Gemini) to create longer text examples
(aiming for >50 words) for predefined financial sentiment labels and merges them
with data from an existing CSV file before saving.
"""

import google.generativeai as genai
import pandas as pd
import os
import time
import logging
from tqdm import tqdm  # Optional: for progress bar (pip install tqdm)
from kaggle_secrets import UserSecretsClient


# --- Configuration ---
# Sentiment classes for which synthetic comments are requested.
SENTIMENT_LABELS = ["strong buy", "buy", "hold", "sell", "strong sell"]

# How many LONG comments to ask for per class. Kept small because long
# generations cost more time and tokens.
EXAMPLES_PER_LABEL = 10

# Gemini model identifier (availability depends on region / API access).
# Alternatives: 'gemini-pro', 'gemini-1.0-pro', etc.
MODEL_NAME = "gemini-1.5-flash"

# --- File Paths ---
# Dataset produced by an earlier run; it is read back in and extended.
EXISTING_DATA_FILE = "merged_financial_sentiment_dataset.csv"
# Destination of the MERGED dataset. This is the same file name as the input,
# so a re-run extends the previous result in place.
OUTPUT_CSV_FILE = "merged_financial_sentiment_dataset.csv"

# Pause (seconds) between consecutive API calls, to stay within rate limits.
API_CALL_DELAY = 2

# Fetch the Gemini API key from the Kaggle secrets store.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GEMINI_API_KEY")
api_key = secret_value_0

# Root logger: timestamped INFO-level messages.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# --- Helper Function (Modified for Longer Comments) ---
def _split_into_examples(generated_text, num_examples):
    """Split raw model output into individual example strings.

    Paragraphs are taken to be runs of non-blank lines separated by one or
    more blank lines; a line holding only whitespace counts as blank, and any
    line-ending convention is accepted. If that yields fewer than half of the
    requested examples, the model most likely put one example per line, so
    every non-blank line is returned as its own example instead.
    """
    lines = generated_text.splitlines()

    paragraphs = []
    current = []
    for line in lines:
        if line.strip():
            current.append(line)
        elif current:
            paragraphs.append("\n".join(current).strip())
            current = []
    if current:
        paragraphs.append("\n".join(current).strip())

    if len(paragraphs) < num_examples / 2:
        # Blank-line splitting did not work well: fall back to one per line.
        return [line.strip() for line in lines if line.strip()]
    return paragraphs


def generate_long_examples_for_label(model, label, num_examples):
    """
    Generates LONGER text examples for a given sentiment label using the generative model.

    Args:
        model: The configured generative model instance. Only its
            ``generate_content(prompt, generation_config=...)`` method is used.
        label (str): The sentiment label (e.g., "buy", "hold").
        num_examples (int): The number of examples to request.

    Returns:
        list: A list of generated text examples (strings), or None if the API
        call fails, the prompt is blocked, or the response has no usable text.
        The list length is whatever the model produced; it is not forced to
        equal ``num_examples``.
    """
    # Prompt asks for longer comments (aiming for > 50 words)
    prompt = f"""
    Generate {num_examples} diverse and realistic **detailed comments or short paragraphs**, each ideally **between 50 and 100 words**, that clearly express a '{label}' sentiment towards a stock, investment, or financial asset.
    Focus on language typically used by financial analysts, traders, or investors in reports, detailed commentary, or investment forums, going beyond simple short phrases.
    Provide reasoning or context within the comment where appropriate for the sentiment.
    Each comment/paragraph should be separated by a blank line. Do not include bullet points or numbering before each comment.

    Example structure for '{label}':
    - If '{label}' is 'strong buy': Start with a strong positive assertion, mention key drivers like earnings growth, market position, or valuation, and conclude with a confident outlook. Aim for 50+ words.
    - If '{label}' is 'hold': Express a neutral or cautious stance, perhaps citing balanced risk/reward, waiting for specific catalysts or data, or fair valuation. Explain why neither buying more nor selling is advised currently. Aim for 50+ words.
    - If '{label}' is 'sell': Clearly state the negative outlook, provide reasons such as declining fundamentals, competitive threats, overvaluation, or macroeconomic headwinds, and suggest exiting the position. Aim for 50+ words.

    Generate {num_examples} detailed examples for '{label}':
    """

    try:
        logging.info(f"Generating {num_examples} LONG examples for label: '{label}'...")
        response = model.generate_content(
            prompt,
            # A plain dict is an accepted form of generation config and keeps
            # this helper independent of the module-level SDK alias.
            # Slightly higher temperature helps with longer, creative text.
            generation_config={"temperature": 0.75},
        )
    except Exception as e:
        # Best-effort: any API/transport failure skips this label.
        logging.error(f"An error occurred while generating examples for label '{label}': {e}")
        return None

    # The `.text` accessor raises ValueError (not AttributeError) when the
    # response carries no usable text, e.g. because the prompt was blocked,
    # so it has to be read inside try/except rather than probed with hasattr().
    try:
        generated_text = response.text
    except (ValueError, AttributeError) as e:
        feedback = getattr(response, 'prompt_feedback', None)
        block_reason = getattr(feedback, 'block_reason', None)
        if block_reason:
            logging.error(f"API call blocked for label '{label}'. Reason: {block_reason}")
        else:
            logging.warning(f"Received an empty or unexpected response for label '{label}'. Details: {e}")
        return None

    examples = _split_into_examples(generated_text or "", num_examples)
    if not examples:
        logging.warning(f"Received an empty or unexpected response for label '{label}'. No text to split.")
        return None

    logging.info(f"Successfully generated {len(examples)} raw examples for '{label}'.")
    return examples


logging.info("--- Starting Financial Sentiment Dataset Generation (Long Comments & Merge) ---")

# --- Load Existing Data ---
# Read-only copy of the dataset attached to the notebook as a Kaggle input.
KAGGLE_INPUT_DATA_FILE = "/kaggle/input/financial-comments-for-sentiment-analysis/merged_financial_sentiment_dataset.csv"


def _load_existing_dataset(candidate_paths):
    """Return the first usable existing dataset, or an empty DataFrame.

    Each path is tried in order. A candidate is usable when the file exists,
    can be parsed as CSV and has both a 'text' and a 'label' column. Exact
    (text, label) duplicates are dropped from the dataset that is returned.
    Problems with one candidate are logged and the next one is tried, so a
    broken file never aborts the run.
    """
    for path in candidate_paths:
        if not os.path.exists(path):
            logging.warning(f"Existing dataset file '{path}' not found.")
            continue
        try:
            logging.info(f"Loading existing dataset from: {path}")
            loaded = pd.read_csv(path)
        except Exception as e:
            logging.error(f"Error loading existing dataset '{path}': {e}. Proceeding without it.")
            continue
        # Basic validation
        if 'text' not in loaded.columns or 'label' not in loaded.columns:
            logging.warning(f"Existing dataset '{path}' is missing 'text' or 'label' columns. It will be ignored.")
            continue
        logging.info(f"Loaded {len(loaded)} records from existing dataset.")
        loaded = loaded.drop_duplicates(subset=['text', 'label'])
        logging.info(f"{len(loaded)} unique records remaining after checking old data.")
        return loaded

    logging.warning("No usable existing dataset found. Only new data will be generated.")
    return pd.DataFrame()


# A file written by a previous run in the working directory takes precedence
# over the Kaggle input copy.
old_df = _load_existing_dataset([EXISTING_DATA_FILE, KAGGLE_INPUT_DATA_FILE])

# --- Configure API ---
api_configured = False
if not api_key:
    # ** CRITICAL SECURITY WARNING **
    logging.error("CRITICAL: API key is missing or empty. Using hardcoded keys is insecure.")
    print("\nCRITICAL SECURITY WARNING:")
    print("No API key found or provided key is empty.")
    print("Using hardcoded API keys is a major security risk.")
    print("Please configure the API key securely (e.g., using environment variables) and restart.")
else:
    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        logging.error(f"Error configuring Gemini API: {e}")
    else:
        api_configured = True
        logging.info("Gemini API key configured. (WARNING: Ensure key is handled securely)")

# --- Initialize the Generative Model ---
# `model` stays None whenever generation cannot run, so the steps below can
# skip it instead of failing on an undefined name.
model = None
if api_configured:
    try:
        model = genai.GenerativeModel(MODEL_NAME)
    except Exception as e:
        logging.error(f"Error initializing model '{MODEL_NAME}': {e}")
        print(f"\nCould not initialize model '{MODEL_NAME}'. Check if the model name is correct and available.")
    else:
        logging.info(f"Generative model '{MODEL_NAME}' initialized.")

# --- Generate New LONG Data ---
new_long_comments_data = []
total_labels = len(SENTIMENT_LABELS)

if model is None:
    logging.error("Generative model is not available. Skipping generation of new comments.")
else:
    logging.info(f"Generating {EXAMPLES_PER_LABEL} LONG comments for each of {total_labels} labels: {', '.join(SENTIMENT_LABELS)}")

    label_iterator = tqdm(SENTIMENT_LABELS, desc="Generating Long Comments")
    for i, label in enumerate(label_iterator):
        generated_examples = generate_long_examples_for_label(model, label, EXAMPLES_PER_LABEL)

        if generated_examples:
            new_long_comments_data.extend(
                {'text': example, 'label': label} for example in generated_examples
            )
        else:
            logging.warning(f"Skipping label '{label}' for long comments due to generation failure or empty response.")

        # Add delay between API calls (not needed after the last one)
        if i < total_labels - 1:
            logging.debug(f"Waiting for {API_CALL_DELAY} seconds before next API call...")
            time.sleep(API_CALL_DELAY)

# --- Merge DataFrames ---
# Every branch below binds `df`, the dataset that the save step works on.
if new_long_comments_data:
    logging.info(f"Total new long comments generated: {len(new_long_comments_data)}")
    new_df = pd.DataFrame(new_long_comments_data)

    # Remove duplicates within the new data
    new_df.drop_duplicates(subset=['text', 'label'], inplace=True)
    logging.info(f"{len(new_df)} unique new long comments generated.")

    logging.info("Merging existing data with newly generated long comments...")
    # Leave out an empty old dataset so it cannot influence the merged dtypes.
    frames = [frame for frame in (old_df, new_df) if not frame.empty]
    df = pd.concat(frames, ignore_index=True)
    logging.info(f"Total records before final duplicate check: {len(df)}")

    # Final check for duplicates across the entire merged dataset
    initial_merged_rows = len(df)
    df.drop_duplicates(subset=['text', 'label'], inplace=True)
    duplicates_removed = initial_merged_rows - len(df)
    if duplicates_removed > 0:
        logging.info(f"Removed {duplicates_removed} duplicate entries from the final merged dataset.")
elif not old_df.empty:
    logging.warning("No new long comments were generated.")
    # Generation failed, but the old data is still worth writing out.
    logging.warning("Proceeding with only the previously loaded data.")
    df = old_df
else:
    logging.warning("No new long comments were generated.")
    logging.error("No existing data loaded and no new data generated. Exiting.")
    print("\nFailed to generate any new data and no existing data found. Cannot create merged file.")
    df = pd.DataFrame()

# --- Save Merged DataFrame to CSV ---
if df.empty:
    logging.error("Merged dataset is empty. Nothing to save.")
    print("\nResulting dataset is empty. No file will be saved.")
else:
    logging.info(f"Final merged dataset contains {len(df)} records.")
    try:
        df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8')
        logging.info(f"Merged dataset successfully saved to '{OUTPUT_CSV_FILE}'")
        print(f"\nMerged dataset saved to {OUTPUT_CSV_FILE}")
        print(f"Total rows: {len(df)}")
        print("\nSample data (last 5 rows might include new long comments):")
        print(df.tail())  # Show tail to potentially see new comments
        print("\nLabel distribution (merged):")
        print(df['label'].value_counts())
    except Exception as e:
        logging.error(f"Error saving merged DataFrame to CSV: {e}")
        print(f"\nError saving the merged dataset to {OUTPUT_CSV_FILE}. Check permissions or disk space.")

logging.info("--- Dataset Generation Finished ---")




Generating Long Comments: 100%|██████████| 5/5 [00:32<00:00,  6.40s/it]


Merged dataset saved to merged_financial_sentiment_dataset.csv
Total rows: 367

Sample data (last 5 rows might include new long comments):
                                                  text        label
362  Epsilon Inc.'s recent product recall, coupled ...  strong sell
363  Zeta Holdings’ failure to meet its Q4 revenue ...  strong sell
364  The emergence of a disruptive technology from ...  strong sell
365  Our analysis indicates that Omega Pharmaceutic...  strong sell
366  Based on our proprietary valuation model, whic...  strong sell

Label distribution (merged):
label
sell           76
strong buy     74
buy            74
hold           73
strong sell    70
Name: count, dtype: int64



