In [1]:
pip install google-generativeai pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install Faker

Collecting Faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-37.1.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os
import time # Import time for potential delays/retries
import json # Import JSON for parsing API response
# --- Import Kaggle Secrets ---
try:
    from kaggle_secrets import UserSecretsClient
    print("Imported UserSecretsClient from kaggle_secrets.")
except ImportError:
    print("Warning: kaggle_secrets not found. API key retrieval might fail if not set otherwise.")
    UserSecretsClient = None # Define as None if import fails

# --- Try importing the Google Generative AI library ---
try:
    import google.generativeai as genai
    # from google.generativeai.types import GenerationConfig # For explicit JSON config if needed
    print("Successfully imported google.generativeai.")
except ImportError:
    print("ERROR: google-generativeai library not found.")
    print("Please install it: pip install google-generativeai")
    # Exit or raise an error if the library is essential and not found
    raise ImportError("google-generativeai is required for this script.")


# --- Configuration ---
# Number of synthetic reviews to produce.
NUM_REVIEWS = 400
# Reviews are spread over a customer pool 90% the size of the review count;
# int() truncates the product so the pool size is a whole number.
CUSTOMER_POOL_RATIO = 0.9
NUM_CUSTOMERS = int(CUSTOMER_POOL_RATIO * NUM_REVIEWS)

# Review dates are drawn between a fixed start and the moment this cell runs.
START_DATE = datetime(2022, 1, 1)
END_DATE = datetime.now()

# Faker instance used by the template-based fallback text generator.
fake = Faker()

print(f"Generating {NUM_REVIEWS} reviews from a pool of {NUM_CUSTOMERS} customers.")

# --- Gemini API Configuration ---
# IMPORTANT: Store your API key securely! Use Kaggle Secrets or an environment
# variable; never paste the key into the notebook.
#
# Key lookup order:
#   1. Kaggle Secret (only when kaggle_secrets could be imported)
#   2. Environment variable of the same name
# If neither yields a key, or configuring the client fails, `model` is left as
# None and the generation code below uses the Faker templates instead.
GEMINI_SECRET_NAME = "GEMINI_API_KEY_INSURANCE_REVIEW"
api_key = None
model = None

if UserSecretsClient:  # None when the kaggle_secrets import failed
    print("Attempting to retrieve GEMINI_API_KEY from Kaggle Secrets.")
    try:
        api_key = UserSecretsClient().get_secret(GEMINI_SECRET_NAME)
    except Exception as e:
        # A failed secret lookup must not skip the environment-variable
        # fallback, so it is reported here and the lookup chain continues.
        print(f"Warning: Could not read Kaggle Secret '{GEMINI_SECRET_NAME}': {e}")

if not api_key:
    api_key = os.getenv(GEMINI_SECRET_NAME)
    if api_key:
        print("Retrieved GEMINI_API_KEY from environment variable.")

try:
    if not api_key:
        # Configuring the client with a dummy key would leave `model` truthy,
        # and every later request would wait out the rate-limit delay before
        # failing. Bail out now so the Faker fallback is used directly.
        raise ValueError(
            f"Gemini API key not found. Set the '{GEMINI_SECRET_NAME}' "
            "Kaggle Secret or environment variable."
        )

    genai.configure(api_key=api_key)
    # Select the model - ensure it's good at following instructions like JSON output
    # gemini-1.5-flash is often good for structured output and speed.
    model = genai.GenerativeModel('gemini-1.5-flash') # Model being used

    # Optional: Configure for JSON output explicitly if the model supports it well
    # generation_config = GenerationConfig(response_mime_type="application/json")
    # model = genai.GenerativeModel('gemini-1.5-flash', generation_config=generation_config) # Pass config if using

    print("Gemini API configured successfully.")

except Exception as e:
    print(f"ERROR: Failed to configure Gemini API: {e}")
    print("Text generation will fall back to using Faker templates.")
    model = None # Ensure model is None if configuration fails

# --- Helper Functions ---

def random_date(start, end):
    """Pick a random datetime in the closed range [start, end].

    The gap between the two datetimes is truncated to whole seconds and a
    random integer offset within that gap is added to ``start``.
    ``end`` is expected to be no earlier than ``start``.
    """
    span_seconds = int((end - start).total_seconds())
    offset_seconds = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset_seconds)

def generate_fallback_details():
    """Build a template-based review without calling the API.

    A rating from 1 to 5 is drawn first; it decides the sentiment label
    (4-5 Positive, 1-2 Negative, 3 Neutral) and which closing phrases and
    CRM tasks are eligible. The review text is one Faker sentence of
    10-35 words followed by a closing phrase.

    Returns:
        tuple: (rating, sentiment, text, task)
    """
    rating = random.randint(1, 5)

    # Choose the phrase/task pools for the rating bucket.
    if rating >= 4:
        sentiment = 'Positive'
        closing_phrases = [
            "Amazing service!", "Very satisfied.", "Exceeded expectations.", "Highly recommend.", "Smooth process."
        ]
        task_options = ["Send thank you note.", "Request testimonial.", "Log positive feedback."]
    elif rating <= 2:
        sentiment = 'Negative'
        closing_phrases = [
            "Very disappointed.", "Difficult process.", "Poor communication.", "Would not recommend.", "Problem not solved."
        ]
        task_options = ["Escalate to manager.", "Schedule follow-up call.", "Offer resolution.", "Investigate complaint."]
    else:  # rating == 3
        sentiment = 'Neutral'
        closing_phrases = [
            "It was adequate.", "Service was average.", "Met basic expectations.", "Neither good nor bad.", "Okay experience."
        ]
        task_options = ["Monitor account.", "Check for potential issues.", "Log neutral feedback."]

    # Draw order matters for reproducibility: sentence length, sentence,
    # closing phrase, then task.
    filler_sentence = fake.sentence(nb_words=random.randint(10, 35))
    text = filler_sentence + " " + random.choice(closing_phrases)
    task = random.choice(task_options)

    return rating, sentiment, text, task

_REQUIRED_REVIEW_KEYS = ("review_text", "sentiment", "suggested_task")
_VALID_SENTIMENTS = ('Positive', 'Negative', 'Neutral')


def _extract_json_object(response_text):
    """Return the span from the first '{' to the last '}' of response_text, or None if there is no such span."""
    json_start = response_text.find('{')
    # rfind() yields -1 when '}' is absent, so after the +1 the "not found"
    # value is 0, not -1. Comparing against json_start covers both a missing
    # '}' and a '}' that only appears before the first '{'.
    json_end = response_text.rfind('}') + 1
    if json_start == -1 or json_end <= json_start:
        return None
    return response_text[json_start:json_end]


def _parse_review_payload(json_string, response_text, scenario_type):
    """Decode and validate the model's JSON; return (text, sentiment, task), or None after printing a warning."""
    try:
        data = json.loads(json_string)
    except json.JSONDecodeError as json_e:
        print(f"Warning: Failed to decode JSON from Gemini API response: {json_e}. Response: {response_text}")
        return None

    if not all(k in data for k in _REQUIRED_REVIEW_KEYS):
        print(f"Warning: Gemini API response missing expected JSON keys. Response: {json_string}")
        return None

    # Values such as null, numbers or lists would raise on .strip(); reject
    # them here so they are reported as a malformed response.
    if not all(isinstance(data[k], str) for k in _REQUIRED_REVIEW_KEYS):
        print(f"Warning: Gemini API response has non-string JSON values. Response: {json_string}")
        return None

    text = data["review_text"].strip()
    if not text:
        print(f"Warning: Gemini API response contained an empty review_text. Response: {json_string}")
        return None

    # Normalise casing, then accept only the three known labels.
    api_sentiment = data["sentiment"].strip().capitalize()
    if api_sentiment in _VALID_SENTIMENTS:
        sentiment = api_sentiment
    else:
        print(f"Warning: API returned unexpected sentiment '{data['sentiment']}'. Using scenario type '{scenario_type}'.")
        sentiment = scenario_type # Fallback sentiment

    task = data["suggested_task"].strip()
    return text, sentiment, task


def generate_review_details_with_gemini(request_delay=4.1):
    """Generate one review's rating, sentiment, text and CRM task.

    A scenario type (Positive/Negative/Neutral) and a matching experience
    description are chosen at random. When the module-level ``model`` is set,
    it is asked for a JSON object with ``review_text``, ``sentiment`` and
    ``suggested_task``. If ``model`` is unset, the call raises, or the
    response cannot be parsed and validated, all four values come from
    ``generate_fallback_details()`` instead.

    Note: on the API path the rating is always the one drawn for the scenario
    (4-5, 1-2 or 3). It is not adjusted if the model labels its own review
    with a different sentiment.

    Args:
        request_delay: Seconds to sleep before each API request. The default
            of 4.1 follows the original rate-limit note for gemini-1.5-flash
            (15 requests/minute -> 4 s per request, plus a small margin).

    Returns:
        tuple: (rating, sentiment, text, task, generated_by_api), where
        generated_by_api is True only when the API response was used.
    """
    # Generate a random scenario type
    scenario_type = random.choice(['Positive', 'Negative', 'Neutral'])
    base_rating = {'Positive': random.randint(4, 5), 'Negative': random.randint(1, 2), 'Neutral': 3}[scenario_type]

    # --- Comprehensive Details for the prompt based on scenario ---
    if scenario_type == 'Positive':
        prompt_detail = random.choice([
            "a very smooth and quick claim process after a minor car accident",
            "excellent customer support that patiently answered all complex questions about coverage",
            "an easy and straightforward online renewal process with a discount applied automatically",
            "getting helpful, personalized advice that significantly saved money on the annual premium",
            "a surprisingly fast response time when reporting storm damage to the house",
            "clear, proactive communication about upcoming policy changes and options",
            "a friendly and knowledgeable agent during the initial sign-up who explained everything clearly",
            "receiving a fair and prompt payout for a covered medical expense without excessive paperwork",
            "the mobile app being user-friendly for accessing documents and making payments",
            "a loyalty discount being offered without needing to ask"
        ])
    elif scenario_type == 'Negative':
        prompt_detail = random.choice([
            "a confusing claim denial for a seemingly covered event with unclear reasoning provided",
            "extremely long wait times (over an hour) when calling customer service for urgent help",
            "unclear policy documentation leading to misunderstandings about deductible amounts",
            "difficulty reaching a specific agent or department, being transferred multiple times",
            "unexpected premium increases at renewal without proper prior notification or explanation",
            "a slow, complicated, and bureaucratic claim process after significant water damage",
            "receiving conflicting information from different representatives about claim status",
            "feeling pressured into buying unnecessary add-ons during a sales call",
            "the website being down frequently when trying to access account information",
            "errors found on the billing statement that took weeks to resolve"
        ])
    else: # Neutral
        prompt_detail = random.choice([
            "an average interaction resolving a standard billing query over the phone",
            "a standard procedure follow-up call requesting necessary documentation for a policy",
            "neither quick nor slow service during a policy update request submitted online",
            "an expected outcome for a routine claim inquiry with no major issues",
            "the automated phone system being functional for basic tasks but impersonal",
            "receiving standard renewal paperwork and ID cards on time via mail",
            "the company website providing basic policy information adequately but lacking advanced features",
            "a payment transaction processed correctly through the online portal without notable interaction",
            "an agent providing standard information requested without going above or beyond",
            "completing an online form for a minor policy change successfully"
        ])

    # Values used when the API path succeeds; replaced wholesale on fallback.
    rating = base_rating
    sentiment = scenario_type
    text = ""
    task = "Log review."
    generated_by_api = False

    # --- Generate Review Details using Gemini API ---
    if model: # Check if Gemini API was configured successfully
        # Construct the prompt requesting JSON output (Updated based on user request)
        prompt = f"""
        Imagine a customer is writing a review about their experience with our insurance company called Blue Insurance.
        The experience involved: {prompt_detail}.

        Generate a response in JSON format containing:
        1.  `review_text`: A customer review (15-500 words) reflecting this experience. Be descriptive.
        2.  `sentiment`: Classify the sentiment of YOUR generated review as exactly 'Positive', 'Negative', or 'Neutral'.
        3.  `suggested_task`: Suggest a brief, actionable CRM task based on YOUR generated review. Be creative (e.g., 'Send personalized thank you & loyalty offer', 'Assign senior agent to investigate claim issue & call customer', 'Call customer to clarify needs', 'Notify sales department of potential upsell opportunity', 'Log feedback for product team', 'Schedule proactive policy review call').

        JSON response format:
        {{
          "review_text": "...",
          "sentiment": "...",
          "suggested_task": "..."
        }}
        """

        try:
            # Pace requests to stay under the API rate limit.
            time.sleep(request_delay)
            response = model.generate_content(prompt)

            # Responses might include markdown ```json ... ``` markers, so the
            # JSON object is cut out of the surrounding text before decoding.
            response_text = response.text.strip()
            json_string = _extract_json_object(response_text)
            if json_string is None:
                print(f"Warning: Could not find JSON object in Gemini API response. Response: {response_text}")
            else:
                parsed = _parse_review_payload(json_string, response_text, scenario_type)
                if parsed is not None:
                    text, sentiment, task = parsed
                    rating = base_rating # Keep rating based on initial scenario for simplicity
                    generated_by_api = True

        except Exception as e:
            # Deliberate best-effort: any API/transport error degrades to the
            # Faker fallback below instead of aborting the whole run.
            print(f"ERROR: Gemini API call failed: {e}. Falling back to Faker.")

    # --- Fallback to Faker if API failed or wasn't configured or parsing failed ---
    if not generated_by_api:
        print("Executing Fallback generation.")
        rating, sentiment, text, task = generate_fallback_details()

    return rating, sentiment, text, task, generated_by_api

# --- Generate Reviews Table ---
print(f"Generating {NUM_REVIEWS} Reviews data...")
review_ids = list(range(1, NUM_REVIEWS + 1))
# Customer IDs run from 1 to NUM_CUSTOMERS (empty when NUM_CUSTOMERS < 1).
customer_ids_pool = list(range(1, NUM_CUSTOMERS + 1))

# One dict per review; turned into a DataFrame in a single step after the loop.
reviews_data = []
api_success_count = 0
api_fail_count = 0

for rid in review_ids:
    review_date = random_date(START_DATE, END_DATE)

    # With an empty pool every review is attributed to customer 1.
    if customer_ids_pool:
        customer_id = random.choice(customer_ids_pool)
    else:
        customer_id = 1

    # Ask Gemini for the review details (the function falls back to Faker itself).
    rating, sentiment, review_text, generated_task, api_generated = generate_review_details_with_gemini()

    # Tally how each record was produced.
    if not api_generated:
        api_fail_count += 1
    else:
        api_success_count += 1

    reviews_data.append(dict(
        ReviewID=rid,
        CustomerID=customer_id,
        ReviewDate=review_date,
        Rating=rating,
        ReviewText=review_text,
        Sentiment=sentiment,
        GeneratedTask=generated_task,
        AIGenerated=api_generated,
    ))

    # Report progress every 50 reviews and once more on the final review.
    if rid == NUM_REVIEWS or rid % 50 == 0:
        print(f"Generated {rid}/{NUM_REVIEWS} reviews... (API Success: {api_success_count}, Fallback: {api_fail_count})")


reviews_df = pd.DataFrame(reviews_data)
print(f"\nFinished generating {len(reviews_df)} reviews.")
print(f"Successfully generated by API: {api_success_count}")
print(f"Failed or fell back to Faker: {api_fail_count}")


# --- Final Adjustments ---
# Make sure the date column has a datetime dtype before reporting and saving.
reviews_df['ReviewDate'] = pd.to_datetime(reviews_df['ReviewDate'])

# Display sample data
print("\n--- Sample Reviews Data ---")
print(reviews_df.head())
print("\n--- Data Types ---")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
reviews_df.info()
print("\n--- Sentiment Distribution ---")
print(reviews_df['Sentiment'].value_counts())
print("\n--- Sample Tasks ---")
# Ten most frequent generated tasks.
print(reviews_df['GeneratedTask'].value_counts().head(10))


# --- Save to CSV ---
# Write the full table to the working directory, without the index column.
reviews_df.to_csv('insurance_customer_reviews_gemini_enhanced.csv', index=False)
print("\nData saved to insurance_customer_reviews_gemini_enhanced.csv")



Imported UserSecretsClient from kaggle_secrets.
Successfully imported google.generativeai.
Generating 400 reviews from a pool of 360 customers.
Attempting to retrieve GEMINI_API_KEY from Kaggle Secrets.
Gemini API configured successfully.
Generating 400 Reviews data...
Generated 50/400 reviews... (API Success: 50, Fallback: 0)
Generated 100/400 reviews... (API Success: 100, Fallback: 0)
Generated 150/400 reviews... (API Success: 150, Fallback: 0)
Generated 200/400 reviews... (API Success: 200, Fallback: 0)
Generated 250/400 reviews... (API Success: 250, Fallback: 0)
Generated 300/400 reviews... (API Success: 300, Fallback: 0)
Generated 350/400 reviews... (API Success: 350, Fallback: 0)
Generated 400/400 reviews... (API Success: 400, Fallback: 0)

Finished generating 400 reviews.
Successfully generated by API: 400
Failed or fell back to Faker: 0

--- Sample Reviews Data ---
   ReviewID  CustomerID          ReviewDate  Rating  \
0         1         108 2022-06-11 09:00:45       5   
1   