In [20]:
# Import required libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import google.generativeai as genai
import os
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# NOTE: Run this section only once to download NLTK datasets
# After running once successfully, you can comment out the section below
"""
# Set the directory for NLTK data
nltk_data_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')), 'nltk_data')
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

# Configure NLTK data directory
nltk.data.path.insert(0, nltk_data_dir)

# Download required NLTK datasets
print("Downloading NLTK datasets...")
for dataset in ['punkt', 'stopwords', 'wordnet', 'omw-1.4']:
    print(f"Downloading {dataset}...")
    nltk.download(dataset, download_dir=nltk_data_dir, quiet=True)
    print(f"{dataset} downloaded.")

# Verify downloaded datasets
print("\nVerifying NLTK datasets...")
try:
    # Test dataset usage
    word_tokenize("Test sentence")
    stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize("testing")
    print("All NLTK datasets successfully loaded and tested.")
except LookupError as e:
    print(f"Error: {str(e)}")
    print("An error occurred while loading NLTK datasets.")
"""

'\n# Set the directory for NLTK data\nnltk_data_dir = os.path.join(os.path.dirname(os.path.abspath(\'__file__\')), \'nltk_data\')\nif not os.path.exists(nltk_data_dir):\n    os.makedirs(nltk_data_dir)\n\n# Configure NLTK data directory\nnltk.data.path.insert(0, nltk_data_dir)\n\n# Download required NLTK datasets\nprint("Downloading NLTK datasets...")\nfor dataset in [\'punkt\', \'stopwords\', \'wordnet\', \'omw-1.4\']:\n    print(f"Downloading {dataset}...")\n    nltk.download(dataset, download_dir=nltk_data_dir, quiet=True)\n    print(f"{dataset} downloaded.")\n\n# Verify downloaded datasets\nprint("\nVerifying NLTK datasets...")\ntry:\n    # Test dataset usage\n    word_tokenize("Test sentence")\n    stopwords.words(\'english\')\n    lemmatizer = WordNetLemmatizer()\n    lemmatizer.lemmatize("testing")\n    print("All NLTK datasets successfully loaded and tested.")\nexcept LookupError as e:\n    print(f"Error: {str(e)}")\n    print("An error occurred while loading NLTK datasets.")\n'

## Loading and Exploring the Dataset

In [21]:
# Load the dataset
df = pd.read_csv('../data/sentiment_data.csv', header=None, names=['text', 'sentiment'])

print("Initial dataset size:", len(df))
df.head()

Initial dataset size: 5843


Unnamed: 0,text,sentiment
0,Sentence,Sentiment
1,The GeoSolutions technology will leverage Bene...,positive
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
3,"For the last quarter of 2010 , Componenta 's n...",positive
4,According to the Finnish-Russian Chamber of Co...,neutral


## Data Preparation
Apply the same preprocessing steps as in sentiment_analysis.ipynb

In [22]:
# Data cleaning and encoding: map sentiment labels directly to numeric values
def encode_sentiment(s):
    """
    Encode sentiment labels to numeric values for model training.
    
    Args:
        s (str): Input sentiment label
        
    Returns:
        int: Encoded sentiment value
            - 2: negative
            - 1: positive
            - 0: neutral
            - None: unknown values
    """
    s = str(s).lower().strip()
    if s in ['negative', 'neg', '-1']:
        return 2  # negative -> 2
    elif s in ['neutral', 'neu', '0']:
        return 0  # neutral -> 0
    elif s in ['positive', 'pos', '1']:
        return 1  # positive -> 1
    else:
        return None  # for unknown values

# Apply encoding
df['sentiment_encoded'] = df['sentiment'].apply(encode_sentiment)

In [23]:
# Remove rows with unknown sentiment (None values)
unknown_count = df['sentiment_encoded'].isna().sum()
if unknown_count > 0:
    print(f"Removing {unknown_count} records with unknown sentiment values")
    df = df.dropna(subset=['sentiment_encoded'])

print("Dataset size after removing unknown labels:", len(df))

# Convert encoded values to int type
df['sentiment_encoded'] = df['sentiment_encoded'].astype(int)

Removing 1 records with unknown sentiment values
Dataset size after removing unknown labels: 5842


In [24]:
# Remove duplicate entries based on text
duplicate_count = df.duplicated(subset=['text']).sum()
if duplicate_count > 0:
    print(f"Removing {duplicate_count} duplicate records")
    df = df.drop_duplicates(subset=['text'], keep='first')
    print("Dataset size after removing duplicates:", len(df))

Removing 520 duplicate records
Dataset size after removing duplicates: 5322


In [25]:
# Store the mapping for reference
sentiment_mapping = {
    'negative': 2,
    'neutral': 0,
    'positive': 1
}
print("\nSentiment Label Mapping:")
for sentiment, code in sentiment_mapping.items():
    print(f"{sentiment} -> {code}")


Sentiment Label Mapping:
negative -> 2
neutral -> 0
positive -> 1


In [26]:
# Display the distribution of sentiment classes
print("Distribution of sentiment classes:")
sentiment_counts_encoded = df['sentiment_encoded'].map({2: 'negative', 0: 'neutral', 1: 'positive'}).value_counts()
print(sentiment_counts_encoded)

Distribution of sentiment classes:
sentiment_encoded
neutral     2878
positive    1852
negative     592
Name: count, dtype: int64


In [27]:
# Display first few rows of the cleaned dataset
print(df[['text', 'sentiment', 'sentiment_encoded']].head())

                                                text sentiment  \
1  The GeoSolutions technology will leverage Bene...  positive   
2  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative   
3  For the last quarter of 2010 , Componenta 's n...  positive   
4  According to the Finnish-Russian Chamber of Co...   neutral   
5  The Swedish buyout firm has sold its remaining...   neutral   

   sentiment_encoded  
1                  1  
2                  2  
3                  1  
4                  0  
5                  0  


## Text Preprocessing
1. Convert to lowercase
2. Remove punctuation
3. Remove stop words
4. Apply lemmatization

In [28]:
def preprocess_text(text):
    """
    Preprocess text data by applying various cleaning and normalization steps.
    
    Args:
        text (str): Raw input text to be preprocessed
        
    Returns:
        str: Cleaned and normalized text with the following transformations:
            - Converted to lowercase
            - Removed URLs, email addresses, stock symbols
            - Removed percentages and currency amounts
            - Removed special characters and numbers
            - Tokenized and removed stop words
            - Applied lemmatization
            - Removed short words (length < 3)
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove stock symbols (e.g., $AAPL, $GOOG)
    text = re.sub(r'\$\w+', '', text)
    
    # Remove numbers with % (percentage)
    text = re.sub(r'\d+%', '', text)
    
    # Remove currency symbols and amounts (e.g., $123.45, ‚Ç¨100, ¬£50)
    text = re.sub(r'[$‚Ç¨¬£¬•]\d+(?:\.\d{2})?|\d+(?:\.\d{2})?[$‚Ç¨¬£¬•]', '', text)
    
    # Remove special characters and numbers
    # Keep alphabets and spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization with pos tagging for better accuracy
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]  # First try as verb
    tokens = [lemmatizer.lemmatize(token, pos='n') for token in tokens]  # Then as noun
    
    # Remove short words (length < 3)
    tokens = [token for token in tokens if len(token) > 2]
    
    return ' '.join(tokens)

# Preprocess texts in the dataset
df['processed_text'] = df['text'].apply(preprocess_text)

print(df[['processed_text']].head())

                                      processed_text
1  geosolutions technology leverage benefon gps s...
2                               low real possibility
3  last quarter componenta net sale double eur eu...
4  accord finnish russian chamber commerce major ...
5  swedish buyout firm sell remain percent stake ...


In [29]:
# Check word count and filter
df['word_count'] = df['processed_text'].apply(lambda x: len(str(x).split()))
original_size = len(df)

# Remove examples with less than 4 words
df = df[df['word_count'] >= 4]

print(f"Original dataset size: {original_size}")
print(f"Filtered dataset size: {len(df)}")
print(f"Number of removed examples: {original_size - len(df)}")

# Display examples of processed texts with original for comparison
df_comparison = pd.DataFrame({
    'Original Text': df['text'],
    'Processed Text': df['processed_text'],
    'Word Count': df['word_count']
})
print(df_comparison.head())

Original dataset size: 5322
Filtered dataset size: 5080
Number of removed examples: 242
                                       Original Text  \
1  The GeoSolutions technology will leverage Bene...   
3  For the last quarter of 2010 , Componenta 's n...   
4  According to the Finnish-Russian Chamber of Co...   
5  The Swedish buyout firm has sold its remaining...   
6    $SPY wouldn't be surprised to see a green close   

                                      Processed Text  Word Count  
1  geosolutions technology leverage benefon gps s...          21  
3  last quarter componenta net sale double eur eu...          20  
4  accord finnish russian chamber commerce major ...          11  
5  swedish buyout firm sell remain percent stake ...          14  
6                           surprise see green close           4  


## Data Split
Perform the exact same train-test split as sentiment_analysis.ipynb for direct performance comparison

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],  # Use raw text for LLM (no preprocessing needed)
    df['sentiment_encoded'],
    test_size=0.2, 
    random_state=9,  # Same random state as sentiment_analysis.ipynb
    stratify=df['sentiment_encoded']
)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

print("\nClass distribution in test set:")
print(pd.Series(y_test).map({2: 'negative', 0: 'neutral', 1: 'positive'}).value_counts())

Training set size: 4064
Test set size: 1016

Class distribution in test set:
sentiment_encoded
neutral     554
positive    351
negative    111
Name: count, dtype: int64


In [31]:
# Create test dataframe for Gemini predictions
df_test = pd.DataFrame({
    'text': X_test.values,
    'sentiment_encoded': y_test.values,
    'sentiment': pd.Series(y_test.values).map({2: 'negative', 0: 'neutral', 1: 'positive'}).values
})

print("\nTest set prepared for LLM evaluation")
print(f"Note: Training set ({len(X_train)} samples) is not used for zero-shot LLM")


Test set prepared for LLM evaluation
Note: Training set (4064 samples) is not used for zero-shot LLM


## Note on Test Set Usage
We will evaluate Gemini on the exact same test set as traditional ML models for fair comparison

## Google Gemini Flash 2.5 Configuration

In [32]:
# Configure Google Gemini API
# IMPORTANT: Set your API key as an environment variable or replace with your key
# You can get your API key from: https://makersuite.google.com/app/apikey

# Option 1: Set environment variable (recommended)
# export GOOGLE_API_KEY='your-api-key-here'
api_key = os.environ.get('GOOGLE_API_KEY')

# Option 2: Direct assignment (not recommended for production)
# api_key = 'your-api-key-here'

if not api_key:
    print("‚ö†Ô∏è WARNING: GOOGLE_API_KEY not found!")
    print("Please set your API key:")
    print("  Option 1: Set environment variable GOOGLE_API_KEY")
    print("  Option 2: Uncomment and add your key in the cell above")
else:
    genai.configure(api_key=api_key)
    print("‚úì Google Gemini API configured successfully")

‚úì Google Gemini API configured successfully


In [33]:
# Initialize the model
model = genai.GenerativeModel('gemini-2.5-flash')

print("Model initialized: gemini-2.5-flash")

Model initialized: gemini-2.5-flash


## Sentiment Analysis with Gemini Flash 2.5

In [34]:
# Define the prompt template for sentiment analysis
def create_sentiment_prompt(text):
    """
    Create a prompt for Gemini to analyze sentiment of financial news.
    
    Args:
        text (str): Financial news text to analyze
        
    Returns:
        str: Formatted prompt for the LLM
    """
    prompt = f"""You are a financial sentiment analysis expert. Analyze the sentiment of the following financial news text and classify it into one of three categories: positive, negative, or neutral.

Financial News Text:
"{text}"

Instructions:
- Respond with ONLY ONE WORD: "positive", "negative", or "neutral"
- Consider the financial context and implications
- Do not provide explanations or additional text
- Your response must be exactly one of these three words in lowercase

Sentiment:"""
    
    return prompt

In [35]:
# Function to get sentiment prediction from Gemini
def predict_sentiment_gemini(text, model, max_retries=3):
    """
    Predict sentiment using Google Gemini Flash 2.5.
    
    Args:
        text (str): Input text for sentiment analysis
        model: Gemini model instance
        max_retries (int): Maximum number of retry attempts
        
    Returns:
        str: Predicted sentiment ('positive', 'negative', 'neutral') or 'error'
    """
    prompt = create_sentiment_prompt(text)
    
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            
            # Extract and clean the response
            sentiment = response.text.strip().lower()
            
            # Validate response
            if sentiment in ['positive', 'negative', 'neutral']:
                return sentiment
            else:
                # Try to extract valid sentiment from response
                if 'positive' in sentiment:
                    return 'positive'
                elif 'negative' in sentiment:
                    return 'negative'
                elif 'neutral' in sentiment:
                    return 'neutral'
                else:
                    print(f"Invalid response: {sentiment}, retrying...")
                    time.sleep(1)
                    
        except Exception as e:
            print(f"Error on attempt {attempt + 1}: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retry
            else:
                return 'error'
    
    return 'error'

## Batch Prediction on Sample Dataset

In [36]:
len(df_test)

1016

In [37]:
# Option to load previously saved predictions
import pickle
import os

predictions_file = '../data/gemini_predictions.pkl'

# Set this to True to load saved predictions, False to run new predictions
LOAD_SAVED_PREDICTIONS = False

if LOAD_SAVED_PREDICTIONS and os.path.exists(predictions_file):
    print(f"Loading saved predictions from: {predictions_file}")
    df_test = pd.read_pickle(predictions_file)
    print(f"‚úì Loaded {len(df_test)} predictions")
    print("\nTo run new predictions, set LOAD_SAVED_PREDICTIONS = False")
else:
    if LOAD_SAVED_PREDICTIONS and not os.path.exists(predictions_file):
        print(f"‚ö†Ô∏è Saved predictions file not found: {predictions_file}")
    print("Running new predictions...")

Running new predictions...


In [38]:
# Perform sentiment analysis on the entire test set (same as traditional ML evaluation)
print(f"Analyzing {len(df_test)} texts from test set with Gemini Flash 2.5...")

predictions = []
errors = 0

# Add progress bar
for idx, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Processing"):
    text = row['text']
    prediction = predict_sentiment_gemini(text, model)
    predictions.append(prediction)
    
    if prediction == 'error':
        errors += 1
    
    # Rate limiting: sleep briefly between requests to avoid hitting rate limits
    time.sleep(10)  # Adjust based on your API quota

# Add predictions to dataframe
df_test['gemini_prediction'] = predictions

print(f"\n‚úì Predictions completed!")
print(f"Total predictions: {len(predictions)}")
print(f"Errors: {errors}")
print(f"Success rate: {((len(predictions) - errors) / len(predictions)) * 100:.2f}%")

Analyzing 1016 texts from test set with Gemini Flash 2.5...


Processing:  12%|‚ñà‚ñè        | 117/1016 [24:07<2:52:25, 11.51s/it]

Error on attempt 1: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250, model: gemini-2.5-flash
Please retry in 11.758415857s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, retry_delay {
  seconds: 11
}
]
Error on attempt 2: 429 You exceeded your current quota, please check your pl

Processing:  12%|‚ñà‚ñè        | 118/1016 [24:22<3:05:23, 12.39s/it]

Error on attempt 1: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250, model: gemini-2.5-flash
Please retry in 57.326810201s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, retry_delay {
  seconds: 57
}
]
Error on attempt 2: 429 You exceeded your current quota, please check your pl

Processing:  12%|‚ñà‚ñè        | 119/1016 [24:36<3:14:20, 13.00s/it]

Error on attempt 1: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250, model: gemini-2.5-flash
Please retry in 42.896427128s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, retry_delay {
  seconds: 42
}
]
Error on attempt 2: 429 You exceeded your current quota, please check your pl

Processing:  12%|‚ñà‚ñè        | 120/1016 [24:50<3:20:37, 13.43s/it]

Error on attempt 1: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250, model: gemini-2.5-flash
Please retry in 28.443945819s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, retry_delay {
  seconds: 28
}
]
Error on attempt 2: 429 You exceeded your current quota, please check your pl

Processing:  12%|‚ñà‚ñè        | 120/1016 [25:05<3:07:20, 12.55s/it]



KeyboardInterrupt: 

In [None]:
# Save predictions to pickle file for future use
import pickle

predictions_file = '../data/gemini_predictions.pkl'
df_test.to_pickle(predictions_file)
print(f"\n‚úì Predictions saved to: {predictions_file}")
print(f"You can load this file later to skip the prediction step")

In [None]:
# Remove any rows with prediction errors (if any)
df_test_clean = df_test[df_test['gemini_prediction'] != 'error'].copy()
print(f"\nSuccessfully predicted: {len(df_test_clean)} out of {len(df_test)} samples")

if errors > 0:    print(f"‚ö†Ô∏è Warning: {errors} predictions failed")

In [None]:
# Encode Gemini predictions to numeric values for comparison
df_test_clean['gemini_prediction_encoded'] = df_test_clean['gemini_prediction'].apply(encode_sentiment)

# Map encoded values back to sentiment labels for display
df_test_clean['sentiment'] = df_test_clean['sentiment_encoded'].map({2: 'negative', 0: 'neutral', 1: 'positive'})


# Display sample resultsprint(df_test_clean[['text', 'sentiment', 'gemini_prediction']].head(10))
print("\nSample predictions:")

## Performance Evaluation

In [None]:
# Calculate evaluation metrics
y_true = df_test_clean['sentiment_encoded']
y_pred = df_test_clean['gemini_prediction_encoded']

accuracy = accuracy_score(y_true, y_pred)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
f1_macro = f1_score(y_true, y_pred, average='macro')

print("="*70)
print("GOOGLE GEMINI FLASH 2.5 PERFORMANCE METRICS")
print("="*70)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"F1-Score (Weighted): {f1_weighted:.4f}")
print(f"F1-Score (Macro): {f1_macro:.4f}")
print(f"\nTest Samples: {len(df_test_clean)}")

print(f"Training Set Size: {len(X_train)} (not used for LLM)")print(f"Test Set Size: {len(X_test)}")

In [None]:
# Detailed classification report
print("\n" + "="*70)
print("CLASSIFICATION REPORT")
print("="*70)
print("\n")

target_names = ['neutral', 'positive', 'negative']
print(classification_report(y_true, y_pred, target_names=target_names))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, 
            annot=True, 
            fmt='d', 
            cmap='Blues',
            xticklabels=['neutral', 'positive', 'negative'],
            yticklabels=['neutral', 'positive', 'negative'])
plt.title('Confusion Matrix - Gemini Flash 2.5 Sentiment Analysis', fontsize=14, fontweight='bold')
plt.xlabel('Predicted Sentiment', fontsize=12)
plt.ylabel('True Sentiment', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Per-class accuracy
print("\n" + "="*70)
print("PER-CLASS ACCURACY")
print("="*70)

for i, label in enumerate(['neutral', 'positive', 'negative']):
    mask = y_true == i
    if mask.sum() > 0:
        class_acc = accuracy_score(y_true[mask], y_pred[mask])
        print(f"\n{label.capitalize()}:")
        print(f"  Accuracy: {class_acc:.4f} ({class_acc*100:.2f}%)")
        print(f"  Samples: {mask.sum()}")

## Prediction Distribution Analysis

In [None]:
# Compare actual vs predicted distributions
comparison_data = pd.DataFrame({
    'Category': ['Neutral', 'Positive', 'Negative'] * 2,
    'Count': [
        (y_true == 0).sum(), (y_true == 1).sum(), (y_true == 2).sum(),
        (y_pred == 0).sum(), (y_pred == 1).sum(), (y_pred == 2).sum()
    ],
    'Type': ['Actual'] * 3 + ['Predicted'] * 3
})

fig = px.bar(comparison_data, 
             x='Category', 
             y='Count',
             color='Type',
             barmode='group',
             title='Actual vs Predicted Sentiment Distribution',
             color_discrete_map={'Actual': '#3498db', 'Predicted': '#e74c3c'})

fig.update_layout(xaxis_title='Sentiment Category',
                  yaxis_title='Count',
                  title_x=0.5)
fig.show()

## Sample Predictions Review

In [None]:
# Show some correct predictions
print("="*70)
print("SAMPLE CORRECT PREDICTIONS")
print("="*70)

correct_predictions = df_test_clean[df_test_clean['sentiment_encoded'] == df_test_clean['gemini_prediction_encoded']]
print(f"\nShowing 5 random correct predictions (Total: {len(correct_predictions)}):\n")

for idx, row in correct_predictions.sample(min(5, len(correct_predictions))).iterrows():
    print(f"Text: {row['text'][:100]}...")
    print(f"True Sentiment: {row['sentiment']}")
    print(f"Predicted Sentiment: {row['gemini_prediction']}")
    print("-" * 70)

In [None]:
# Show some incorrect predictions
print("\n" + "="*70)
print("SAMPLE INCORRECT PREDICTIONS")
print("="*70)

incorrect_predictions = df_test_clean[df_test_clean['sentiment_encoded'] != df_test_clean['gemini_prediction_encoded']]
print(f"\nShowing 5 random incorrect predictions (Total: {len(incorrect_predictions)}):\n")

for idx, row in incorrect_predictions.sample(min(5, len(incorrect_predictions))).iterrows():
    print(f"Text: {row['text'][:100]}...")
    print(f"True Sentiment: {row['sentiment']}")
    print(f"Predicted Sentiment: {row['gemini_prediction']}")
    print("-" * 70)

## Save Results

In [None]:
# Save predictions to file
output_path = '../data/gemini_sentiment_predictions.csv'
df_test_clean.to_csv(output_path, index=False)
print(f"‚úì Results saved to: {output_path}")

In [None]:
# Summary statistics
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)
print(f"\nüìä Dataset Information:")
print(f"   ‚Ä¢ Original dataset size: {len(df)}")
print(f"   ‚Ä¢ Training set size: {len(X_train)} (not used for zero-shot LLM)")
print(f"   ‚Ä¢ Test set size: {len(X_test)}")
print(f"   ‚Ä¢ Test samples analyzed: {len(df_test_clean)}")
print(f"   ‚Ä¢ Prediction errors: {errors}")
print(f"   ‚Ä¢ Success rate: {((len(predictions) - errors) / len(predictions)) * 100:.2f}%")

print(f"\nüéØ Performance Metrics (on Test Set):")
print(f"   ‚Ä¢ Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"   ‚Ä¢ F1-Score (Weighted): {f1_weighted:.4f}")
print(f"   ‚Ä¢ F1-Score (Macro): {f1_macro:.4f}")

print(f"\n‚úÖ Correct Predictions: {len(correct_predictions)} ({len(correct_predictions)/len(df_test_clean)*100:.2f}%)")

print(f"‚ùå Incorrect Predictions: {len(incorrect_predictions)} ({len(incorrect_predictions)/len(df_test_clean)*100:.2f}%)")print("\n" + "="*70)

print("   The model was not trained on the training set.")
print("\nüí° Note: This is a zero-shot evaluation using Gemini Flash 2.5")