In [3]:
import os
import re
import json
import pandas as pd
import ollama
from tqdm import tqdm
# Import scikit-learn metrics
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score # Can use this explicitly too
)
# matplotlib is no longer needed as ROC curve is removed
# import matplotlib.pyplot as plt

# Make sure the Ollama server is running and that the model named in
# OLLAMA_MODEL below is available locally.
# Example: ollama run gemma3:4b

# --- Configuration ---
# All four values are read by main(); edit them here rather than in the code.
OLLAMA_MODEL = 'gemma3:4b' # Model name passed to ollama.chat(); adjust if needed
INPUT_CSV = "../data/test.csv" # Path to the test CSV (must have 'text' and 'labels' columns)
OUTPUT_DIR = "classification_results" # Directory for the per-tweet result files (created if missing)
RESULTS_FILENAME = "classification_evaluation" # Filename stem; main() appends "_details.csv" / "_details.json"

# -- Context prompt for classification (no confidence score) --
# Instruction text that classify_tweet() places in front of every tweet.
# It asks the model to answer on a single line as "classification; reasoning",
# where classification is 0 or 1; main() relies on the ';' separator to parse
# the reply, so any change to the requested format must be mirrored there.
context_prompt = (
    '''
    I am a researcher interested in studying disaster response events. My goal is to analyze historical tweets to better understand responses to various types of disasters.

Each tweet I receive includes a disaster-related keyword or hashtag, but not all tweets describe an actual disaster event. I need a model to classify tweets into two categories based on whether they are truly disaster-related:
1 - Disaster-Related (tweets that report actual emergencies, accidents, crises, or factual news about such events requiring attention).
0 - Not Disaster-Related (tweets that use disaster-related terms metaphorically, humorously, or in other non-emergency contexts).

Consider these guidelines for classification:
- Tweets reporting actual emergencies, accidents, or crises (such as natural disasters, transportation accidents, industrial incidents, or health emergencies) should be classified as 1.
- Factual news reports about any actual disaster or accident should be classified as 1.
- Tweets using disaster-related terminology metaphorically, humorously, or in a non-urgent manner should be classified as 0.
- Jokes, memes, or casual mentions that do not indicate real-life emergencies should be classified as 0.

Please return the classification (0 or 1) followed by a brief reasoning for your decision. These two parts—classification and reasoning—should be returned in one line, separated by a single semicolon (;).

For example:
1; This tweet mentions rescue efforts and evacuation details following a reported flood.

Please return your answer in this exact format:
classification; reasoning
where classification is either 0 or 1, and reasoning is a short explanation.

Important: When you write the reasoning, do not use any semicolons (;) within the reasoning text itself, as your answer will be processed by a script expecting only one semicolon separator in the entire line.

'''
)

# --- Helper Functions ---
def fix_tweet_text(tweet: str) -> str:
    """Normalize the whitespace of a tweet.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space and leading/trailing whitespace is removed. Values that are
    not already strings are converted with ``str()`` first.
    """
    text = tweet if isinstance(tweet, str) else str(tweet)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def classify_tweet(tweet: str, model_name: str) -> str:
    """Send one tweet to an Ollama chat model and return its raw text reply.

    The module-level ``context_prompt`` is placed in front of the tweet and
    the combination is sent as a single user message. No sampling options
    are passed, so the model runs with its default settings.

    Args:
        tweet: Tweet text to classify.
        model_name: Name of the Ollama model to query.

    Returns:
        The content of the model's reply. On failure this function does not
        raise; it returns a string starting with ``"error; "`` followed by a
        description, so the caller can detect the problem while parsing.
    """
    user_message = {
        'role': 'user',
        'content': f"{context_prompt}\nTweet:\n{tweet}",
    }
    try:
        reply = ollama.chat(model=model_name, messages=[user_message])
        # Missing keys fall back to an empty string rather than raising.
        content = reply.get('message', {}).get('content', '')
    except Exception as e:
        print(f"\nError calling Ollama model '{model_name}': {e}")
        # Error string loosely mirrors the "label; reasoning" reply format.
        return f"error; Ollama API call failed: {e}"

    if isinstance(content, str):
        return content
    return f"error; Invalid response format from Ollama: {type(content)}"

# --- Main Execution ---
def main():
    """Classify every tweet in INPUT_CSV with the Ollama model and evaluate it.

    Steps:
      1. Load INPUT_CSV (must contain 'text' and 'labels' columns).
      2. Create OUTPUT_DIR if needed.
      3. For each row, validate the ground-truth label, query the model via
         classify_tweet() and parse the "classification; reasoning" reply.
      4. Save one detail record per row as CSV and JSON in OUTPUT_DIR.
      5. Print a confusion matrix and classification report computed over
         the rows that have both a valid label and a parsable prediction.

    Returns:
        None. Problems are reported with print() and end the run early
        instead of raising.
    """
    # 1. Load your data
    try:
        test_df = pd.read_csv(INPUT_CSV)
        # Ensure required columns exist
        if 'text' not in test_df.columns or 'labels' not in test_df.columns:
            raise ValueError("CSV must contain 'text' and 'labels' columns.")
        print(f"Loaded {len(test_df)} tweets from {INPUT_CSV}")
    except FileNotFoundError:
        print(f"Error: Input file not found at {INPUT_CSV}")
        return
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # 2. Prepare output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 3. Process tweets and collect results
    results_data = []          # One detail record per input row
    all_actual_labels = []     # Ground truth for rows that were scored
    all_predicted_labels = []  # Model prediction for those same rows

    print(f"\n=== Classifying tweets using Ollama model: {OLLAMA_MODEL} ===")
    # Only the two needed columns are iterated, in row order.
    for tweet, raw_label in tqdm(
            zip(test_df['text'], test_df['labels']),
            total=len(test_df),
            desc="Classifying",
            unit="tweet"
    ):
        # Ensure actual label is integer 0 or 1; otherwise record and skip.
        try:
            actual_label = int(raw_label)
            if actual_label not in (0, 1):
                raise ValueError("Actual label must be 0 or 1")
        except (ValueError, TypeError) as e:
            print(f"\nSkipping row due to invalid actual label: {raw_label} ({e}). Tweet: {str(tweet)[:50]}...")
            results_data.append({
                "tweet": fix_tweet_text(tweet),
                "predicted_label": "error",
                "reasoning": "",
                "actual_label": raw_label,  # Keep original problematic label
                "raw_response": "",
                "error": f"Invalid actual label: {e}"
            })
            continue

        tweet_clean = fix_tweet_text(tweet)
        raw_resp = classify_tweet(tweet_clean, OLLAMA_MODEL)

        # Defaults describe a failed parse; overwritten on success.
        pred_label_parsed = "error"
        reasoning_parsed = ""
        error_msg = None

        try:
            pred_label_parsed, reasoning_parsed = _parse_model_response(raw_resp)
        except (TypeError, ValueError) as e:
            error_msg = f"Parsing or processing error: {e}"
            print(f"\nWarning: Could not parse response or error occurred for tweet: {tweet_clean[:50]}... | Response: '{raw_resp}' | Error: {e}")
        else:
            # Only successfully parsed rows take part in the metrics.
            all_actual_labels.append(actual_label)
            all_predicted_labels.append(pred_label_parsed)

        # Append detailed result regardless of parsing success
        results_data.append({
            "tweet": tweet_clean,
            "predicted_label": pred_label_parsed,
            "reasoning": reasoning_parsed,
            "actual_label": actual_label,
            "raw_response": raw_resp,
            "error": error_msg  # None if parsing succeeded
        })

    # 4. Save detailed results
    results_df = pd.DataFrame(results_data)
    csv_path = os.path.join(OUTPUT_DIR, f"{RESULTS_FILENAME}_details.csv")
    json_path = os.path.join(OUTPUT_DIR, f"{RESULTS_FILENAME}_details.json")
    try:
        results_df.to_csv(csv_path, index=False)
        results_df.to_json(json_path, orient='records', indent=2)
        print(f"\nDetailed results saved to {csv_path} and {json_path}")
    except Exception as e:
        # Best effort: a failed save must not prevent the metrics printout.
        print(f"Error saving detailed results: {e}")

    # 5. Calculate and Print Metrics (only if there are valid predictions)
    print("\n=== Classification Metrics ===")
    if not all_actual_labels:
        print("No valid predictions were successfully parsed. Cannot calculate metrics.")
        return

    # Make the evaluated subset explicit so the reported scores are not
    # mistaken for scores over the whole input file.
    excluded = len(results_data) - len(all_actual_labels)
    if excluded:
        print(
            f"Note: {excluded} of {len(results_data)} rows were excluded from the metrics "
            "(invalid actual label or unparsable model response)."
        )

    _print_metrics(all_actual_labels, all_predicted_labels)

    print(f"\nEvaluation complete. Check '{OUTPUT_DIR}/' for detailed results.")


# --- Private helpers used by main() ---
def _parse_model_response(raw_resp):
    """Split a raw model reply into ``(label, reasoning)``.

    The reply is expected to look like ``"<0|1>; <reasoning>"``. Only the
    first ';' is treated as the separator, so a reasoning text that itself
    contains semicolons is still accepted.

    Raises:
        TypeError: if ``raw_resp`` is not a string.
        ValueError: if the reply is an ``"error; ..."`` sentinel, has no ';',
            or its classification part is not the integer 0 or 1.
    """
    if not isinstance(raw_resp, str):
        raise TypeError(f"Received non-string response from classify_tweet: {type(raw_resp)}")

    label_text, separator, reasoning = raw_resp.partition(";")
    label_text = label_text.strip()
    reasoning = reasoning.strip()

    # Check the error sentinel first so it is reported as such no matter
    # how many semicolons the error message happens to contain.
    if label_text.lower() == 'error':
        raise ValueError(f"Ollama API or internal error: {reasoning if separator else raw_resp}")
    if not separator:
        raise ValueError(f"Expected 2 parts separated by ';' but got 1. Response: '{raw_resp}'")

    label = int(label_text)  # Raises ValueError for non-numeric text
    if label not in (0, 1):
        raise ValueError(f"Predicted label must be 0 or 1, but got {label}")
    return label, reasoning


def _print_metrics(actual_labels, predicted_labels):
    """Print the confusion matrix and classification report for 0/1 labels.

    ``labels=[0, 1]`` is passed to both scikit-learn calls so the output
    always covers both classes, even when only one of them occurs in the
    evaluated rows.
    """
    class_labels = [0, 1]

    # -- Confusion Matrix --
    print("\nConfusion Matrix:")
    try:
        cm = confusion_matrix(actual_labels, predicted_labels, labels=class_labels)
        print(pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']))
    except Exception as e:
        print(f"Error calculating/displaying Confusion Matrix: {e}")

    # -- Classification Report --
    print("\nClassification Report:")
    try:
        report = classification_report(
            actual_labels,
            predicted_labels,
            labels=class_labels,
            target_names=['Not Disaster-Related (0)', 'Disaster-Related (1)'],
            digits=4,         # Show more precision
            zero_division=0   # Report 0 instead of warning when a class has no samples/predictions
        )
        print(report)
    except Exception as e:
        print(f"Error calculating Classification Report: {e}")

# Run the evaluation only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Loaded 1523 tweets from ../data/test.csv

=== Classifying tweets using Ollama model: gemma3:4b ===


Classifying:  51%|████████████████████████████████▉                               | 784/1523 [03:28<03:07,  3.94tweet/s]




Classifying: 100%|███████████████████████████████████████████████████████████████| 1523/1523 [06:42<00:00,  3.78tweet/s]


Detailed results saved to classification_results/classification_evaluation_details.csv and classification_results/classification_evaluation_details.json

=== Classification Metrics ===

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          600          273
Actual 1          115          534

Classification Report:
                          precision    recall  f1-score   support

Not Disaster-Related (0)     0.8392    0.6873    0.7557       873
    Disaster-Related (1)     0.6617    0.8228    0.7335       649

                accuracy                         0.7451      1522
               macro avg     0.7504    0.7550    0.7446      1522
            weighted avg     0.7635    0.7451    0.7462      1522


Evaluation complete. Check 'classification_results/' for detailed results.



