# In [None]:  (Jupyter cell marker, kept as a comment so the file is valid Python)
"""
sentiment_analysis.py

Sentiment analysis using:
 - TextBlob (polarity)
 - VADER (compound score)
 - Transformers (DistilBERT fine-tuned on SST-2)

Features:
 - Text cleaning (lowercase, remove punctuation & numbers)
 - Compute scores from all three methods (numeric)
 - Map transformer labels to numeric scores
 - Visualize distributions and save results

Run:
 1) Install dependencies:
    pip install pandas numpy matplotlib seaborn textblob vaderSentiment transformers torch nltk

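 2) One-time data downloads (run once). Note: the vaderSentiment package used
    below bundles its own lexicon, so the NLTK lexicon download is only needed
    if you switch to nltk.sentiment.vader:
    python -c "import nltk; nltk.download('vader_lexicon')"
    python -m textblob.download_corpora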

 3) Run the script:
    python sentiment_analysis.py
"""

# --- Imports ---
import re                       # regular expressions, used for text cleaning
import pandas as pd             # tabular data handling
import numpy as np              # numeric operations
import matplotlib.pyplot as plt # plotting
import seaborn as sns           # nicer statistical plots
import warnings                 # to show or ignore warnings

# NLP libraries
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# transformers is treated as an optional dependency: only the import is
# attempted here. TRANSFORMERS_AVAILABLE records whether `pipeline` could be
# imported; the model itself is loaded later, where a second failure is handled.
# `except Exception` (rather than ImportError) also covers errors raised while
# the transformers package itself is being imported.
try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False

# NOTE: with no category/module filter this silences ALL warnings for the whole
# process, including deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")  # keep the output clean for educational runs

# --- Preprocessing function ---
# Patterns are compiled once at import time; clean_text() is applied row by row.
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')
_WHITESPACE_RUN_PATTERN = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """
    Normalize raw text before it is handed to the sentiment scorers.

    Steps, in order:
      1. missing values (anything ``pd.isna`` flags) become ""
      2. the value is stringified and lowercased
      3. URLs starting with "http" or "www." are dropped
      4. every character that is not a-z or whitespace is replaced by a space
         (this removes punctuation, digits and non-ASCII letters)
      5. runs of whitespace collapse to one space and the ends are trimmed

    Returns the cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # URLs are removed before the letter filter so their fragments
    # ("http", "com", ...) do not survive as stray words.
    without_urls = _URL_PATTERN.sub('', lowered)
    letters_only = _NON_LETTER_PATTERN.sub(' ', without_urls)
    return _WHITESPACE_RUN_PATTERN.sub(' ', letters_only).strip()

# --- Scoring wrappers ---
# TextBlob polarity lies in [-1, 1]: below zero reads as negative sentiment,
# above zero as positive, and exactly 0 as neutral.
def textblob_score(text: str) -> float:
    """Return the TextBlob polarity of *text* as a float in [-1.0, +1.0]."""
    polarity = TextBlob(text).sentiment.polarity
    return polarity

# VADER's polarity_scores() returns a dict of scores; only 'compound' is used
# here, a normalized, weighted composite in [-1, 1].
# A single analyzer instance is created at import time and shared by all calls.
vader_analyzer = SentimentIntensityAnalyzer()
def vader_compound_score(text: str) -> float:
    """Return VADER's 'compound' score for *text*."""
    scores = vader_analyzer.polarity_scores(text)
    return scores["compound"]

# DistilBERT pipeline returns {'label': 'POSITIVE'/'NEGATIVE', 'score': prob}
# We map it to a numeric in [-1, 1] by:
#   numeric = score  if label == POSITIVE (range [0.5,1.0] typically)
#   numeric = -score if label == NEGATIVE
# This preserves both direction and confidence.
#
# distilbert_pipe stays None whenever the pipeline cannot be built; every
# consumer in this file checks for None and skips the transformer step.
distilbert_pipe = None
if TRANSFORMERS_AVAILABLE:
    try:
        # The model is named explicitly so results do not depend on whatever
        # default the installed transformers version picks for this task.
        distilbert_pipe = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
        )
    except Exception as e:
        # e.g., no internet or torch not available; continue without the
        # transformer, but report the reason so the skip can be diagnosed.
        print("Warning: transformers pipeline could not be initialized. DistilBERT will be skipped.")
        print(f"  Reason: {e!r}")

def distilbert_score(text: str) -> float:
    """
    Return a signed sentiment score in [-1, 1] for a single text.

    The pipeline's probability is returned as-is for a label starting with
    "POS" and negated for any other non-empty label.

    Returns np.nan when:
      - the transformer pipeline is unavailable (distilbert_pipe is None),
      - the pipeline returns something other than a non-empty list,
      - the first result has no label or no score (an unknown result must not
        be reported as negative sentiment).
    """
    if distilbert_pipe is None:
        return np.nan

    # The slice caps the input at 512 *characters* (not tokens).
    # NOTE(review): presumably this is meant to keep the input under the
    # model's token limit - confirm for text dominated by one-character tokens.
    res = distilbert_pipe(text[:512])

    # The pipeline returns a list of dicts for a single input.
    if not (isinstance(res, list) and res):
        return np.nan

    first = res[0]
    label = first.get("label", "")
    raw_score = first.get("score")
    if not label or raw_score is None:
        return np.nan

    score = float(raw_score)
    return score if label.upper().startswith("POS") else -score

# --- Label helpers (human-readable categories) ---
def label_from_score(score: float, method: str = "generic") -> str:
    """
    Turn a numeric sentiment score into a category name.

    method:
      - "vader": score >= 0.05 is 'positive', score <= -0.05 is 'negative',
        anything in between is 'neutral'
      - any other value ("generic"): score > 0 is 'positive', score < 0 is
        'negative', exactly 0 is 'neutral'

    A missing score (None / NaN) yields 'unknown' regardless of method.
    """
    if pd.isna(score):
        return "unknown"

    # Decide the sign tests first, then map them to a label in one place.
    if method == "vader":
        is_positive = score >= 0.05
        is_negative = score <= -0.05
    else:
        is_positive = score > 0
        is_negative = score < 0

    if is_positive:
        return "positive"
    if is_negative:
        return "negative"
    return "neutral"

# --- Main pipeline function ---
def _signed_distilbert_score(label, prob):
    """Map a (label, probability) pair to a signed score; NaN if either part is missing."""
    if not label or not isinstance(prob, (int, float, np.integer, np.floating)):
        return np.nan
    return prob if label.upper().startswith("POS") else -prob


def _distilbert_predict_one(text):
    """Run the pipeline once on a single text and return its (label, probability) pair."""
    res = distilbert_pipe(text[:512])
    if isinstance(res, list) and res:
        return res[0].get("label", ""), res[0].get("score", np.nan)
    return "", np.nan


def analyze_texts(texts):
    """
    Score every text with TextBlob, VADER and (when available) DistilBERT.

    texts: list-like of raw text strings (missing values are allowed; they are
           cleaned to "").
    returns: pandas.DataFrame with one row per input and the columns
      text, clean_text,
      textblob_score, vader_compound,
      distilbert_label, distilbert_prob, distilbert_score,
      textblob_label, vader_label, distilbert_label_readable,
      avg_sentiment (row mean of the numeric scores, ignoring NaN).

    When the transformer pipeline is unavailable, distilbert_label is
    "skipped" and the other DistilBERT columns are NaN. If the batched
    pipeline call fails, a warning is printed and each row is scored
    individually; an error from that per-row call is not caught.
    """
    df = pd.DataFrame({"text": list(texts)})
    df["clean_text"] = df["text"].apply(clean_text)

    # Lexicon-based scorers work row by row on the cleaned text.
    df["textblob_score"] = df["clean_text"].apply(textblob_score)
    df["vader_compound"] = df["clean_text"].apply(vader_compound_score)

    if distilbert_pipe is not None:
        clean_texts = df["clean_text"].tolist()
        try:
            # Batching is more efficient than per-row calls; the pipeline
            # accepts a list of strings.
            results = distilbert_pipe(clean_texts, truncation=True) if clean_texts else []
            predictions = [(r.get("label", ""), r.get("score", np.nan)) for r in results]
            if len(predictions) != len(clean_texts):
                raise ValueError("pipeline returned an unexpected number of results")
        except Exception as exc:
            # Fall back to one call per row (slower). Each row is scored
            # exactly once and label/prob/score all come from that one result.
            print(f"Warning: batched DistilBERT inference failed ({exc!r}); "
                  "falling back to per-row calls.")
            predictions = [_distilbert_predict_one(t) for t in clean_texts]

        df["distilbert_label"] = [label for label, _ in predictions]
        df["distilbert_prob"] = [prob for _, prob in predictions]
        # Explicit float dtype keeps the column numeric even for empty input,
        # so the row-wise mean below always works.
        df["distilbert_score"] = pd.Series(
            [_signed_distilbert_score(label, prob) for label, prob in predictions],
            index=df.index,
            dtype="float64",
        )
    else:
        # Transformer step skipped: mark these columns as missing.
        df["distilbert_score"] = np.nan
        df["distilbert_label"] = "skipped"
        df["distilbert_prob"] = np.nan

    # Human-readable labels. label_from_score() itself maps NaN to "unknown",
    # so no extra missing-value guard is needed for the DistilBERT column.
    df["textblob_label"] = df["textblob_score"].apply(
        lambda s: label_from_score(s, method="generic"))
    df["vader_label"] = df["vader_compound"].apply(
        lambda s: label_from_score(s, method="vader"))
    df["distilbert_label_readable"] = df["distilbert_score"].apply(
        lambda s: label_from_score(s, method="generic"))

    # Average numeric score across the methods that produced a value.
    score_columns = ["textblob_score", "vader_compound", "distilbert_score"]
    df["avg_sentiment"] = df[score_columns].mean(axis=1, skipna=True)

    return df

# --- Visualization helpers ---
def plot_distributions(df):
    """
    Draw a 2x2 figure comparing the three methods and display it.

    Panels:
      1. TextBlob label counts
      2. VADER label counts
      3. raw DistilBERT label counts, or a "skipped" notice when the
         distilbert_label column is absent or holds only placeholder values
      4. histogram (with KDE) of avg_sentiment, NaN rows dropped

    df: DataFrame with the columns textblob_label, vader_label and
        avg_sentiment; distilbert_label is optional.
    Returns None; the figure is shown with plt.show().
    """
    sns.set(style="whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    label_order = ["positive", "neutral", "negative"]

    # 1) Count plot: TextBlob labels
    ax = axes[0, 0]
    sns.countplot(x="textblob_label", data=df, order=label_order, ax=ax)
    ax.set_title("TextBlob label counts")

    # 2) Count plot: VADER labels
    ax = axes[0, 1]
    sns.countplot(x="vader_label", data=df, order=label_order, ax=ax)
    ax.set_title("VADER label counts")

    # 3) Count plot: DistilBERT raw labels (if any real predictions exist).
    # A column filled only with placeholders such as "skipped" still has one
    # unique value, so real labels are tested for explicitly instead of
    # relying on nunique().
    ax = axes[1, 0]
    placeholders = ["skipped", "unknown", ""]
    has_predictions = (
        "distilbert_label" in df.columns
        and (~df["distilbert_label"].isin(placeholders)).any()
    )
    if has_predictions:
        sns.countplot(x="distilbert_label", data=df, ax=ax)
        ax.set_title("DistilBERT label counts (raw)")
    else:
        # Axes coordinates keep the notice centered whatever the data limits.
        ax.text(0.5, 0.5, "DistilBERT skipped or unavailable",
                ha='center', va='center', transform=ax.transAxes)
        ax.set_title("DistilBERT")

    # 4) Histogram: average sentiment numeric
    ax = axes[1, 1]
    sns.histplot(df["avg_sentiment"].dropna(), kde=True, bins=20, ax=ax)
    ax.set_title("Histogram of average sentiment score")

    fig.tight_layout()
    plt.show()

def correlation_report(df):
    """
    Print the correlation matrix between the numeric scores of the methods.

    Only score columns that exist in *df* and contain at least one non-missing
    value are used. A column that is entirely NaN (e.g. distilbert_score when
    the transformer step was skipped) is left out, since it would only add NaN
    rows/columns to the matrix.

    Prints a notice and returns early when fewer than two usable columns
    remain. Always returns None.
    """
    score_columns = ["textblob_score", "vader_compound", "distilbert_score"]
    usable = [
        col for col in score_columns
        if col in df.columns and df[col].notna().any()
    ]
    if len(usable) < 2:
        print("Not enough score columns to compute correlation.")
        return
    print("\nCorrelation matrix between numeric scores:")
    print(df[usable].corr())

# --- Example / main execution ---
if __name__ == "__main__":
    # Demo inputs covering positive, negative, mixed and empty text.
    # Swap these for your own data (see the CSV snippet below).
    sample_texts = [
        "I love this product! It's fantastic and works amazingly well.",
        "This is the worst service I've ever experienced. Totally unacceptable.",
        "Not bad, but could be better.",
        "I'm not sure how I feel about this.",
        "Absolutely brilliant performance by the cast!",
        "The item arrived broken. Very disappointed.",
        "",  # empty string edge case
    ]

    # To analyze a CSV file with a 'text' column instead, uncomment:
    # df_input = pd.read_csv("your_file.csv")  # ensure 'text' column exists
    # sample_texts = df_input['text'].tolist()

    print("Cleaning + analyzing texts...")
    results_df = analyze_texts(sample_texts)

    # Show every column when printing the (wide) results frame.
    pd.set_option('display.max_columns', None)
    print("\nResults (first rows):")
    print(results_df.head(20))

    # Persist the full table next to the script.
    output_csv = "sentiment_results.csv"
    results_df.to_csv(output_csv, index=False)
    print(f"\nSaved results to {output_csv}")

    # Plots come first; the correlation report is printed after
    # plot_distributions() returns.
    plot_distributions(results_df)
    correlation_report(results_df)

    print("\nDone. If DistilBERT was skipped, install torch and transformers, then rerun.")
