In [4]:
import pandas as pd
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon quietly, then build the analyzer the scoring step uses.
nltk.download('vader_lexicon', quiet=True)
analyzer = SentimentIntensityAnalyzer()

# Domain tuning: valence overrides for SaaS support vocabulary, merged into
# the analyzer's lexicon below.
new_words = dict(
    increased=-2.0,  # in support tickets a price that 'increased' is usually bad
    limit=0.0,       # pinned to zero so 'limit' cannot sway the score positive
    pricing=-1.5,    # lean cost-related tickets toward negative
)
analyzer.lexicon.update(new_words)

def process_sentiment(data_dir=r"C:\Users\uduok\git_folder\SaaS_Churn_Prediction\data"):
    """Score support tickets with the tuned VADER analyzer and export them.

    Reads ``fact_support_tickets.csv`` from ``data_dir``, removes repeated
    (customer_id, ticket_text) rows, adds ``sentiment_score`` and
    ``sentiment_label`` columns, overrides ``priority`` where the score is
    clearly negative or neutral/positive, and writes
    ``fact_support_tickets_scored.csv`` into the same folder.

    Args:
        data_dir: Folder that holds the input CSV and receives the output CSV.
            The default is the folder the notebook was originally run against;
            pass another path to run it elsewhere.

    Returns:
        None. The result is written to disk and a status line is printed.
    """
    # Remove duplicates per customer (a customer shouldn't send the exact same
    # praise 4 times). Done before scoring: scoring is row-wise, so the
    # surviving rows are the same, but repeated tickets are no longer scored
    # only to be thrown away.
    tickets = pd.read_csv(
        os.path.join(data_dir, 'fact_support_tickets.csv')
    ).drop_duplicates(subset=['customer_id', 'ticket_text'])

    # Calculate sentiment with the tuned lexicon. str() keeps non-string cells
    # (e.g. NaN from an empty CSV field) from raising inside VADER.
    tickets['sentiment_score'] = tickets['ticket_text'].apply(
        lambda text: analyzer.polarity_scores(str(text))['compound']
    )

    # Align priority with sentiment.
    # Very negative (< -0.3) is forced to High; neutral or positive (>= -0.1)
    # is forced to Low. Scores in between keep whatever priority the input had.
    tickets.loc[tickets['sentiment_score'] < -0.3, 'priority'] = 'High'
    tickets.loc[tickets['sentiment_score'] >= -0.1, 'priority'] = 'Low'

    # Tighten neutral labels: higher bar for positive, more sensitive to negative.
    def label_sentiment(score):
        """Map a compound score to 'Positive', 'Negative' or 'Neutral'."""
        if score > 0.2:
            return 'Positive'
        if score < -0.1:
            return 'Negative'
        return 'Neutral'

    tickets['sentiment_label'] = tickets['sentiment_score'].apply(label_sentiment)

    tickets.to_csv(os.path.join(data_dir, 'fact_support_tickets_scored.csv'), index=False)
    print("Tuned Sentiment Analysis Complete: Duplicates removed and Priority aligned.")

# Run the scoring pipeline only when this code is executed directly
# (as a notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    process_sentiment()

Tuned Sentiment Analysis Complete: Duplicates removed and Priority aligned.


In [None]:
import os
import re

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon quietly and create the analyzer used for scoring.
nltk.download('vader_lexicon', quiet=True)
analyzer = SentimentIntensityAnalyzer()

# 1. DOMAIN TUNING: hand-set valences for SaaS "churn signal" vocabulary,
#    merged into the analyzer's lexicon below.
saas_overrides = {
    'pricing': -2.0,
    'costing': -2.0,
    'increased': -1.5,
    'revenue': -1.0,
    'sluggish': -2.5,
    'broken': -3.0,
    'unusable': -3.5,
    'questioning': -1.5,
    'failed': -2.5,
    # Technical-question vocabulary is forced to neutral.
    'limit': 0.0,
    'calls': 0.0,
}
analyzer.lexicon.update(saas_overrides)

def process_sentiment(data_dir=r"C:\Users\uduok\git_folder\SaaS_Churn_Prediction\data"):
    """Score support tickets, derive label and priority, and export the result.

    Reads ``fact_support_tickets.csv`` from ``data_dir``. If the file is
    missing, prints an error and returns without doing anything. Otherwise it
    removes repeated (customer_id, ticket_text) rows, computes a tuned
    ``sentiment_score``, derives ``sentiment_label`` and ``priority`` from it,
    and writes ``fact_support_tickets_scored.csv`` into the same folder.

    Args:
        data_dir: Folder that holds the input CSV and receives the output CSV.
            The default is the folder the notebook was originally run against;
            pass another path to run it elsewhere.

    Returns:
        None. The result is written to disk and a summary is printed.
    """
    input_path = os.path.join(data_dir, 'fact_support_tickets.csv')

    if not os.path.exists(input_path):
        print("Error: Run the data generator script first.")
        return

    # DEDUPLICATION: remove the "4x duplicate" issue. Done before scoring --
    # scoring is row-wise, so the surviving rows are the same, but repeated
    # tickets are no longer scored only to be thrown away.
    tickets = pd.read_csv(input_path).drop_duplicates(
        subset=['customer_id', 'ticket_text']
    )

    # Whole-word match for query vocabulary. A plain substring test on
    # "question"/"limit" also fired on "questioning" (a -1.5 churn signal in
    # the tuned lexicon), "limited" and "unlimited", wiping their score to 0.
    neutral_query = re.compile(r"\b(?:questions?|limits?)\b")

    # ENHANCED SCORING LOGIC
    def get_tuned_score(text):
        """Return the VADER compound score for ``text`` with hard overrides."""
        text_lower = text.lower()
        score = analyzer.polarity_scores(text)['compound']

        # Hard overrides for specific complaints VADER misses.
        if "pricing" in text_lower or "costing us" in text_lower:
            return min(score, -0.6)  # force a strong negative
        if neutral_query.search(text_lower):
            return 0.0  # force pure neutrality for queries
        return score

    # Coerce to text for scoring only: an empty CSV cell arrives as NaN (a
    # float), on which .lower() would raise AttributeError. The exported
    # ticket_text column is left untouched.
    ticket_text = tickets['ticket_text'].fillna('').astype(str)
    tickets['sentiment_score'] = ticket_text.apply(get_tuned_score)

    # LOGICAL LABELING (tighter thresholds)
    def label_sentiment(score):
        """Map a compound score to 'Positive', 'Negative' or 'Neutral'."""
        if score >= 0.3:
            return 'Positive'   # higher bar for praise
        if score <= -0.1:
            return 'Negative'   # more sensitive to complaints
        return 'Neutral'

    tickets['sentiment_label'] = tickets['sentiment_score'].apply(label_sentiment)

    # FIXING PRIORITY: align with sentiment.
    # Negative tickets become High priority; Positive/Neutral become Low.
    tickets['priority'] = tickets['sentiment_label'].apply(
        lambda label: 'High' if label == 'Negative' else 'Low'
    )

    # Export
    output_path = os.path.join(data_dir, 'fact_support_tickets_scored.csv')
    tickets.to_csv(output_path, index=False)

    print("-" * 30)
    print("SUCCESS: Tuned Sentiment Analysis Complete.")
    print("Logic: Overrides applied for Pricing/Revenue complaints.")
    print("Logic: Priority now synced to Sentiment (Negative = High).")
    print("-" * 30)

# Run the scoring pipeline only when this code is executed directly
# (as a notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    process_sentiment()