In [1]:

import pandas as pd
import requests
import json
import time
from typing import List, Dict, Any
import re
from collections import Counter
import numpy as np

class WalmartTopicExtractor:
    """Extract complaint topics by prompting an LLM with the CREST framework.

    The workflow is: clean the raw complaint strings, split them into
    batches, send one CREST prompt per batch to the OpenRouter chat
    completions endpoint, parse the JSON the model returns, and aggregate
    the per-batch analyses into counts and a plain-text report.
    """

    def __init__(self, api_key: str, model: str = "anthropic/claude-3.5-sonnet"):
        """Store the credentials and request settings.

        Args:
            api_key: OpenRouter API key, sent as a bearer token.
            model: Model identifier placed in every request payload.
        """
        self.api_key = api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def clean_complaints(self, complaints: List[str]) -> List[str]:
        """Return the usable complaints with whitespace normalised.

        Entries that are not strings, or whose stripped text is 10
        characters or shorter, are dropped. Runs of whitespace in the kept
        entries are collapsed to a single space.
        """
        return [
            re.sub(r'\s+', ' ', complaint.strip())
            for complaint in complaints
            if isinstance(complaint, str) and len(complaint.strip()) > 10
        ]

    def create_crest_prompt(self, complaints: List[str]) -> str:
        """Build the CREST prompt for one batch of complaints.

        Complaints are numbered from 1 so the model can refer to them by
        index in its answer. The prompt ends with the JSON schema the
        model is asked to follow.
        """
        complaints_text = "\n".join(
            f"{number}. {complaint}" for number, complaint in enumerate(complaints, 1)
        )

        return f"""
You are an expert retail analyst. Analyze these Walmart customer complaints using the CREST framework:

COMPLAINTS:
{complaints_text}

Apply the CREST Framework:

**C - CATEGORIZE**: Identify main complaint categories
**R - RELATE**: Find patterns and relationships between complaints  
**E - EXTRACT**: Extract specific topics, themes, and sentiment
**S - SYNTHESIZE**: Combine findings into overarching themes
**T - TRANSFORM**: Provide actionable insights

Please provide analysis in this JSON format:

{{
  "categorize": {{
    "categories": [
      {{
        "category": "Customer Service",
        "complaints": [1, 2],
        "frequency": 2,
        "description": "Issues with staff behavior and service quality"
      }}
    ]
  }},
  "relate": {{
    "patterns": [
      {{
        "pattern": "Staff-related issues",
        "complaints": [1, 3],
        "connection": "Multiple complaints about employee behavior"
      }}
    ],
    "root_causes": [
      {{
        "cause": "Understaffing",
        "manifestations": ["long wait times", "poor service"],
        "affected_complaints": [1, 2]
      }}
    ]
  }},
  "extract": {{
    "topics": [
      {{
        "topic": "Checkout wait times",
        "frequency": 3,
        "sentiment": "negative",
        "intensity": "high",
        "keywords": ["long lines", "wait", "checkout"]
      }}
    ],
    "sentiment_summary": {{
      "overall_sentiment": "negative",
      "positive_mentions": 1,
      "negative_mentions": 8,
      "neutral_mentions": 1
    }}
  }},
  "synthesize": {{
    "main_themes": [
      {{
        "theme": "Operational Efficiency",
        "sub_themes": ["staffing", "checkout process", "store organization"],
        "impact_score": 8,
        "prevalence": "70%"
      }}
    ],
    "priority_issues": [
      {{
        "issue": "Long checkout lines",
        "impact": "high",
        "frequency": "high",
        "priority_score": 9
      }}
    ]
  }},
  "transform": {{
    "recommendations": [
      {{
        "action": "Increase checkout staff during peak hours",
        "impact": "high",
        "complexity": "medium",
        "timeline": "immediate"
      }}
    ],
    "metrics": [
      {{
        "metric": "Average checkout wait time",
        "target": "< 5 minutes",
        "measurement": "weekly"
      }}
    ]
  }}
}}
"""

    @staticmethod
    def _extract_json(content: Any) -> Any:
        """Return the JSON object embedded in *content*, or None.

        The text between the first '{' and the last '}' is parsed, which
        tolerates prose or code fences around the object. None is returned
        when *content* is not a string, has no brace pair, or the span is
        not valid JSON.
        """
        if not isinstance(content, str):
            return None
        start = content.find('{')
        end = content.rfind('}')
        if start == -1 or end < start:
            return None
        try:
            return json.loads(content[start:end + 1])
        except json.JSONDecodeError:
            return None

    def call_llm(self, prompt: str, max_retries: int = 3) -> Dict[str, Any]:
        """Send *prompt* to OpenRouter and return the parsed reply.

        Args:
            prompt: User message content.
            max_retries: Maximum number of attempts.

        Returns:
            On success ``{"success": True, "data": ..., "raw_content": ...}``
            where ``data`` is the parsed JSON object or None when the reply
            contained no parseable JSON. On failure
            ``{"success": False, "error": <message>}``.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.3,
            "max_tokens": 4000
        }

        for attempt in range(max_retries):
            try:
                response = requests.post(
                    self.base_url,
                    headers=self.headers,
                    json=payload,
                    timeout=60
                )
            except requests.RequestException as e:
                print(f"Request failed: {str(e)}")
            else:
                status = response.status_code
                if status == 200:
                    try:
                        content = response.json()['choices'][0]['message']['content']
                    except (ValueError, KeyError, IndexError, TypeError) as e:
                        # A 200 whose body is not the expected shape is
                        # treated like a failed request and retried.
                        print(f"Request failed: {str(e)}")
                    else:
                        return {
                            "success": True,
                            "data": self._extract_json(content),
                            "raw_content": content
                        }
                else:
                    print(f"API Error: {status} - {response.text}")
                    # Client errors such as a bad key or malformed request
                    # will not succeed on retry; timeouts (408) and rate
                    # limits (429) may.
                    if 400 <= status < 500 and status not in (408, 429):
                        return {
                            "success": False,
                            "error": f"Non-retryable API error: {status}"
                        }

            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

        return {"success": False, "error": "Failed after retries"}

    def analyze_complaints_batch(self, complaints: List[str], batch_size: int = 10) -> List[Dict]:
        """Analyze *complaints* in consecutive batches of *batch_size*.

        Returns one dict per successfully analyzed batch with the keys
        ``batch_number``, ``complaints``, ``analysis`` and ``raw_response``.
        Batches whose API call failed are reported and omitted.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results = []
        total = len(complaints)

        for batch_number, start in enumerate(range(0, total, batch_size), 1):
            batch = complaints[start:start + batch_size]
            print(f"Processing batch {batch_number} ({len(batch)} complaints)...")

            result = self.call_llm(self.create_crest_prompt(batch))

            if result["success"]:
                results.append({
                    "batch_number": batch_number,
                    "complaints": batch,
                    "analysis": result.get("data"),
                    "raw_response": result.get("raw_content")
                })
            else:
                print(f"Failed to analyze batch {batch_number}")

            # Rate limiting between requests; nothing follows the last batch.
            if start + batch_size < total:
                time.sleep(1)

        return results

    def extract_topics_from_dataframe(self, df: pd.DataFrame, complaint_column: str = 'walmart_complaint', batch_size: int = 8) -> Dict[str, Any]:
        """Run the full analysis on one column of *df*.

        Args:
            df: DataFrame holding the complaints.
            complaint_column: Name of the column with complaint text.
            batch_size: Number of complaints sent per LLM request.

        Returns:
            Dict with ``total_complaints``, ``batches_processed``,
            ``aggregated_analysis`` and ``batch_details``.
        """
        print("Starting Walmart complaint analysis...")

        # Clean and prepare data
        complaints = df[complaint_column].dropna().tolist()
        cleaned_complaints = self.clean_complaints(complaints)

        print(f"Processing {len(cleaned_complaints)} complaints...")

        batch_results = self.analyze_complaints_batch(cleaned_complaints, batch_size=batch_size)
        aggregated_results = self.aggregate_batch_results(batch_results)

        return {
            "total_complaints": len(cleaned_complaints),
            "batches_processed": len(batch_results),
            "aggregated_analysis": aggregated_results,
            "batch_details": batch_results
        }

    @staticmethod
    def _section_items(analysis: Any, section: str, key: str) -> List[Any]:
        """Return ``analysis[section][key]`` if it is a list, else ``[]``.

        The analysis comes from model output, so every level is type-checked
        instead of trusted.
        """
        if not isinstance(analysis, dict):
            return []
        part = analysis.get(section)
        if not isinstance(part, dict):
            return []
        items = part.get(key)
        return items if isinstance(items, list) else []

    @staticmethod
    def _count_names(items: List[Any], key: str) -> Counter:
        """Count the ``key`` values of the dict entries in *items*.

        Non-dict entries are ignored; a missing or null name is counted
        under "Unknown".
        """
        counts = Counter()
        for item in items:
            if not isinstance(item, dict):
                continue
            name = item.get(key)
            counts["Unknown" if name is None else str(name)] += 1
        return counts

    def aggregate_batch_results(self, batch_results: List[Dict]) -> Dict[str, Any]:
        """Merge the per-batch analyses into one summary.

        Category and topic counts are the number of times each name appears
        across the batch analyses (one entry per batch that listed it), not
        the number of individual complaints. Batches without a parsed
        analysis are skipped, as are malformed sections within an analysis.
        """
        all_categories = []
        all_topics = []
        all_themes = []
        all_recommendations = []

        for batch in batch_results:
            analysis = batch.get("analysis")
            if not analysis:
                continue
            all_categories.extend(self._section_items(analysis, "categorize", "categories"))
            all_topics.extend(self._section_items(analysis, "extract", "topics"))
            all_themes.extend(self._section_items(analysis, "synthesize", "main_themes"))
            all_recommendations.extend(self._section_items(analysis, "transform", "recommendations"))

        category_counts = self._count_names(all_categories, "category")
        topic_counts = self._count_names(all_topics, "topic")

        return {
            "category_summary": dict(category_counts),
            "topic_summary": dict(topic_counts),
            "total_categories": len(category_counts),
            "total_topics": len(topic_counts),
            "all_themes": all_themes,
            "all_recommendations": all_recommendations,
            "top_categories": category_counts.most_common(5),
            "top_topics": topic_counts.most_common(10)
        }

    def save_results(self, results: Dict[str, Any], filename: str = "walmart_topic_analysis.json"):
        """Write *results* to *filename* as indented UTF-8 JSON."""
        # Explicit UTF-8 keeps non-ASCII complaint text readable and makes
        # the output independent of the platform's default encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Results saved to {filename}")

    def create_summary_report(self, results: Dict[str, Any]) -> str:
        """Render *results* as a plain-text report.

        *results* must have the shape returned by
        ``extract_topics_from_dataframe``. Up to five recommendations are
        listed, numbered consecutively; entries that are not dicts are
        skipped.
        """
        agg = results["aggregated_analysis"]

        parts = [f"""
WALMART COMPLAINT ANALYSIS REPORT
================================

OVERVIEW:
- Total Complaints Analyzed: {results['total_complaints']}
- Processing Batches: {results['batches_processed']}
- Categories Identified: {agg['total_categories']}
- Topics Identified: {agg['total_topics']}

TOP COMPLAINT CATEGORIES:
"""]

        parts.extend(
            f"- {category}: {count} complaints\n"
            for category, count in agg['top_categories']
        )

        parts.append("\nTOP TOPICS:\n")
        parts.extend(
            f"- {topic}: {count} mentions\n"
            for topic, count in agg['top_topics']
        )

        parts.append("\nKEY RECOMMENDATIONS:\n")
        # Filter before numbering so skipped entries leave no gaps.
        usable = [rec for rec in agg['all_recommendations'] if isinstance(rec, dict)]
        parts.extend(
            f"{number}. {rec.get('action', 'No action specified')}\n"
            for number, rec in enumerate(usable[:5], 1)
        )

        return "".join(parts)


# MAIN EXECUTION CODE
def main():
    """Run the demo analysis on sample complaints and return the results.

    The OpenRouter API key is read from the ``OPENROUTER_API_KEY``
    environment variable so that no credential is stored in the source.
    Any key that was previously committed here should be revoked.

    Returns:
        The result dict produced by
        ``WalmartTopicExtractor.extract_topics_from_dataframe``.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set.
    """
    import os

    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENROUTER_API_KEY environment variable before running the analysis."
        )

    # Initialize extractor
    extractor = WalmartTopicExtractor(api_key)

    # Load your data (replace with your actual file path)
    # df = pd.read_csv('your_walmart_data.csv')

    # For demo purposes, create sample data
    sample_data = {
        'session_id': [1, 2, 3, 4, 5],
        'walmart_complaint': [
            "The checkout lines are always too long and there's never enough cashiers",
            "Customer service is terrible, staff are rude and unhelpful",
            "Store is always messy and items are never where they should be",
            "Prices keep going up but quality is going down",
            "Self-checkout machines are always broken and there's no help"
        ]
    }
    df = pd.DataFrame(sample_data)

    print("Sample DataFrame:")
    print(df[['session_id', 'walmart_complaint']].head())

    # Extract topics
    results = extractor.extract_topics_from_dataframe(df, 'walmart_complaint')

    # Save results
    extractor.save_results(results, "walmart_analysis_results.json")

    # Create and print summary report
    summary = extractor.create_summary_report(results)
    print("\n" + summary)

    return results

# Run the analysis
if __name__ == "__main__":
    results = main()



Sample DataFrame:
   session_id                                  walmart_complaint
0           1  The checkout lines are always too long and the...
1           2  Customer service is terrible, staff are rude a...
2           3  Store is always messy and items are never wher...
3           4     Prices keep going up but quality is going down
4           5  Self-checkout machines are always broken and t...
Starting Walmart complaint analysis...
Processing 5 complaints...
Processing batch 1 (5 complaints)...
Results saved to walmart_analysis_results.json


WALMART COMPLAINT ANALYSIS REPORT

OVERVIEW:
- Total Complaints Analyzed: 5
- Processing Batches: 1
- Categories Identified: 4
- Topics Identified: 2

TOP COMPLAINT CATEGORIES:
- Customer Service: 1 complaints
- Store Operations: 1 complaints
- Value Proposition: 1 complaints
- Technology: 1 complaints

TOP TOPICS:
- Staffing Levels: 1 mentions
- Service Quality: 1 mentions

KEY RECOMMENDATIONS:
1. Implement new staffing model based on 