# In [None]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
import re
import json

# Sample DataFrame setup: 20 tickets, each tagged with a free-text business
# service name. The names deliberately mix spelling/casing variants of the same
# service (e.g. 'Microsoft Excel', 'MS Excel', 'Excel 365') so that the
# standardization functions below have something to group.
data = {
    'Ticket': range(1, 21),
    'Business Service': [
        'Microsoft Outlook', 'Microsoft Windows', 'Laptop', 'Python',
        'voice log', 'crm service', 'cwhh vdi', 'vantage agent portal',
        'Microsoft Excel', 'MS Excel', 'Excel 365', 'outlook email',
        'Outlook client', 'Windows 10', 'Microsoft Windows 11',
        'laptop hardware', 'laptop support', 'python development',
        'Python scripting', 'CRM system'
    ]
}
# Module-level frame used by the __main__ driver of this cell.
df = pd.DataFrame(data)

# Method 1: LLM-based approach
def llm_based_standardization(df, chat_function):
    """
    Use LLM to group similar services and standardize them.

    Args:
        df: DataFrame with a 'Business Service' column. It is modified in
            place: a 'New Services' column is added.
        chat_function: Callable that takes the prompt string and returns the
            model's reply as text.

    Returns:
        Tuple of (df, service_mapping), where service_mapping maps each
        original service name to its standardized name. If the LLM call or
        the parsing of its reply fails, the error is printed, 'New Services'
        is filled with the unchanged original names and the mapping is {}.
    """
    # Get unique services and their counts. Counts are converted to plain
    # ints so the prompt shows {'Laptop': 3} rather than numpy scalar reprs.
    service_counts = df['Business Service'].value_counts()
    counts_for_prompt = {name: int(count) for name, count in service_counts.items()}

    # Create prompt for LLM to group similar services
    prompt = f"""
    I have a list of business services that need to be standardized. Please group similar services together and for each group, suggest the most representative name (preferably the one that appears most frequently).

    Services with their counts:
    {counts_for_prompt}

    Please return a JSON object where:
    - Keys are the standardized service names
    - Values are lists of all variations that should map to that standardized name

    Example format:
    {{
        "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
        "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"]
    }}

    Focus on grouping services that refer to the same underlying technology or service.
    """

    try:
        # Call the LLM function
        response = chat_function(prompt)

        # Models often wrap the JSON in prose or ``` fences, so parse only the
        # span from the first '{' to the last '}'.
        start = response.find('{')
        end = response.rfind('}')
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in LLM response")
        standardization_map = json.loads(response[start:end + 1])

        # Create a mapping from original to standardized
        service_mapping = {}
        for standard_name, variations in standardization_map.items():
            # A bare string would otherwise be iterated character by character.
            if isinstance(variations, str):
                variations = [variations]
            for variation in variations:
                service_mapping[variation] = standard_name

        # Apply mapping to create new column; unmapped names stay unchanged
        df['New Services'] = df['Business Service'].map(
            lambda x: service_mapping.get(x, x)
        )

        return df, service_mapping

    except Exception as e:
        # chat_function is caller-supplied and may raise anything, so this is
        # a deliberate catch-all at the boundary.
        print(f"Error in LLM processing: {e}")
        # Keep the output schema stable: the column exists even on failure.
        df['New Services'] = df['Business Service']
        return df, {}

# Method 2: Traditional NLP approach
def traditional_nlp_standardization(df, similarity_threshold=80):
    """
    Use fuzzy matching and pattern recognition to standardize services.

    Services are grouped greedily: each not-yet-grouped name seeds a group and
    absorbs every remaining name that is similar to the seed. Within a group
    the name with the highest ticket count becomes the standard name.

    Args:
        df: DataFrame with a 'Business Service' column. It is modified in
            place: a 'New Services' column is added.
        similarity_threshold: Minimum fuzzy score (0-100) for two names to be
            considered the same service.

    Returns:
        Tuple of (df, service_mapping, groups). service_mapping maps grouped
        original names to their standard name; groups is the list of groups
        (each a list of names) that contain more than one name.
    """
    # Get service counts
    service_counts = df['Business Service'].value_counts()
    unique_services = list(service_counts.index)

    # Preprocessing function
    def preprocess_service(service):
        # Convert to lowercase, remove extra spaces, common prefixes/suffixes
        service = service.lower().strip()
        service = re.sub(r'\bmicrosoft\b|\bms\b', 'microsoft', service)
        service = re.sub(r'\b365\b|\boffice\b', '', service).strip()
        service = re.sub(r'\s+', ' ', service)
        return service

    # Normalise every name once instead of on each pair in the O(n^2) loop.
    normalized = {name: preprocess_service(name) for name in unique_services}
    lowered = {name: name.lower() for name in unique_services}
    word_sets = {name: set(normalized[name].split()) for name in unique_services}

    # Create groups of similar services
    groups = []
    used_services = set()

    for service in unique_services:
        if service in used_services:
            continue

        # Find similar services
        similar_services = [service]
        used_services.add(service)

        for other_service in unique_services:
            if other_service in used_services:
                continue

            # Check similarity using multiple methods
            max_similarity = max(
                fuzz.ratio(normalized[service], normalized[other_service]),
                fuzz.partial_ratio(lowered[service], lowered[other_service]),
                fuzz.token_sort_ratio(lowered[service], lowered[other_service]),
                fuzz.token_set_ratio(lowered[service], lowered[other_service]),
            )

            # Also check for keyword overlap. Names such as "Office 365"
            # preprocess to an empty string, so both word sets can be empty;
            # treat that as no overlap instead of dividing by zero.
            service_words = word_sets[service]
            other_words = word_sets[other_service]
            larger = max(len(service_words), len(other_words))
            word_overlap = len(service_words & other_words) / larger if larger else 0.0

            if max_similarity >= similarity_threshold or word_overlap >= 0.6:
                similar_services.append(other_service)
                used_services.add(other_service)

        if len(similar_services) > 1:
            groups.append(similar_services)

    # For each group, find the service with highest count
    service_mapping = {}
    for group in groups:
        # Get counts for services in this group
        group_counts = {service: service_counts[service] for service in group}
        # Find the service with maximum count (first one wins on ties)
        standard_service = max(group_counts, key=group_counts.get)

        # Map all services in group to the standard one
        for service in group:
            service_mapping[service] = standard_service

    # Apply mapping; names outside any group stay unchanged
    df['New Services'] = df['Business Service'].map(
        lambda x: service_mapping.get(x, x)
    )

    return df, service_mapping, groups

# Method 3: Hybrid approach
def hybrid_standardization(df, chat_function, similarity_threshold=75):
    """
    Combine traditional NLP for initial grouping and LLM for validation/refinement.

    Args:
        df: DataFrame with a 'Business Service' column. On a successful LLM
            refinement it is modified in place ('New Services' is added).
        chat_function: Callable that takes the prompt string and returns the
            model's reply as text.
        similarity_threshold: Fuzzy score threshold passed to
            traditional_nlp_standardization.

    Returns:
        Tuple of (dataframe, mapping). When the LLM refinement succeeds this
        is the input df plus the refined mapping. When no fuzzy groups were
        found, or the LLM step fails, it is the fuzzy-matched copy of df plus
        the fuzzy mapping.
    """
    # Step 1: Use traditional NLP for initial grouping (on a copy)
    df_temp, initial_mapping, groups = traditional_nlp_standardization(
        df.copy(), similarity_threshold
    )

    # Nothing to refine: return the fuzzy result as-is
    if not groups:
        return df_temp, initial_mapping

    # Plain ints keep numpy scalar reprs out of the prompt text.
    counts_for_prompt = {
        name: int(count)
        for name, count in df['Business Service'].value_counts().items()
    }

    # Step 2: Use LLM to validate and refine the groups
    prompt = f"""
        I've used fuzzy matching to group similar business services. Please review these groups and suggest improvements:

        Groups found:
        {json.dumps(groups, indent=2)}

        Service counts:
        {counts_for_prompt}

        Please return a JSON object with refined groupings where:
        - Keys are the best representative names for each service category
        - Values are lists of all variations that should map to that name
        
        Validate that the groupings make sense semantically.
        """

    try:
        response = chat_function(prompt)

        # Models often wrap the JSON in prose or ``` fences, so parse only the
        # span from the first '{' to the last '}'.
        start = response.find('{')
        end = response.rfind('}')
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in LLM response")
        refined_mapping = json.loads(response[start:end + 1])

        # Create final mapping
        final_service_mapping = {}
        for standard_name, variations in refined_mapping.items():
            # A bare string would otherwise be iterated character by character.
            if isinstance(variations, str):
                variations = [variations]
            for variation in variations:
                final_service_mapping[variation] = standard_name

        df['New Services'] = df['Business Service'].map(
            lambda x: final_service_mapping.get(x, x)
        )

        return df, final_service_mapping

    except Exception as e:
        # chat_function is caller-supplied and may raise anything; fall back
        # to the fuzzy-matching result.
        print(f"Error in LLM refinement: {e}")
        return df_temp, initial_mapping

# Example usage and comparison
def compare_methods(df):
    """
    Run the fuzzy-matching standardizer on a copy of ``df`` and print a report.

    The report shows the original value counts, the groups found, the
    standardized value counts and the change in the number of unique names.

    Returns:
        Tuple of (standardized copy of df, original -> standard name mapping).
    """
    original_names = df['Business Service']

    print("Original services:")
    print(original_names.value_counts())
    print("\n" + "="*50 + "\n")

    # Traditional NLP method, applied to a copy so ``df`` itself is untouched
    standardized, name_map, found_groups = traditional_nlp_standardization(df.copy())
    new_names = standardized['New Services']

    print("Traditional NLP Results:")
    print("Groups found:", found_groups)
    print("Standardized services:")
    print(new_names.value_counts())

    before = original_names.nunique()
    after = new_names.nunique()
    print(f"Reduced from {before} to {after} unique services")

    return standardized, name_map

# Mock chat function for testing (replace with your actual LLM endpoint)
def mock_chat_function(prompt):
    """
    Stand-in for a real LLM call, intended to be swapped for your own chat function.

    The prompt is ignored; the same canned JSON grouping of the sample
    services is returned on every call.
    """
    # Canned reply - a real LLM would derive this from the prompt
    return '''
    {
        "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
        "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"],
        "Microsoft Windows": ["Microsoft Windows", "Windows 10", "Microsoft Windows 11"],
        "Laptop": ["Laptop", "laptop hardware", "laptop support"],
        "Python": ["Python", "python development", "Python scripting"],
        "CRM Service": ["crm service", "CRM system"],
        "Voice Log": ["voice log"],
        "CWHH VDI": ["cwhh vdi"],
        "Vantage Agent Portal": ["vantage agent portal"]
    }
    '''

# Run comparison
if __name__ == "__main__":
    # Run the fuzzy-matching pipeline on the sample frame
    df_result, mapping = compare_methods(df)

    # Preview the original name next to its standardized counterpart
    preview_columns = ['Business Service', 'New Services']
    print("\n" + "="*50 + "\n")
    print("Final DataFrame with standardized services:")
    print(df_result[preview_columns].head(10))

    # Test LLM method (uncomment when you have actual chat function)
    # df_llm, mapping_llm = llm_based_standardization(df.copy(), your_chat_function)
    # print("LLM Results:", df_llm['New Services'].value_counts())

# In [None]:
import pandas as pd
import json
import re
from typing import Dict, Tuple, List

def llm_service_standardization(df: pd.DataFrame, chat_function, 
                              service_column: str = 'Business Service',
                              new_column: str = 'New Services') -> Tuple[pd.DataFrame, Dict]:
    """
    Standardize service names with an LLM, falling back to the original names.

    Pipeline: build the prompt from the value counts, call the LLM, parse and
    validate its JSON reply, write the standardized names into ``new_column``
    and print a summary. ``df`` is modified in place.

    Args:
        df: DataFrame containing the service data
        chat_function: Callable taking the prompt and returning the LLM reply
        service_column: Name of column containing services to standardize
        new_column: Name of new column for standardized services

    Returns:
        Tuple of (updated_dataframe, service_mapping_dict). If any step
        raises, the error is printed, ``new_column`` becomes a copy of
        ``service_column`` and the mapping is {}.
    """
    counts = df[service_column].value_counts()
    known_services = counts.index.tolist()
    prompt = create_standardization_prompt(counts)

    try:
        print("Calling LLM for service standardization...")
        reply = chat_function(prompt)
        print(f"LLM Response received: {len(reply)} characters")

        # Parse, then reconcile the reply with the names actually present
        parsed = parse_llm_response(reply)
        mapping = validate_mapping(parsed, known_services)

        df = apply_service_mapping(df, mapping, service_column, new_column)
        print_results(df, service_column, new_column, mapping)
    except Exception as e:
        print(f"Error in LLM standardization: {e}")
        # Fallback: copy original column
        df[new_column] = df[service_column]
        return df, {}

    return df, mapping

def create_standardization_prompt(service_counts) -> str:
    """Build the grouping instructions sent to the LLM.

    Every (service, count) pair from ``service_counts.items()`` becomes one
    "- name: N occurrences" bullet inside a fixed instruction template that
    asks for a JSON object mapping standard names to their variations.
    """
    bullet_lines = []
    for service, count in service_counts.items():
        bullet_lines.append(f"- {service}: {count} occurrences")
    services_text = "\n".join(bullet_lines)

    return f"""
You are helping standardize business service names in a IT ticketing system. 

TASK: Group similar services together and choose the best representative name for each group.

SERVICES AND THEIR FREQUENCIES:
{services_text}

RULES:
1. Group services that refer to the same underlying technology/service
2. For each group, choose the name with the HIGHEST frequency as the standard
3. If frequencies are equal, choose the most descriptive/official name
4. Keep unrelated services separate
5. Be conservative - only group if you're confident they're the same service

EXAMPLES of what should be grouped:
- "Microsoft Excel", "MS Excel", "Excel 365" → all refer to Excel
- "Microsoft Outlook", "Outlook email", "Outlook client" → all refer to Outlook
- "Windows 10", "Microsoft Windows 11", "Microsoft Windows" → all refer to Windows OS

EXAMPLES of what should NOT be grouped:
- "Python" and "Java" → different programming languages
- "Laptop" and "Desktop" → different hardware types
- "CRM" and "ERP" → different software categories

OUTPUT FORMAT:
Return ONLY a valid JSON object with this exact structure:
{{
    "Standard Service Name 1": ["variation1", "variation2", "variation3"],
    "Standard Service Name 2": ["variation1", "variation2"],
    "Ungrouped Service": ["Ungrouped Service"]
}}

IMPORTANT: 
- Include ALL original services in your response
- Each service should appear exactly once
- Use the exact service names from the input list
- Return only the JSON, no additional text
"""

def parse_llm_response(response: str) -> Dict:
    """Parse LLM response and extract the JSON mapping.

    The reply may surround the JSON object with markdown fences or prose, and
    that prose may itself contain braces. Each '{' is therefore tried in turn
    as the start of a JSON document, and the first one that decodes to an
    object is returned. Text after the object is ignored.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        The first JSON object found in ``response``, as a dict.

    Raises:
        ValueError: If ``response`` is not a string or contains no decodable
            JSON object.
    """
    if not isinstance(response, str):
        raise ValueError("Could not parse valid JSON from LLM response")

    decoder = json.JSONDecoder()
    position = response.find('{')
    while position != -1:
        try:
            # raw_decode stops at the end of the first complete document, so
            # trailing prose (even with braces) cannot break the parse.
            candidate, _ = decoder.raw_decode(response, position)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This brace belonged to prose or malformed JSON; try the next one.
        position = response.find('{', position + 1)

    raise ValueError("Could not parse valid JSON from LLM response")

def validate_mapping(mapping: Dict, original_services: List[str]) -> Dict:
    """Validate and fix the LLM mapping.

    Builds a new mapping (the input dict is not modified) in which:
    - variations that are not in ``original_services`` are dropped,
    - a variation listed in several groups stays only in the first one,
    - groups left without any valid variation are dropped,
    - every original service missing from the LLM output is added. If its
      name is already a group key it joins that group; otherwise it becomes
      a standalone group.

    Args:
        mapping: standard name -> list of variations, as returned by the LLM.
        original_services: The service names actually present in the data.

    Returns:
        The cleaned standard name -> list of variations mapping.
    """
    original_set = set(original_services)
    cleaned_mapping = {}
    assigned = set()

    for standard_name, variations in mapping.items():
        # A bare string would otherwise be iterated character by character.
        if isinstance(variations, str):
            variations = [variations]
        elif not isinstance(variations, (list, tuple, set)):
            continue

        valid_variations = []
        for variation in variations:
            if variation in original_set and variation not in assigned:
                assigned.add(variation)
                valid_variations.append(variation)
        if valid_variations:
            cleaned_mapping[standard_name] = valid_variations

    # Walk the caller's list (not a set) so the output order is deterministic.
    for service in original_services:
        if service in assigned:
            continue
        assigned.add(service)
        # LLMs often omit the standard name from its own variation list;
        # append to that group rather than overwriting it.
        cleaned_mapping.setdefault(service, []).append(service)
        print(f"Added missing service: {service}")

    return cleaned_mapping

def apply_service_mapping(df: pd.DataFrame, mapping: Dict, 
                         service_column: str, new_column: str) -> pd.DataFrame:
    """Write the standardized name of every row into ``new_column``.

    ``mapping`` (standard name -> variations) is inverted into a variation ->
    standard name lookup and applied to ``service_column``. Rows whose value
    has no entry keep their original value, and a warning with their count is
    printed. ``df`` is modified in place and returned.
    """
    # Invert the mapping; a variation listed twice resolves to the later group
    lookup = {
        variation: standard_name
        for standard_name, variations in mapping.items()
        for variation in variations
    }

    df[new_column] = df[service_column].map(lookup)

    # Handle any unmapped services (shouldn't happen with validation)
    unmapped = df[new_column].isna()
    if not unmapped.any():
        return df

    print(f"Warning: {unmapped.sum()} services couldn't be mapped")
    df.loc[unmapped, new_column] = df.loc[unmapped, service_column]
    return df

def print_results(df: pd.DataFrame, original_col: str, new_col: str, mapping: Dict):
    """Print standardization results.

    Prints the number of unique names before and after standardization, the
    groups that merged more than one variation, and the ten most frequent
    standardized names.

    Args:
        df: DataFrame holding both the original and the standardized column.
        original_col: Name of the column with the original service names.
        new_col: Name of the column with the standardized service names.
        mapping: standard name -> list of variations.
    """
    print("\n" + "="*60)
    print("STANDARDIZATION RESULTS")
    print("="*60)

    original_unique = df[original_col].nunique()
    new_unique = df[new_col].nunique()
    reduction = original_unique - new_unique
    # An empty frame has no unique services; report 0% instead of raising
    # ZeroDivisionError.
    reduction_pct = reduction / original_unique * 100 if original_unique else 0.0

    print(f"Original unique services: {original_unique}")
    print(f"Standardized unique services: {new_unique}")
    print(f"Reduction: {reduction} services ({reduction_pct:.1f}%)")

    print("\nGROUPINGS MADE:")
    for standard_name, variations in mapping.items():
        # Single-member groups are not real merges, so they are not listed.
        if len(variations) > 1:
            print(f"\n'{standard_name}' ← {variations}")

    print("\nFINAL SERVICE DISTRIBUTION:")
    print(df[new_col].value_counts().head(10))

# USAGE EXAMPLE WITH YOUR CHAT FUNCTION
def example_usage():
    """
    Demonstrate llm_service_standardization end to end on sample data.

    Builds a 20-ticket sample frame, runs the standardization with a local
    mock chat function (replace it with your real LLM call) and returns the
    resulting (dataframe, mapping) pair.
    """
    sample_services = [
        'Microsoft Outlook', 'Microsoft Windows', 'Laptop', 'Python',
        'voice log', 'crm service', 'cwhh vdi', 'vantage agent portal',
        'Microsoft Excel', 'MS Excel', 'Excel 365', 'outlook email',
        'Outlook client', 'Windows 10', 'Microsoft Windows 11',
        'laptop hardware', 'laptop support', 'python development',
        'Python scripting', 'CRM system'
    ]
    tickets = pd.DataFrame({
        'Ticket': range(1, 21),
        'Business Service': sample_services,
    })

    def your_chat_function(prompt):
        """
        Placeholder for your actual LLM endpoint, e.g. chat(prompt),
        openai.chat.completions.create(...) or your_llm_api_call(prompt).
        """
        # Canned reply so the example runs without network access
        return '''
        {
            "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
            "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"],
            "Microsoft Windows": ["Microsoft Windows", "Windows 10", "Microsoft Windows 11"],
            "Laptop": ["Laptop", "laptop hardware", "laptop support"],
            "Python": ["Python", "python development", "Python scripting"],
            "CRM service": ["crm service", "CRM system"],
            "voice log": ["voice log"],
            "cwhh vdi": ["cwhh vdi"],
            "vantage agent portal": ["vantage agent portal"]
        }
        '''

    # Run the standardization; swap in your actual chat function here
    return llm_service_standardization(
        df=tickets,
        chat_function=your_chat_function,
        service_column='Business Service',
        new_column='New Services'
    )

# INTEGRATION PATTERNS FOR DIFFERENT LLM PROVIDERS

def openai_integration_example():
    """Example for OpenAI API.

    Returns:
        A ``chat_function``-compatible callable (prompt -> reply text) that
        can be passed to the standardization functions. Previously the
        callable was built and discarded, so this function returned None.
    """
    # Imported lazily so the rest of the file works without the openai package.
    import openai

    def chat_with_openai(prompt):
        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1  # Low temperature for consistent results
        )
        return response.choices[0].message.content

    return chat_with_openai

def anthropic_integration_example():
    """Example for Anthropic Claude API.

    Returns:
        A ``chat_function``-compatible callable (prompt -> reply text) that
        can be passed to the standardization functions. Previously the
        callable was built and discarded, so this function returned None.
    """
    # Imported lazily so the rest of the file works without the anthropic package.
    import anthropic

    # Placeholder credential - replace with your real key before use.
    client = anthropic.Anthropic(api_key="your-api-key")

    def chat_with_claude(prompt):
        response = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=2000,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    return chat_with_claude

def custom_endpoint_example():
    """Example for custom API endpoint.

    Returns:
        A ``chat_function``-compatible callable (prompt -> reply text) that
        can be passed to the standardization functions. Previously the
        callable was built and discarded, so this function returned None.
    """
    # Imported lazily so the rest of the file works without the requests package.
    import requests

    def chat_with_custom_api(prompt):
        response = requests.post(
            "https://your-api-endpoint.com/chat",  # placeholder URL
            json={"prompt": prompt, "max_tokens": 2000},
            headers={"Authorization": "Bearer your-token"},  # placeholder token
            # requests has no default timeout; without one a stalled server
            # would block the caller forever.
            timeout=60,
        )
        # Raise on 4xx/5xx instead of failing later with a confusing KeyError.
        response.raise_for_status()
        return response.json()["response"]

    return chat_with_custom_api

# Run the example
if __name__ == "__main__":
    # Run the mock-backed example, then preview original vs. standardized names
    df_result, mapping = example_usage()

    preview_columns = ['Business Service', 'New Services']
    print("\nSample of final results:")
    print(df_result[preview_columns].head(10))

# In [None]:
# second
# IT Incident Description Clustering and Categorization
# This notebook processes IT incident descriptions to create standardized "How broke?" categories

import pandas as pd
import numpy as np
import json
import re
from typing import List, Dict, Tuple
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# For clustering (install if needed: pip install scikit-learn sentence-transformers)
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")

# =============================================================================
# STEP 1: DATA LOADING AND EXPLORATION
# =============================================================================

def load_and_explore_data(file_path=None, sample_data=True):
    """
    Return the incident DataFrame and print a short overview of it.

    With ``sample_data`` true (the default) a synthetic 1000-row frame is
    generated: 20 fixed descriptions repeated 50 times, with service reps and
    close notes drawn after seeding NumPy's global RNG with 42. Otherwise the
    frame is read from ``file_path`` with ``pd.read_csv``.

    The overview lists the row count, the column names and the first three
    values of the 'description' column.
    """
    if not sample_data:
        df = pd.read_csv(file_path)
    else:
        # Fixed seed so the demo data is reproducible between runs
        np.random.seed(42)
        base_descriptions = [
            "User cannot access email, Outlook keeps crashing when opening",
            "Blue screen of death on Windows 10 machine, memory error",
            "Network connection dropped, cannot connect to WiFi",
            "Printer not working, paper jam error message displayed",
            "Application crashed unexpectedly, database connection timeout",
            "Hard drive making clicking noises, files corrupted",
            "Monitor displaying strange colors, graphics driver issue",
            "Keyboard keys not responding, spilled coffee on device",
            "Server down, 500 internal error on website",
            "VPN connection failed, authentication error",
            "Excel file won't open, file format corrupted",
            "Phone not charging, battery seems dead",
            "Software installation failed, permission denied error",
            "Internet browser crashing, memory leak suspected",
            "Microphone not working during video call",
            "Password reset not working, system locked account",
            "Database query running slow, table lock timeout",
            "Mobile app keeps freezing, needs force restart",
            "Backup failed, disk space insufficient",
            "Login page not loading, SSL certificate expired"
        ]
        sample_descriptions = base_descriptions * 50  # 20 x 50 = 1000 rows
        row_count = len(sample_descriptions)

        ticket_ids = [f'INC{1000 + i}' for i in range(row_count)]
        # Draw reps before close-note actions: both use the seeded global RNG,
        # so the order of the two draws determines the generated values.
        reps = np.random.choice(
            ['John Smith', 'Jane Doe', 'Mike Johnson', 'Sarah Wilson'], row_count
        )
        actions = np.random.choice([
            'restarting service', 'updating drivers', 'replacing hardware', 
            'clearing cache', 'reinstalling software'
        ], row_count)

        df = pd.DataFrame({
            'Ticket': ticket_ids,
            'Service Rep': reps,
            'description': sample_descriptions,
            'close_notes': ['Resolved by ' + action for action in actions]
        })

    print(f"Dataset loaded: {len(df)} incidents")
    print(f"Columns: {list(df.columns)}")
    print("\nSample descriptions:")
    for number, text in enumerate(df['description'].head(3), start=1):
        print(f"{number}. {text}")

    return df

# Load data. Called with the defaults, so this builds the synthetic sample
# frame rather than reading a file.
# NOTE: this rebinds the module-level name `df` that the first cell assigned
# to the business-service sample frame.
df = load_and_explore_data()

# =============================================================================
# STEP 2: TEXT CLEANING FUNCTIONS USING LLM
# =============================================================================

def clean_description_with_llm(description: str, chat_function) -> str:
    """
    Use LLM to extract only technical failure information from description.

    Args:
        description: Raw incident description.
        chat_function: Callable taking the prompt and returning the reply text.

    Returns:
        The cleaned description, or 'Unknown' when the reply is empty, is
        longer than 80 characters, contains the word "meaningless", or when
        the LLM call raises (the error is printed in that case).
    """
    prompt = f"""
You are helping to clean IT incident descriptions. Extract ONLY the technical information about what failed or broke.

ORIGINAL DESCRIPTION:
{description}

INSTRUCTIONS:
1. Keep only technical terms describing the failure/problem
2. Remove user names, timestamps, non-technical details, politeness words
3. Keep hardware/software names, error messages, technical symptoms
4. If the description has no meaningful technical content, return "meaningless"
5. Use maximum 15 words
6. Focus on WHAT broke, not WHO reported it or WHEN

EXAMPLES:
Input: "Hi John, my computer won't start this morning, can you help?"
Output: "computer won't start"

Input: "Dear IT team, I hope you're well. The printer in room 205 has a paper jam again."
Output: "printer paper jam"

Input: "Hello, this is weird but my screen is blue and shows memory error"
Output: "blue screen memory error"

Input: "Please help me ASAP!!! This is urgent!!!"
Output: "meaningless"

Return only the cleaned technical description, no explanations:
"""

    try:
        response = chat_function(prompt).strip()
    except Exception as e:
        # chat_function is caller-supplied; any failure degrades to 'Unknown'.
        print(f"Error cleaning description: {e}")
        return 'Unknown'

    # The examples in the prompt show quoted outputs, so models tend to echo
    # the quotes. Drop one matching pair so they don't pollute later grouping.
    if len(response) >= 2 and response[0] == response[-1] and response[0] in ('"', "'"):
        response = response[1:-1].strip()

    # Handle edge cases: empty, over-long, or explicitly meaningless replies
    if not response or len(response) > 80 or 'meaningless' in response.lower():
        return 'Unknown'
    return response

def batch_clean_descriptions(df: pd.DataFrame, chat_function, 
                           batch_size: int = 10, sample_size: int = None,
                           llm_limit: int = 20) -> pd.DataFrame:
    """
    Add a 'cleaned_description' column to a copy of ``df``.

    The first ``llm_limit`` rows are cleaned with the LLM; the remaining rows
    use the cheaper pattern-based cleaner. The input frame is not modified.

    Args:
        df: DataFrame with a 'description' column.
        chat_function: Callable taking the prompt and returning the reply text.
        batch_size: Currently unused; kept so existing callers keep working.
        sample_size: If truthy, only the first ``sample_size`` rows are
            processed (None or 0 means all rows).
        llm_limit: Number of leading rows cleaned with the LLM. Defaults to
            20, the previously hard-coded demo value. Pass None to clean every
            row with the LLM.

    Returns:
        The working copy with the added 'cleaned_description' column.
    """
    df_work = df.copy()

    if sample_size:
        df_work = df_work.head(sample_size)

    total = len(df_work)
    print(f"Cleaning {total} descriptions...")

    cleaned_descriptions = []
    for i, desc in enumerate(df_work['description']):
        if llm_limit is None or i < llm_limit:
            cleaned_descriptions.append(clean_description_with_llm(desc, chat_function))
            # Progress output on every fifth LLM-cleaned row
            if i % 5 == 0:
                print(f"Cleaned {i+1}/{total}")
        else:
            # Beyond the LLM budget: use pattern-based cleaning for the rest
            cleaned_descriptions.append(pattern_based_cleaning(desc))

    df_work['cleaned_description'] = cleaned_descriptions
    print("Description cleaning completed!")

    return df_work

def pattern_based_cleaning(description: str) -> str:
    """
    Keyword-based fallback cleaner that needs no LLM call.

    The text is lower-cased, greeting/time/location/person phrases are
    removed, and only whitespace-separated tokens containing a technical
    keyword as a substring are kept. At most the first five such tokens are
    returned, joined by spaces; 'Unknown' is returned when none remain.
    """
    # Phrases stripped before keyword extraction (applied in this order)
    noise_patterns = (
        r'\b(hi|hello|dear|please|help|urgent|asap|thanks?|regards?)\b',
        r'\b(morning|afternoon|evening|today|yesterday|tomorrow)\b',
        r'\b(room \d+|floor \d+|building \w+)\b',
        r'\b(user|employee|staff|person) \w+\b',
    )

    technical_keywords = (
        'error', 'crash', 'fail', 'down', 'slow', 'freeze', 'hang', 'timeout',
        'connection', 'network', 'wifi', 'internet', 'server', 'database',
        'printer', 'scanner', 'monitor', 'keyboard', 'mouse', 'laptop', 'desktop',
        'windows', 'office', 'outlook', 'excel', 'word', 'browser', 'chrome',
        'memory', 'disk', 'hardware', 'software', 'driver', 'update', 'install',
        'login', 'password', 'access', 'permission', 'denied', 'expired',
    )

    def looks_technical(token):
        # Substring match, so inflected forms such as "crashing" still count
        for keyword in technical_keywords:
            if keyword in token:
                return True
        return False

    lowered = description.lower()
    for noise in noise_patterns:
        lowered = re.sub(noise, '', lowered)

    kept = list(filter(looks_technical, lowered.split()))
    if not kept:
        return 'Unknown'

    # Keep only first 5 meaningful words
    return ' '.join(kept[:5])

# =============================================================================
# STEP 3: MOCK CHAT FUNCTION FOR DEMONSTRATION
# =============================================================================

def mock_chat_function(prompt: str) -> str:
    """
    Mock chat function for demonstration. Replace with your actual chat function.

    Answers by keyword matching. When the prompt has the cleaning-prompt
    layout ("ORIGINAL DESCRIPTION: ... INSTRUCTIONS:"), only the embedded
    description is inspected. The instruction text itself contains words such
    as "blue screen", "memory" and "printer", so matching on the whole prompt
    would give the same answer for every description. Prompts without those
    markers are matched as a whole.
    """
    # Isolate the description so the prompt's own examples cannot match
    embedded = re.search(
        r'ORIGINAL DESCRIPTION:\s*(.*?)\s*INSTRUCTIONS:', prompt, re.DOTALL
    )
    text = (embedded.group(1) if embedded else prompt).lower()

    # Simple pattern matching for demo purposes (first matching rule wins)
    if 'blue screen' in text and 'memory' in text:
        return 'blue screen memory error'
    elif 'printer' in text and 'jam' in text:
        return 'printer paper jam'
    elif 'email' in text and 'crash' in text:
        return 'email application crash'
    elif 'network' in text or 'wifi' in text:
        return 'network connection issue'
    elif 'server' in text and 'down' in text:
        return 'server downtime'
    elif 'password' in text:
        return 'authentication failure'
    elif 'install' in text and 'fail' in text:
        return 'software installation failure'
    elif 'urgent' in text and 'help' in text and 'please' in text:
        return 'meaningless'
    else:
        # Fall back to the first generic failure word that appears
        for word in ['crash', 'error', 'fail', 'slow', 'freeze', 'timeout', 'down']:
            if word in text:
                return f"system {word}"
        return 'hardware malfunction'

# Test the cleaning function
# Test the cleaning function: smoke test that runs at module/cell execution
# time and prints each raw description next to its cleaned form.
print("\n" + "="*60)
print("TESTING DESCRIPTION CLEANING")
print("="*60)

# Hand-written inputs: three descriptions with technical content wrapped in
# greetings/urgency, and one with no technical content at all.
sample_descriptions = [
    "Hi there, my printer won't work and shows paper jam error",
    "Urgent! Computer blue screen with memory error message!",
    "Hello IT team, hope you're well, just wanted to report network issue",
    "Please help ASAP!!! This is very urgent!!!"
]

# Uses the mock chat function defined above, so no real LLM is contacted.
for desc in sample_descriptions:
    cleaned = clean_description_with_llm(desc, mock_chat_function)
    print(f"Original: {desc}")
    print(f"Cleaned:  {cleaned}")
    print()

# =============================================================================
# STEP 4: CLUSTERING FUNCTIONS
# =============================================================================

def create_failure_categories_with_llm(cleaned_descriptions: List[str], 
                                     chat_function, target_clusters: int = 100) -> Dict[str, List[str]]:
    """
    Use LLM to create failure categories from cleaned descriptions.

    Unique descriptions (excluding 'Unknown') are sent to the LLM in batches
    of 20. Categories returned for different batches are merged by name, so a
    category that appears in several batches accumulates all its descriptions
    instead of being overwritten by the last batch.

    Args:
        cleaned_descriptions: Cleaned descriptions; duplicates are collapsed.
        chat_function: Callable taking the prompt and returning the reply text.
        target_clusters: Currently unused; kept so existing callers keep working.

    Returns:
        Mapping of category name -> list of descriptions. For a batch whose
        LLM call or reply parsing fails, the error is printed and every
        description of that batch becomes its own title-cased category.
    """
    # Unique descriptions in first-seen order
    unique_descriptions = [
        desc for desc in dict.fromkeys(cleaned_descriptions) if desc != 'Unknown'
    ]

    # Create batches for LLM processing
    batch_size = 20
    all_categories = {}

    def add_to_category(name, members):
        # Merge into any existing category of the same name, without duplicates
        bucket = all_categories.setdefault(name, [])
        for member in members:
            if member not in bucket:
                bucket.append(member)

    for i in range(0, len(unique_descriptions), batch_size):
        batch = unique_descriptions[i:i+batch_size]
        batch_listing = "\n".join([f"- {desc}" for desc in batch])

        prompt = f"""
You are categorizing IT incident types. Group similar technical failures together and create category names.

FAILURE DESCRIPTIONS TO CATEGORIZE:
{batch_listing}

INSTRUCTIONS:
1. Group similar technical failures together
2. Create concise category names (max 5 words)
3. Category names should be technical and descriptive
4. Examples of good category names:
   - "Email Application Crash"
   - "Network Connectivity Failure" 
   - "Hardware Driver Issue"
   - "Database Connection Timeout"
   - "Authentication System Error"

5. Return JSON format:
{{
    "Category Name 1": ["description1", "description2"],
    "Category Name 2": ["description3"],
    ...
}}

Focus on the technical root cause, not symptoms. Return only valid JSON:
"""

        try:
            response = chat_function(prompt)
            # Models often wrap the JSON in prose or ``` fences, so parse only
            # the span from the first '{' to the last '}'.
            start = response.find('{')
            end = response.rfind('}')
            if start == -1 or end <= start:
                raise ValueError("no JSON object found in LLM response")
            parsed = json.loads(response[start:end + 1])
            # Normalise fully before merging so a malformed batch falls back
            # as a whole instead of being half-applied.
            batch_categories = {
                name: [members] if isinstance(members, str) else list(members)
                for name, members in parsed.items()
            }
        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            # Fallback: create individual categories
            for desc in batch:
                add_to_category(desc.title(), [desc])
            continue

        for name, members in batch_categories.items():
            add_to_category(name, members)

    return all_categories

def traditional_clustering_approach(cleaned_descriptions: List[str],
                                  n_clusters: int = 100) -> Tuple[Dict[str, List[str]], np.ndarray]:
    """
    Alternative clustering using TF-IDF and K-means

    Args:
        cleaned_descriptions: Cleaned failure descriptions; 'Unknown' entries
            are ignored.
        n_clusters: Requested number of clusters; reduced to the number of
            distinct descriptions when fewer are available.

    Returns:
        A tuple ``(categories, cluster_labels)``. ``categories`` maps a name
        built from the three most frequent words of a cluster to the distinct
        descriptions in it (clusters that produce the same name are merged).
        ``cluster_labels`` holds one label per non-'Unknown' description, in
        input order. With no usable descriptions the result is
        ``({}, empty array)``.
    """
    # Filter out 'Unknown' values
    valid_descriptions = [desc for desc in cleaned_descriptions if desc != 'Unknown']

    # Nothing to vectorize or cluster: return an empty result instead of
    # letting the vectorizer / K-means fail on empty input.
    if not valid_descriptions:
        return {}, np.array([], dtype=int)

    n_clusters = min(n_clusters, len(set(valid_descriptions)))

    # Vectorize the descriptions
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
    X = vectorizer.fit_transform(valid_descriptions)

    # Perform clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X)

    # Group descriptions by label in one pass instead of rescanning per cluster
    members = defaultdict(list)
    for label, desc in zip(cluster_labels, valid_descriptions):
        members[int(label)].append(desc)

    # Create categories from clusters. Inner dicts act as insertion-ordered
    # sets so output order is deterministic and name collisions merge rather
    # than overwrite.
    merged: Dict[str, Dict[str, None]] = {}
    for label in sorted(members):
        cluster_descriptions = members[label]

        # Create category name from most common words in cluster
        all_words = ' '.join(cluster_descriptions).split()
        top_words = [word for word, _ in Counter(all_words).most_common(3)]
        category_name = ' '.join(top_words).title()

        merged.setdefault(category_name, {}).update(dict.fromkeys(cluster_descriptions))

    categories = {name: list(descs) for name, descs in merged.items()}
    return categories, cluster_labels

# =============================================================================
# STEP 5: CATEGORY ASSIGNMENT AND FINAL PROCESSING
# =============================================================================

def assign_categories_to_incidents(df: pd.DataFrame, categories: Dict[str, List[str]],
                                 chat_function) -> pd.DataFrame:
    """
    Assign category names to each incident based on cleaned descriptions

    Args:
        df: Incidents with a 'cleaned_description' column. Not modified.
        categories: Mapping of category name -> descriptions in that category.
        chat_function: LLM callable forwarded to find_best_category_match for
            descriptions that are not listed in ``categories``.

    Returns:
        A copy of ``df`` with a 'How broke?' column. Descriptions listed in
        ``categories`` get their category; 'Unknown' and missing values get
        'Unknown'; anything else is resolved via find_best_category_match.
    """
    df_result = df.copy()

    # Create reverse mapping: description -> category
    desc_to_category = {}
    for category_name, descriptions in categories.items():
        for desc in descriptions:
            desc_to_category[desc] = category_name

    # Remember LLM answers so each distinct unmatched description triggers
    # at most one LLM call, however many incidents share it.
    llm_matches = {}

    # Assign categories
    how_broke_values = []

    for cleaned_desc in df_result['cleaned_description']:
        if cleaned_desc in desc_to_category:
            how_broke_values.append(desc_to_category[cleaned_desc])
        elif pd.isna(cleaned_desc) or cleaned_desc == 'Unknown':
            # Missing values (None/NaN/pd.NA) carry no text to classify; the
            # isna check comes first because pd.NA cannot be truth-tested.
            how_broke_values.append('Unknown')
        else:
            if cleaned_desc not in llm_matches:
                # Use LLM to find best matching category
                llm_matches[cleaned_desc] = find_best_category_match(
                    cleaned_desc, categories, chat_function
                )
            how_broke_values.append(llm_matches[cleaned_desc])

    df_result['How broke?'] = how_broke_values
    return df_result

def find_best_category_match(description: str, categories: Dict[str, List[str]],
                           chat_function, max_categories: int = 20) -> str:
    """
    Find the best matching category for a description using LLM

    Args:
        description: Cleaned technical description to classify.
        categories: Mapping of category name -> descriptions; only the names
            are used.
        chat_function: Callable taking a prompt string and returning the LLM's
            text response.
        max_categories: How many category names (in dict order) are offered
            to the LLM, to keep the prompt small.

    Returns:
        One of the offered category names, or 'Unknown' when the LLM call
        fails or its answer does not correspond to an offered name.
    """
    category_names = list(categories.keys())[:max_categories]  # Limit for prompt size

    prompt = f"""
Find the best matching category for this technical description.

DESCRIPTION: {description}

AVAILABLE CATEGORIES:
{chr(10).join([f"- {cat}" for cat in category_names])}

Return only the category name that best matches, or "Unknown" if none fit well:
"""

    try:
        response = chat_function(prompt)
    except Exception:
        # Best effort: a failed LLM call leaves the incident uncategorized.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return 'Unknown'

    if not isinstance(response, str):
        return 'Unknown'

    answer = response.strip()
    if answer in category_names:
        return answer

    # Tolerate cosmetic differences in the reply: a copied "- " bullet,
    # surrounding quotes/backticks, and letter case.
    if answer.startswith('- '):
        answer = answer[2:]
    normalized = answer.strip().strip('"\'`').strip().casefold()
    for name in category_names:
        if name.strip().casefold() == normalized:
            return name

    return 'Unknown'

# =============================================================================
# STEP 6: MAIN PROCESSING PIPELINE
# =============================================================================

def process_incident_descriptions(df: pd.DataFrame, chat_function,
                                target_clusters: int = 100,
                                use_llm_clustering: bool = True) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """
    Main pipeline to process incident descriptions and create 'How broke?' field

    Args:
        df: Raw incident data, passed to batch_clean_descriptions.
        chat_function: LLM callable shared by the cleaning, categorization
            and assignment steps.
        target_clusters: Cluster count forwarded to the chosen categorization
            step.
        use_llm_clustering: True to build categories with
            create_failure_categories_with_llm, False to use
            traditional_clustering_approach.

    Returns:
        A tuple ``(df_final, categories)``: the DataFrame returned by
        assign_categories_to_incidents and the category -> descriptions
        mapping that was used for the assignment.
    """
    print("="*60)
    print("STARTING INCIDENT DESCRIPTION PROCESSING PIPELINE")
    print("="*60)

    # Step 1: Clean descriptions
    print("\n1. Cleaning descriptions...")
    df_cleaned = batch_clean_descriptions(df, chat_function, sample_size=100)

    # Step 2: Create categories/clusters
    print("\n2. Creating failure categories...")
    cleaned_descriptions = df_cleaned['cleaned_description'].tolist()
    if use_llm_clustering:
        categories = create_failure_categories_with_llm(
            cleaned_descriptions,
            chat_function,
            target_clusters
        )
    else:
        # The per-description cluster labels are not needed downstream
        categories, _ = traditional_clustering_approach(
            cleaned_descriptions,
            target_clusters
        )

    print(f"Created {len(categories)} categories")

    # Step 3: Assign categories to incidents
    print("\n3. Assigning categories to incidents...")
    df_final = assign_categories_to_incidents(df_cleaned, categories, chat_function)

    print("\n4. Processing complete!")
    return df_final, categories

# =============================================================================
# STEP 7: ANALYSIS AND VISUALIZATION
# =============================================================================

def analyze_results(df: pd.DataFrame, categories: Dict[str, List[str]]):
    """
    Analyze and visualize the categorization results

    Prints summary statistics for the 'How broke?' column and, when there is
    something to plot, shows a figure with the top-15 category distribution
    (pie and bar) plus incidents per service rep if a 'Service Rep' column is
    present.

    Args:
        df: Categorized incidents; must contain a 'How broke?' column.
        categories: Category mapping; not used by this function, kept for
            interface compatibility.

    Returns:
        Series with the counts of the (up to) 10 most frequent categories.
    """
    print("="*60)
    print("ANALYSIS OF CATEGORIZATION RESULTS")
    print("="*60)

    how_broke = df['How broke?']

    # Basic statistics
    total_incidents = len(df)
    unique_categories = how_broke.nunique()
    unknown_count = (how_broke == 'Unknown').sum()
    # Avoid dividing by zero when the DataFrame is empty
    unknown_pct = unknown_count / total_incidents * 100 if total_incidents else 0.0

    print(f"Total incidents processed: {total_incidents}")
    print(f"Unique categories created: {unique_categories}")
    print(f"Unknown/uncategorized: {unknown_count} ({unknown_pct:.1f}%)")

    # Top categories
    category_counts = how_broke.value_counts()
    print("\nTop 10 most common failure categories:")
    top_categories = category_counts.head(10)
    for category, count in top_categories.items():
        print(f"  {category}: {count} incidents ({count/total_incidents*100:.1f}%)")

    # An empty frame (or one with only missing categories) has nothing to
    # draw, so skip the plotting section entirely.
    if category_counts.empty:
        print("\nNo categorized incidents to plot.")
        return top_categories

    # Visualization
    plt.figure(figsize=(15, 10))

    # Plot 1: Category distribution
    plt.subplot(2, 2, 1)
    top_15 = category_counts.head(15)
    plt.pie(top_15.values, labels=top_15.index, autopct='%1.1f%%')
    plt.title('Distribution of Top 15 Failure Categories')

    # Plot 2: Category frequency
    plt.subplot(2, 2, 2)
    top_15.plot(kind='bar')
    plt.title('Frequency of Top 15 Categories')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Number of Incidents')

    # Plot 3: Categories by service rep (only when that column is available)
    if 'Service Rep' in df.columns:
        rep_category = pd.crosstab(df['Service Rep'], df['How broke?'])
        rep_totals = rep_category.sum(axis=1)
        if not rep_totals.empty:
            plt.subplot(2, 2, 3)
            rep_totals.plot(kind='bar')
            plt.title('Incidents by Service Rep')
            plt.ylabel('Total Incidents')
            plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    return top_categories

# =============================================================================
# STEP 8: RUNNING THE COMPLETE PIPELINE
# =============================================================================

# Replace this with your actual chat function
def your_chat_function(prompt):
    """
    Placeholder LLM entry point that forwards the prompt to mock_chat_function.

    Swap the body for a call to your real LLM chat endpoint.
    """
    llm_reply = mock_chat_function(prompt)
    return llm_reply

# Run the complete pipeline
# NOTE: these are module-level statements, so they execute as soon as this
# part of the file runs.
print("Starting the complete incident categorization pipeline...")

# process_incident_descriptions returns the categorized DataFrame together
# with the category -> descriptions mapping it used.
df_final, categories = process_incident_descriptions(
    df=df, 
    chat_function=your_chat_function,  # Replace with your actual function
    target_clusters=50,  # Reduced for demo
    use_llm_clustering=True
)

# Analyze results
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)

# Preview the first 10 rows of the ticket, raw/cleaned description and
# assigned category columns.
print("\nSample of final dataset:")
print(df_final[['Ticket', 'description', 'cleaned_description', 'How broke?']].head(10))

# Show up to five categories, each with at most three example descriptions.
print(f"\nTotal categories created: {len(categories)}")
print("\nSample categories:")
for i, (cat_name, descriptions) in enumerate(list(categories.items())[:5]):
    print(f"{i+1}. {cat_name}")
    print(f"   Includes: {descriptions[:3]}...")

# Run analysis
# Prints summary statistics, shows the plots, and keeps the top-category
# counts returned by analyze_results.
top_categories = analyze_results(df_final, categories)

# =============================================================================
# STEP 9: EXPORT FUNCTIONS
# =============================================================================

def export_results(df: pd.DataFrame, categories: Dict[str, List[str]],
                  output_path: str = 'incident_categorization_results'):
    """
    Export results to various formats

    Writes three files that share the ``output_path`` prefix:
    ``<output_path>_data.csv`` (the full DataFrame),
    ``<output_path>_categories.json`` (the category mapping) and
    ``<output_path>_summary.json`` (headline counts).

    Args:
        df: Categorized incidents; must contain a 'How broke?' column.
        categories: Mapping of category name -> descriptions.
        output_path: Path prefix for the generated files.
    """
    # Export main dataset
    df.to_csv(f'{output_path}_data.csv', index=False)

    # Export categories
    with open(f'{output_path}_categories.json', 'w', encoding='utf-8') as f:
        json.dump(categories, f, indent=2)

    # Export summary
    how_broke = df['How broke?']
    top_counts = how_broke.value_counts().head(10)
    # pandas/NumPy reductions return NumPy scalars (e.g. numpy.int64), which
    # the json module cannot serialize; convert them to plain ints first.
    summary = {
        'total_incidents': int(len(df)),
        'unique_categories': int(how_broke.nunique()),
        'unknown_count': int((how_broke == 'Unknown').sum()),
        'top_categories': {category: int(count) for category, count in top_counts.items()},
    }

    with open(f'{output_path}_summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"Results exported to {output_path}_*.csv/json")

# Export the results
# Uses export_results' default output_path prefix, so the CSV/JSON files are
# written relative to the current working directory.
export_results(df_final, categories)

# Closing banner followed by instructions for adapting the demo to real data.
print("\n" + "="*60)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print("="*60)
print("\nTo use with your actual data:")
print("1. Replace 'your_chat_function' with your actual LLM endpoint")
print("2. Load your CSV file instead of using sample data")
print("3. Adjust target_clusters parameter as needed")
print("4. Run the pipeline and analyze results")