# In [None]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
import re
import json

# Demo dataset: twenty tickets, each tagged with a free-text business service
# name. The names intentionally contain near-duplicates to standardize.
data = {}
data['Ticket'] = range(1, 21)
data['Business Service'] = [
    'Microsoft Outlook', 'Microsoft Windows', 'Laptop', 'Python',
    'voice log', 'crm service', 'cwhh vdi', 'vantage agent portal',
    'Microsoft Excel', 'MS Excel', 'Excel 365', 'outlook email',
    'Outlook client', 'Windows 10', 'Microsoft Windows 11',
    'laptop hardware', 'laptop support', 'python development',
    'Python scripting', 'CRM system',
]
df = pd.DataFrame(data=data)

# Method 1: LLM-based approach
def llm_based_standardization(df, chat_function):
    """
    Use LLM to group similar services and standardize them

    Args:
        df: DataFrame with a 'Business Service' column. It is modified in
            place: a 'New Services' column is added.
        chat_function: Callable that receives the prompt string and returns
            the LLM reply as text.

    Returns:
        Tuple of (df, service_mapping), where service_mapping maps each
        original service name to its standardized name. If the LLM call or
        the parsing of its reply fails, 'New Services' is a copy of
        'Business Service' and the mapping is empty.
    """
    # Get unique services and their counts
    service_counts = df['Business Service'].value_counts()
    
    # Create prompt for LLM to group similar services
    prompt = f"""
    I have a list of business services that need to be standardized. Please group similar services together and for each group, suggest the most representative name (preferably the one that appears most frequently).

    Services with their counts:
    {dict(service_counts)}

    Please return a JSON object where:
    - Keys are the standardized service names
    - Values are lists of all variations that should map to that standardized name

    Example format:
    {{
        "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
        "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"]
    }}

    Focus on grouping services that refer to the same underlying technology or service.
    """
    
    try:
        # Call the LLM function
        response = chat_function(prompt)
        
        # LLM replies are often wrapped in prose or markdown code fences, so
        # parse only the span from the first "{" to the last "}" when present.
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        payload = json_match.group() if json_match else response
        standardization_map = json.loads(payload)
        
        # Create a mapping from original to standardized
        service_mapping = {}
        for standard_name, variations in standardization_map.items():
            # A bare string would otherwise be iterated character by character
            if isinstance(variations, str):
                variations = [variations]
            for variation in variations:
                service_mapping[variation] = standard_name
        
    except Exception as e:
        # Boundary handler: chat_function may raise anything. Keep the output
        # schema stable by still providing the 'New Services' column.
        print(f"Error in LLM processing: {e}")
        df['New Services'] = df['Business Service']
        return df, {}
    
    # Apply mapping to create new column; unmapped names are kept unchanged
    df['New Services'] = df['Business Service'].map(
        lambda x: service_mapping.get(x, x)
    )
    
    return df, service_mapping

# Method 2: Traditional NLP approach
def traditional_nlp_standardization(df, similarity_threshold=80):
    """
    Use fuzzy matching and pattern recognition to standardize services

    Services are grouped greedily: each not-yet-grouped service (visited in
    value_counts order) seeds a group and absorbs every remaining service
    that is similar enough to it. Within a group, the name with the highest
    count becomes the standard name.

    Args:
        df: DataFrame with a 'Business Service' column. It is modified in
            place: a 'New Services' column is added.
        similarity_threshold: Minimum fuzzy score for two names to be
            treated as the same service.

    Returns:
        Tuple of (df, service_mapping, groups). service_mapping maps every
        grouped service to its standard name; groups is the list of groups
        with more than one member.
    """
    # Get service counts
    service_counts = df['Business Service'].value_counts()
    unique_services = list(service_counts.index)
    
    # Preprocessing function
    def preprocess_service(service):
        # Convert to lowercase, remove extra spaces, common prefixes/suffixes
        service = service.lower().strip()
        service = re.sub(r'\bmicrosoft\b|\bms\b', 'microsoft', service)
        service = re.sub(r'\b365\b|\boffice\b', '', service).strip()
        service = re.sub(r'\s+', ' ', service)
        return service
    
    # Normalize every name once up front instead of once per pairwise comparison
    normalized = {name: preprocess_service(name) for name in unique_services}
    lowered = {name: name.lower() for name in unique_services}
    word_sets = {name: set(normalized[name].split()) for name in unique_services}
    
    # Create groups of similar services
    groups = []
    used_services = set()
    
    for service in unique_services:
        if service in used_services:
            continue
            
        # Find similar services
        similar_services = [service]
        used_services.add(service)
        
        for other_service in unique_services:
            if other_service in used_services:
                continue
                
            # Check similarity using multiple methods and keep the best score
            max_similarity = max(
                fuzz.ratio(normalized[service], normalized[other_service]),
                fuzz.partial_ratio(lowered[service], lowered[other_service]),
                fuzz.token_sort_ratio(lowered[service], lowered[other_service]),
                fuzz.token_set_ratio(lowered[service], lowered[other_service]),
            )
            
            # Also check for keyword overlap. Names such as "365" or "Office"
            # preprocess to an empty string, so guard the zero denominator.
            service_words = word_sets[service]
            other_words = word_sets[other_service]
            longest = max(len(service_words), len(other_words))
            word_overlap = len(service_words & other_words) / longest if longest else 0.0
            
            if max_similarity >= similarity_threshold or word_overlap >= 0.6:
                similar_services.append(other_service)
                used_services.add(other_service)
        
        if len(similar_services) > 1:
            groups.append(similar_services)
    
    # For each group, find the service with highest count
    service_mapping = {}
    for group in groups:
        standard_service = max(group, key=lambda name: service_counts[name])
        
        # Map all services in group to the standard one
        for service in group:
            service_mapping[service] = standard_service
    
    # Apply mapping; services outside any group keep their original name
    df['New Services'] = df['Business Service'].map(
        lambda x: service_mapping.get(x, x)
    )
    
    return df, service_mapping, groups

# Method 3: Hybrid approach
def hybrid_standardization(df, chat_function, similarity_threshold=75):
    """
    Combine traditional NLP for initial grouping and LLM for validation/refinement

    Args:
        df: DataFrame with a 'Business Service' column. When the LLM
            refinement succeeds, a 'New Services' column is added to this
            frame in place.
        chat_function: Callable that receives the prompt string and returns
            the LLM reply as text.
        similarity_threshold: Fuzzy-score threshold passed to the initial
            traditional grouping.

    Returns:
        Tuple of (dataframe, mapping). On successful refinement these are
        the input frame and the LLM-derived mapping. If no groups were found
        or the LLM step fails, they are the copy produced by the traditional
        method and its mapping.
    """
    # Step 1: Use traditional NLP for initial grouping
    df_temp, initial_mapping, groups = traditional_nlp_standardization(
        df.copy(), similarity_threshold
    )
    
    # Step 2: Use LLM to validate and refine the groups
    if groups:
        prompt = f"""
        I've used fuzzy matching to group similar business services. Please review these groups and suggest improvements:

        Groups found:
        {json.dumps(groups, indent=2)}

        Service counts:
        {dict(df['Business Service'].value_counts())}

        Please return a JSON object with refined groupings where:
        - Keys are the best representative names for each service category
        - Values are lists of all variations that should map to that name
        
        Validate that the groupings make sense semantically.
        """
        
        try:
            response = chat_function(prompt)
            
            # LLM replies are often wrapped in prose or markdown code fences,
            # so parse only the first "{" to last "}" span when present.
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            payload = json_match.group() if json_match else response
            refined_mapping = json.loads(payload)
            
            # Create final mapping
            final_service_mapping = {}
            for standard_name, variations in refined_mapping.items():
                # A bare string would otherwise be iterated character by character
                if isinstance(variations, str):
                    variations = [variations]
                for variation in variations:
                    final_service_mapping[variation] = standard_name
            
            df['New Services'] = df['Business Service'].map(
                lambda x: final_service_mapping.get(x, x)
            )
            
            return df, final_service_mapping
            
        except Exception as e:
            # Boundary handler: fall back to the fuzzy-matching result
            print(f"Error in LLM refinement: {e}")
            return df_temp, initial_mapping
    
    return df_temp, initial_mapping

# Example usage and comparison
def compare_methods(df):
    """
    Print a before/after report for the traditional (fuzzy) method.

    The method runs on a copy of ``df``. Returns the standardized copy and
    the service mapping it produced.
    """
    print("Original services:")
    print(df['Business Service'].value_counts())
    print("\n{}\n".format("=" * 50))

    # Run the fuzzy-matching approach on a copy so the input stays untouched
    standardized, service_map, found_groups = traditional_nlp_standardization(df.copy())

    print("Traditional NLP Results:")
    print("Groups found:", found_groups)
    print("Standardized services:")
    print(standardized['New Services'].value_counts())

    before = df['Business Service'].nunique()
    after = standardized['New Services'].nunique()
    print(f"Reduced from {before} to {after} unique services")

    return standardized, service_map

# Mock chat function for testing (replace with your actual LLM endpoint)
def mock_chat_function(prompt):
    """
    Stand-in for a real LLM call: returns one fixed JSON grouping.

    The prompt is ignored. Replace this with your actual chat function.
    """
    # Canned reply; a real LLM would derive this from the prompt
    return '''
    {
        "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
        "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"],
        "Microsoft Windows": ["Microsoft Windows", "Windows 10", "Microsoft Windows 11"],
        "Laptop": ["Laptop", "laptop hardware", "laptop support"],
        "Python": ["Python", "python development", "Python scripting"],
        "CRM Service": ["crm service", "CRM system"],
        "Voice Log": ["voice log"],
        "CWHH VDI": ["cwhh vdi"],
        "Vantage Agent Portal": ["vantage agent portal"]
    }
    '''

# Run comparison
if __name__ == "__main__":
    # Run the fuzzy-matching pipeline on the sample data and report on it
    df_result, mapping = compare_methods(df)

    print("\n{}\n".format("=" * 50))
    print("Final DataFrame with standardized services:")
    print(df_result.loc[:, ['Business Service', 'New Services']].head(10))

    # Test LLM method (uncomment when you have actual chat function)
    # df_llm, mapping_llm = llm_based_standardization(df.copy(), your_chat_function)
    # print("LLM Results:", df_llm['New Services'].value_counts())

# In [None]:
import pandas as pd
import json
import re
from typing import Dict, Tuple, List

def llm_service_standardization(df: pd.DataFrame, chat_function, 
                              service_column: str = 'Business Service',
                              new_column: str = 'New Services') -> Tuple[pd.DataFrame, Dict]:
    """
    Run the full LLM standardization pipeline on one column.

    Steps: build the prompt, call the LLM, parse and validate its grouping,
    write the standardized names to ``new_column`` and print a summary.

    Args:
        df: DataFrame containing the service data
        chat_function: Callable taking the prompt and returning the LLM reply
        service_column: Name of column containing services to standardize
        new_column: Name of new column for standardized services

    Returns:
        Tuple of (updated_dataframe, service_mapping_dict). If any step
        fails, ``new_column`` is a copy of ``service_column`` and the
        mapping is empty.
    """
    counts = df[service_column].value_counts()
    prompt = create_standardization_prompt(counts)

    try:
        print("Calling LLM for service standardization...")
        reply = chat_function(prompt)
        print(f"LLM Response received: {len(reply)} characters")

        # Parse, then reconcile the grouping against the names actually present
        parsed = parse_llm_response(reply)
        mapping = validate_mapping(parsed, counts.index.tolist())

        df = apply_service_mapping(df, mapping, service_column, new_column)
        print_results(df, service_column, new_column, mapping)
    except Exception as e:
        print(f"Error in LLM standardization: {e}")
        # Fallback: copy original column
        df[new_column] = df[service_column]
        return df, {}

    return df, mapping

def create_standardization_prompt(service_counts) -> str:
    """Build the LLM prompt listing each service with its occurrence count.

    ``service_counts`` must provide ``.items()`` yielding (service, count)
    pairs, e.g. the result of ``Series.value_counts()``.
    """
    bullet_lines = (
        f"- {service}: {count} occurrences"
        for service, count in service_counts.items()
    )
    services_text = "\n".join(bullet_lines)

    return f"""
You are helping standardize business service names in a IT ticketing system. 

TASK: Group similar services together and choose the best representative name for each group.

SERVICES AND THEIR FREQUENCIES:
{services_text}

RULES:
1. Group services that refer to the same underlying technology/service
2. For each group, choose the name with the HIGHEST frequency as the standard
3. If frequencies are equal, choose the most descriptive/official name
4. Keep unrelated services separate
5. Be conservative - only group if you're confident they're the same service

EXAMPLES of what should be grouped:
- "Microsoft Excel", "MS Excel", "Excel 365" → all refer to Excel
- "Microsoft Outlook", "Outlook email", "Outlook client" → all refer to Outlook
- "Windows 10", "Microsoft Windows 11", "Microsoft Windows" → all refer to Windows OS

EXAMPLES of what should NOT be grouped:
- "Python" and "Java" → different programming languages
- "Laptop" and "Desktop" → different hardware types
- "CRM" and "ERP" → different software categories

OUTPUT FORMAT:
Return ONLY a valid JSON object with this exact structure:
{{
    "Standard Service Name 1": ["variation1", "variation2", "variation3"],
    "Standard Service Name 2": ["variation1", "variation2"],
    "Ungrouped Service": ["Ungrouped Service"]
}}

IMPORTANT: 
- Include ALL original services in your response
- Each service should appear exactly once
- Use the exact service names from the input list
- Return only the JSON, no additional text
"""

def parse_llm_response(response: str) -> Dict:
    """Parse LLM response and extract JSON mapping

    Handles replies wrapped in markdown code fences or surrounded by prose,
    including prose that itself contains stray braces.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        The first JSON object (dict) that can be decoded from the reply.

    Raises:
        ValueError: If the reply is not a non-empty string or contains no
            decodable JSON object.
    """
    if not isinstance(response, str) or not response.strip():
        raise ValueError("Could not parse valid JSON from LLM response")

    # Fast path: everything from the first "{" to the last "}". This covers
    # a bare object as well as one wrapped in code fences or plain prose.
    first_brace = response.find('{')
    last_brace = response.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        try:
            parsed = json.loads(response[first_brace:last_brace + 1])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # Slow path: the surrounding text contains braces of its own (e.g.
    # "Format is {name: list}. Result: {...}"). Try to decode an object
    # starting at each "{" in turn; raw_decode ignores trailing text.
    decoder = json.JSONDecoder()
    position = first_brace
    while position != -1:
        try:
            candidate, _ = decoder.raw_decode(response, position)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        position = response.find('{', position + 1)

    raise ValueError("Could not parse valid JSON from LLM response")

def validate_mapping(mapping: Dict, original_services: List[str]) -> Dict:
    """Validate and fix the LLM mapping

    Ensures every original service is covered and drops variations the LLM
    invented. The input ``mapping`` is not modified.

    Args:
        mapping: Standard name -> list of variations, as returned by the LLM.
            A bare string value is treated as a one-element list.
        original_services: Service names actually present in the data.

    Returns:
        A new dict of standard name -> list of variations that exist in
        ``original_services``. Groups left with no valid variation are
        dropped. Services the LLM omitted are added under their own name.
    """
    original_set = set(original_services)

    # Work on a copy so the caller's dict and lists are left untouched
    working = {}
    for standard_name, variations in mapping.items():
        if isinstance(variations, str):
            variations = [variations]
        working[standard_name] = list(variations)

    # Flatten all mapped services
    mapped_services = {v for variations in working.values() for v in variations}

    # Add missing services in input order (deterministic, unlike set order).
    # If the missing name is already a group key, join that group instead of
    # overwriting it, which would silently discard the group's variations.
    for service in dict.fromkeys(original_services):
        if service in mapped_services:
            continue
        working.setdefault(service, []).append(service)
        print(f"Added missing service: {service}")

    # Remove any services not in original list
    cleaned_mapping = {}
    for standard_name, variations in working.items():
        valid_variations = [v for v in variations if v in original_set]
        if valid_variations:
            cleaned_mapping[standard_name] = valid_variations

    return cleaned_mapping

def apply_service_mapping(df: pd.DataFrame, mapping: Dict, 
                         service_column: str, new_column: str) -> pd.DataFrame:
    """Write the standardized name for each row into ``new_column``.

    ``mapping`` is standard name -> list of variations. Rows whose service
    is in no list keep their original value. ``df`` is modified in place
    and also returned.
    """
    # Invert the grouping: variation -> standard name
    variation_to_standard = {
        variation: standard_name
        for standard_name, variations in mapping.items()
        for variation in variations
    }

    df[new_column] = df[service_column].map(variation_to_standard)

    # Rows the mapping did not cover come back as NaN (shouldn't happen with validation)
    not_mapped = df[new_column].isna()
    if not not_mapped.any():
        return df

    print(f"Warning: {not_mapped.sum()} services couldn't be mapped")
    df.loc[not_mapped, new_column] = df.loc[not_mapped, service_column]
    return df

def print_results(df: pd.DataFrame, original_col: str, new_col: str, mapping: Dict):
    """Print standardization results

    Args:
        df: DataFrame holding both the original and standardized columns.
        original_col: Name of the column with the original service names.
        new_col: Name of the column with the standardized names.
        mapping: Standard name -> list of variations. Only groups with more
            than one variation are listed.
    """
    
    print("\n" + "="*60)
    print("STANDARDIZATION RESULTS")
    print("="*60)
    
    original_unique = df[original_col].nunique()
    new_unique = df[new_col].nunique()
    reduction = original_unique - new_unique
    # An empty frame has zero unique services; avoid dividing by zero so the
    # report never aborts the caller's pipeline.
    reduction_pct = reduction / original_unique * 100 if original_unique else 0.0
    
    print(f"Original unique services: {original_unique}")
    print(f"Standardized unique services: {new_unique}")
    print(f"Reduction: {reduction} services ({reduction_pct:.1f}%)")
    
    print("\nGROUPINGS MADE:")
    for standard_name, variations in mapping.items():
        if len(variations) > 1:
            print(f"\n'{standard_name}' ← {variations}")
    
    print("\nFINAL SERVICE DISTRIBUTION:")
    print(df[new_col].value_counts().head(10))

# USAGE EXAMPLE WITH YOUR CHAT FUNCTION
def example_usage():
    """
    Run the LLM standardization pipeline end to end on sample tickets.

    A canned chat function stands in for a real LLM. Returns the
    (dataframe, mapping) pair produced by ``llm_service_standardization``.
    """

    # Replace this with your actual chat function
    def your_chat_function(prompt):
        """
        Placeholder LLM: returns the same JSON grouping for any prompt.
        """
        # A real implementation might be one of:
        #   return chat(prompt)
        #   return openai.chat.completions.create(...)
        #   return your_llm_api_call(prompt)

        # For now, using a mock response
        return '''
        {
            "Microsoft Excel": ["Microsoft Excel", "MS Excel", "Excel 365"],
            "Microsoft Outlook": ["Microsoft Outlook", "outlook email", "Outlook client"],
            "Microsoft Windows": ["Microsoft Windows", "Windows 10", "Microsoft Windows 11"],
            "Laptop": ["Laptop", "laptop hardware", "laptop support"],
            "Python": ["Python", "python development", "Python scripting"],
            "CRM service": ["crm service", "CRM system"],
            "voice log": ["voice log"],
            "cwhh vdi": ["cwhh vdi"],
            "vantage agent portal": ["vantage agent portal"]
        }
        '''

    # Sample tickets with deliberately inconsistent service names
    service_names = [
        'Microsoft Outlook', 'Microsoft Windows', 'Laptop', 'Python',
        'voice log', 'crm service', 'cwhh vdi', 'vantage agent portal',
        'Microsoft Excel', 'MS Excel', 'Excel 365', 'outlook email',
        'Outlook client', 'Windows 10', 'Microsoft Windows 11',
        'laptop hardware', 'laptop support', 'python development',
        'Python scripting', 'CRM system',
    ]
    df = pd.DataFrame({
        'Ticket': range(1, 21),
        'Business Service': service_names,
    })

    # Run the standardization
    standardized_df, service_mapping = llm_service_standardization(
        df=df,
        chat_function=your_chat_function,  # Replace with your actual function
        service_column='Business Service',
        new_column='New Services',
    )

    return standardized_df, service_mapping

# INTEGRATION PATTERNS FOR DIFFERENT LLM PROVIDERS

def openai_integration_example():
    """Example for OpenAI API

    Returns:
        A ``chat_with_openai(prompt)`` callable that can be passed as
        ``chat_function`` to the standardization helpers. Previously the
        inner function was defined but never returned, so the example could
        not be used.
    """
    import openai
    
    def chat_with_openai(prompt):
        """Send ``prompt`` as a single user message and return the reply text."""
        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1  # Low temperature for consistent results
        )
        return response.choices[0].message.content

    return chat_with_openai

def anthropic_integration_example():
    """Example for Anthropic Claude API

    Returns:
        A ``chat_with_claude(prompt)`` callable that can be passed as
        ``chat_function`` to the standardization helpers. Previously the
        inner function was defined but never returned, so the example could
        not be used.
    """
    import anthropic
    
    # Placeholder key: supply a real credential before use
    client = anthropic.Anthropic(api_key="your-api-key")
    
    def chat_with_claude(prompt):
        """Send ``prompt`` as a single user message and return the reply text."""
        response = client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=2000,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    return chat_with_claude

def custom_endpoint_example():
    """Example for custom API endpoint

    Returns:
        A ``chat_with_custom_api(prompt)`` callable that can be passed as
        ``chat_function`` to the standardization helpers. Previously the
        inner function was defined but never returned, so the example could
        not be used.
    """
    import requests
    
    def chat_with_custom_api(prompt):
        """POST ``prompt`` to the endpoint and return the "response" field.

        Raises ``requests.HTTPError`` on a non-2xx status and
        ``requests.Timeout`` if the server does not answer in time.
        """
        response = requests.post(
            "https://your-api-endpoint.com/chat",
            json={"prompt": prompt, "max_tokens": 2000},
            headers={"Authorization": "Bearer your-token"},
            timeout=60,  # requests waits indefinitely without an explicit timeout
        )
        # Surface HTTP errors instead of failing later on an unexpected body
        response.raise_for_status()
        return response.json()["response"]

    return chat_with_custom_api

# Run the example
if __name__ == "__main__":
    # Run the mock-backed example and show the first standardized rows
    df_result, mapping = example_usage()

    print("\nSample of final results:")
    print(df_result.loc[:, ['Business Service', 'New Services']].head(10))

# In [None]:
# LLM-Based Incident Notes Extraction
# Extract "How broke?" and "How resolved?" from Notes field using LLM

import pandas as pd
import numpy as np
import json
import re
from typing import Dict, Tuple, List
import warnings
warnings.filterwarnings('ignore')

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

print("Libraries imported successfully!")

# =============================================================================
# STEP 1: DATA LOADING AND EXPLORATION
# =============================================================================

def load_and_explore_data(file_path=None, sample_data=True):
    """
    Load the incident data and perform initial exploration

    Args:
        file_path: Path or file-like object passed to ``pd.read_csv``. Only
            used when ``sample_data`` is False.
        sample_data: When True, build 200 synthetic incidents instead of
            reading a file. This seeds NumPy's global random state with 42.

    Returns:
        DataFrame with a 'close_notes' column. A 'Notes' column in a loaded
        CSV is renamed to 'close_notes' when no 'close_notes' column exists.

    Raises:
        KeyError: If the data has neither a 'close_notes' nor a 'Notes' column.
    """
    if sample_data:
        # Create realistic sample data for demonstration
        np.random.seed(42)
        
        sample_notes = [
            "User reported Outlook crashes when opening large emails. Investigated and found PST file corruption. Rebuilt PST file and updated Outlook to latest version. Issue resolved.",
            "Server experiencing high CPU usage causing website timeouts. Identified memory leak in application code. Restarted services and applied hotfix patch. Performance back to normal.",
            "Printer showing paper jam error but no visible jam. Found sensor dirty from toner dust. Cleaned sensors and calibrated printer. Printing working correctly now.",
            "Network connection dropping randomly throughout office. Discovered faulty switch port. Replaced network switch and updated firmware. Connectivity stable.",
            "Blue screen errors on Windows 10 machine with memory fault codes. Ran diagnostics and found bad RAM stick. Replaced defective memory module. System stable.",
            "Excel files opening corrupted with missing data. Office installation was damaged. Uninstalled and reinstalled Office 365 suite. Files opening properly.",
            "VPN authentication failing for remote users. Certificate had expired on server. Renewed SSL certificates and updated VPN configuration. Remote access restored.",
            "Database queries running extremely slow during peak hours. Found missing indexes on large tables. Added proper indexes and optimized queries. Performance improved significantly.",
            "Phone system not receiving incoming calls. SIP trunk configuration was incorrect. Reconfigured trunk settings with carrier. Call routing working normally.",
            "Backup jobs failing with disk space errors. Log files consuming excessive storage. Cleaned old logs and increased partition size. Backups completing successfully.",
            "Email server rejecting messages with spam filter errors. Whitelist settings were too restrictive. Adjusted spam filter rules and retrained filters. Email flow restored.",
            "Laptop keyboard keys sticking and not responding. Coffee spill damage to internal components. Replaced keyboard assembly and cleaned logic board. Typing functional again.",
            "Website loading very slowly for all users. CDN cache was not updating properly. Purged CDN cache and fixed cache headers. Site speed back to normal.",
            "Scanner not detected by workstation. USB driver conflicts after Windows update. Uninstalled conflicting drivers and reinstalled scanner software. Scanning operational.",
            "Database connection timeouts in web application. Connection pool was exhausted during high traffic. Increased pool size and added connection retry logic. App responding normally.",
            "Monitor displaying purple tint and color distortion. Graphics driver was outdated and corrupted. Updated to latest display drivers and calibrated monitor. Colors accurate now.",
            "File server inaccessible with permission denied errors. Active Directory replication had failed. Forced AD sync and reset computer account. File access restored.",
            "Antivirus blocking legitimate software installation. False positive detection in security rules. Added software to exclusion list and updated virus definitions. Installation proceeded.",
            "Conference room camera not working for video calls. Camera driver was missing after system update. Reinstalled camera drivers and updated firmware. Video calls functional.",
            "POS system crashing during transaction processing. Database lock timeouts under load. Optimized database queries and increased timeout values. Transactions processing smoothly."
        ]
        
        # Extend to create more samples by cycling through the base notes
        extended_notes = []
        for i in range(200):
            note = sample_notes[i % len(sample_notes)]
            # Add some variation; a replacement with no match leaves the note as is
            variations = [
                note,
                note.replace("User reported", "Customer called about"),
                note.replace("Issue resolved", "Problem fixed"),
                note.replace("Found", "Discovered"),
                note.replace("working correctly", "functioning properly")
            ]
            extended_notes.append(variations[i % len(variations)])
        
        df = pd.DataFrame({
            'Ticket': [f'INC{2000 + i}' for i in range(len(extended_notes))],
            'Service Rep': np.random.choice(['Alice Johnson', 'Bob Smith', 'Carol Davis', 'David Wilson', 'Eva Brown'], len(extended_notes)),
            'description': [note.split('.')[0] + '.' for note in extended_notes],  # First sentence as description
            'close_notes': extended_notes  # Full note as close_notes
        })
    else:
        df = pd.read_csv(file_path)
        # Rename 'Notes' to 'close_notes' if needed
        if 'Notes' in df.columns and 'close_notes' not in df.columns:
            df = df.rename(columns={'Notes': 'close_notes'})
    
    if 'close_notes' not in df.columns:
        raise KeyError("Expected a 'close_notes' (or 'Notes') column in the incident data")
    
    print(f"Dataset loaded: {len(df)} incidents")
    print(f"Columns: {list(df.columns)}")
    print("\nSample close_notes:")
    for i, note in enumerate(df['close_notes'].head(3)):
        # Empty CSV cells load as NaN (a float), which cannot be sliced
        print(f"{i+1}. {str(note)[:100]}...")
    
    return df

# Load data: called with no arguments, this builds the synthetic sample incidents
df = load_and_explore_data()

# =============================================================================
# STEP 2: LLM EXTRACTION FUNCTIONS
# =============================================================================

def _parse_labelled_field(response: str, label: str) -> str:
    """Return the cleaned value that follows ``label:`` on the same line, or "Unknown"."""
    # [ \t]* rather than \s*: an empty value must not swallow the newline and
    # capture the following line (e.g. "HOW BROKE:\nHOW RESOLVED: ...").
    match = re.search(rf'{re.escape(label)}:[ \t]*(.*)', response, re.IGNORECASE)
    if not match:
        return "Unknown"

    # Drop markdown bold markers left over from replies like "**HOW BROKE:** ..."
    value = match.group(1).strip().strip('*').strip()
    # Clean up and limit words
    value = ' '.join(value.split()[:15])

    if not value or len(value) > 100 or value.lower() in ('unknown', 'unclear', 'not specified'):
        return "Unknown"
    return value


def extract_how_broke_and_resolved(notes: str, chat_function) -> Tuple[str, str]:
    """
    Use LLM to extract 'how broke' and 'how resolved' from incident notes

    Args:
        notes: Free-text incident notes.
        chat_function: Callable that receives the prompt string and returns
            the LLM reply as text.

    Returns:
        Tuple of (how_broke, how_resolved). Each value is at most 15 words,
        or "Unknown" when the notes are blank or missing, the LLM call
        fails, or the reply has no usable value for that field.
    """
    # Blank or missing notes (None / NaN) carry no information: skip the LLM call
    if notes is None or (isinstance(notes, float) and notes != notes) or not str(notes).strip():
        return "Unknown", "Unknown"

    prompt = f"""
You are analyzing IT incident notes to extract two key pieces of information.

INCIDENT NOTES:
{notes}

Extract the following information in exactly this format:

HOW BROKE: [What went wrong or failed - max 15 words, technical focus]
HOW RESOLVED: [How the issue was fixed - max 15 words, solution focus]

GUIDELINES:
- Focus on technical root cause for "HOW BROKE"
- Focus on the actual solution for "HOW RESOLVED"
- Use simple, clear technical language
- If information is unclear or missing, use "Unknown"
- Avoid user names, timestamps, non-technical details

EXAMPLES:

Example 1:
Notes: "User reported printer paper jam. Found sensor dirty from toner. Cleaned sensors. Printing works now."
HOW BROKE: printer sensor dirty from toner dust
HOW RESOLVED: cleaned dirty sensors and calibrated printer

Example 2:
Notes: "Server down with 500 errors. Memory leak in application. Restarted services and applied patch."
HOW BROKE: memory leak causing server errors
HOW RESOLVED: restarted services and applied hotfix patch

Example 3:
Notes: "Please help urgently! Something is broken!"
HOW BROKE: Unknown
HOW RESOLVED: Unknown

Now extract from the provided incident notes:
"""
    
    try:
        response = chat_function(prompt).strip()
    except Exception as e:
        # Boundary handler: chat_function may raise anything (network, auth, ...)
        print(f"Error extracting from notes: {e}")
        return "Unknown", "Unknown"

    how_broke = _parse_labelled_field(response, "HOW BROKE")
    how_resolved = _parse_labelled_field(response, "HOW RESOLVED")
    return how_broke, how_resolved

def batch_extract_incidents(df: pd.DataFrame, chat_function, 
                          batch_size: int = 1, sample_size: int = None) -> pd.DataFrame:
    """
    Run the LLM extraction over every row's 'close_notes'.

    Works on a copy of ``df`` (optionally only its first ``sample_size``
    rows) and returns that copy with two added columns, 'how broke' and
    'how resolved'. Rows that raise are recorded as "Unknown".
    ``batch_size`` is accepted but not used by this implementation.
    """
    df_work = df.copy()

    if sample_size:
        df_work = df_work.head(sample_size)
        print(f"Processing sample of {sample_size} incidents for demonstration")

    total = len(df_work)
    print(f"Extracting information from {total} incident notes...")

    broke_values = []
    resolved_values = []

    for position, notes in enumerate(df_work['close_notes'], start=1):
        try:
            broke, resolved = extract_how_broke_and_resolved(notes, chat_function)
            broke_values.append(broke)
            resolved_values.append(resolved)

            # Report progress after every tenth incident
            if position % 10 == 0:
                print(f"Processed {position}/{len(df_work)} incidents")

        except Exception as e:
            print(f"Error processing incident {position}: {e}")
            broke_values.append("Unknown")
            resolved_values.append("Unknown")

    # Attach the extracted fields as new columns
    df_work['how broke'] = broke_values
    df_work['how resolved'] = resolved_values

    print("Extraction completed!")
    return df_work

# =============================================================================
# STEP 3: MOCK CHAT FUNCTION FOR DEMONSTRATION
# =============================================================================

# Failure rules, checked in order. A rule fires when its primary keyword AND
# at least one of its alternative keywords occur in the lower-cased notes.
_MOCK_FAILURE_RULES = (
    ('outlook', ('crash',), "outlook application crashes when opening emails"),
    ('server', ('cpu', 'memory leak'), "server memory leak causing high cpu usage"),
    ('printer', ('jam', 'sensor'), "printer sensor dirty from toner dust"),
    ('network', ('dropping', 'connection'), "faulty network switch port causing disconnections"),
    ('blue screen', ('memory',), "defective ram causing blue screen errors"),
    ('excel', ('corrupt',), "office installation damaged corrupting excel files"),
    ('vpn', ('authentication', 'certificate'), "expired ssl certificate blocking vpn authentication"),
    ('database', ('slow',), "missing database indexes causing slow queries"),
    ('backup', ('disk space', 'storage'), "excessive log files consuming backup storage space"),
    ('email', ('spam', 'filter'), "restrictive spam filter blocking legitimate emails"),
    ('keyboard', ('stick', 'spill'), "coffee spill damage to keyboard components"),
    ('website', ('slow', 'cdn'), "cdn cache not updating causing slow loading"),
    ('scanner', ('driver', 'usb'), "usb driver conflicts after windows update"),
    ('timeout', ('connection', 'pool'), "database connection pool exhausted during peak traffic"),
    ('monitor', ('color', 'display'), "outdated graphics driver causing color distortion"),
    ('file server', ('permission',), "active directory replication failure blocking file access"),
    ('antivirus', ('blocking', 'false positive'), "antivirus false positive blocking legitimate software installation"),
    ('camera', ('driver', 'missing'), "missing camera driver after system update"),
    ('pos', ('crash', 'transaction'), "database lock timeouts under transaction load"),
)

# Generic failure words used when no specific failure rule fires.
_MOCK_FAILURE_KEYWORDS = ('crash', 'error', 'fail', 'down', 'slow', 'corrupt', 'timeout', 'block')

# Resolution rules, checked in order. The first element is the builtin used to
# combine the phrase checks: `any` = one phrase suffices, `all` = every phrase
# must be present.
_MOCK_RESOLUTION_RULES = (
    (any, ('rebuilt pst', 'updated outlook'), "rebuilt pst file and updated outlook version"),
    (all, ('restarted services', 'patch'), "restarted services and applied hotfix patch"),
    (all, ('cleaned sensors',), "cleaned dirty sensors and calibrated printer"),
    (all, ('replaced', 'switch'), "replaced faulty network switch and updated firmware"),
    (all, ('replaced', 'memory'), "replaced defective memory module"),
    (all, ('reinstalled office',), "uninstalled and reinstalled office 365 suite"),
    (all, ('renewed', 'certificate'), "renewed ssl certificates and updated vpn configuration"),
    (any, ('added indexes', 'optimized queries'), "added database indexes and optimized queries"),
    (any, ('increased partition', 'cleaned logs'), "cleaned old logs and increased partition size"),
    (all, ('adjusted', 'filter'), "adjusted spam filter rules and retrained filters"),
    (all, ('replaced keyboard',), "replaced keyboard assembly and cleaned logic board"),
    (all, ('purged cache',), "purged cdn cache and fixed cache headers"),
    (all, ('reinstalled', 'driver'), "uninstalled conflicting drivers and reinstalled scanner software"),
    (all, ('increased pool',), "increased connection pool size and added retry logic"),
    (all, ('updated', 'driver'), "updated display drivers and calibrated monitor"),
    (any, ('forced sync', 'reset account'), "forced active directory sync and reset computer account"),
    (all, ('exclusion list',), "added software to exclusion list and updated definitions"),
    (all, ('reinstalled camera',), "reinstalled camera drivers and updated firmware"),
    (all, ('optimized database',), "optimized database queries and increased timeout values"),
)

# Generic resolution words used when no specific resolution rule fires.
_MOCK_RESOLUTION_KEYWORDS = ('restart', 'update', 'replace', 'install', 'configure', 'clean', 'repair')


def mock_chat_function(prompt: str) -> str:
    """
    Keyword-driven stand-in for a real chat function (demonstration only).

    The text between "INCIDENT NOTES:" and the following "Extract" is pulled
    out of the prompt, lower-cased, and matched by plain substring tests
    against the rule tables above. The first matching rule wins; if none
    matches, the first generic keyword found is used; otherwise the field
    stays "Unknown".

    Returns:
        A two-line string: "HOW BROKE: <text>" and "HOW RESOLVED: <text>".
        Both fields are "Unknown" when the prompt has no notes section.
    """
    section = re.search(r'INCIDENT NOTES:\s*(.+?)\s*Extract', prompt, re.DOTALL)
    if section is None:
        return "HOW BROKE: Unknown\nHOW RESOLVED: Unknown"

    text = section.group(1).strip().lower()

    # What broke: specific rules first, then the generic keyword fallback.
    how_broke = next(
        (description
         for primary, alternatives, description in _MOCK_FAILURE_RULES
         if primary in text and any(word in text for word in alternatives)),
        None,
    )
    if how_broke is None:
        how_broke = next(
            (f"system {word} detected" for word in _MOCK_FAILURE_KEYWORDS if word in text),
            "Unknown",
        )

    # How it was resolved: same two-stage lookup.
    how_resolved = next(
        (description
         for combine, phrases, description in _MOCK_RESOLUTION_RULES
         if combine(phrase in text for phrase in phrases)),
        None,
    )
    if how_resolved is None:
        how_resolved = next(
            (f"performed {word} operation" for word in _MOCK_RESOLUTION_KEYWORDS if word in text),
            "Unknown",
        )

    return f"HOW BROKE: {how_broke}\nHOW RESOLVED: {how_resolved}"

# =============================================================================
# STEP 4: TESTING THE EXTRACTION FUNCTION
# =============================================================================

print("\n" + "=" * 80, "TESTING EXTRACTION FUNCTION", "=" * 80, sep="\n")

# Hand-written notes for a quick smoke test; the third one deliberately
# carries no usable technical detail.
sample_notes = [
    "User reported Outlook crashes when opening large emails. Found PST file corruption. Rebuilt PST file and updated Outlook. Issue resolved.",
    "Server experiencing high CPU usage. Memory leak in application code. Restarted services and applied hotfix patch. Performance normal.",
    "Please help urgently! Something is broken but not sure what!",
    "Printer paper jam error but no visible jam. Sensor dirty from toner. Cleaned sensors and calibrated printer. Working correctly now."
]

print("Sample extractions:")
for position, note_text in enumerate(sample_notes, start=1):
    # Run each note through the extractor using the mock chat function
    broke_text, resolved_text = extract_how_broke_and_resolved(note_text, mock_chat_function)
    print(f"\n{position}. Notes: {note_text}")
    print(f"   How broke: {broke_text}")
    print(f"   How resolved: {resolved_text}")

# =============================================================================
# STEP 5: MAIN PROCESSING FUNCTION
# =============================================================================

def process_incident_notes(df: pd.DataFrame, chat_function, sample_size: int = None) -> pd.DataFrame:
    """
    Main function to process incident notes and extract how broke/resolved information

    Validates that 'close_notes' exists, substitutes a placeholder for missing
    notes, and delegates the extraction to batch_extract_incidents.

    Args:
        df: Incident data; must contain a 'close_notes' column. The caller's
            frame is left untouched.
        chat_function: Forwarded to batch_extract_incidents.
        sample_size: Forwarded to batch_extract_incidents; limits how many
            rows are processed when given.

    Returns:
        The DataFrame returned by batch_extract_incidents.

    Raises:
        ValueError: If df has no 'close_notes' column.
    """
    print("="*80)
    print("STARTING INCIDENT NOTES PROCESSING")
    print("="*80)

    # Validate input data
    if 'close_notes' not in df.columns:
        raise ValueError("DataFrame must contain 'close_notes' column")

    # Check for empty notes
    empty_notes = df['close_notes'].isna().sum()
    if empty_notes > 0:
        print(f"Warning: {empty_notes} incidents have empty notes")
        # assign() builds a new frame, so the placeholder text never leaks
        # back into the DataFrame owned by the caller.
        df = df.assign(close_notes=df['close_notes'].fillna("No notes provided"))

    # Process the notes
    df_result = batch_extract_incidents(df, chat_function, sample_size=sample_size)

    print("\nProcessing completed successfully!")
    return df_result

# =============================================================================
# STEP 6: ANALYSIS AND VISUALIZATION FUNCTIONS
# =============================================================================

def analyze_extraction_results(df: pd.DataFrame):
    """
    Print summary statistics for the extracted columns and plot them.

    Reports how many rows are 'Unknown' in 'how broke' and 'how resolved',
    lists the ten most frequent values of each column with their share of all
    rows, then calls create_visualizations(df).

    Returns:
        Tuple (top_broke, top_resolved): the top-10 value counts of
        'how broke' and 'how resolved'.
    """
    fields = (("How broke", 'how broke'), ("How resolved", 'how resolved'))

    print("=" * 80, "ANALYSIS OF EXTRACTION RESULTS", "=" * 80, sep="\n")

    total_incidents = len(df)

    # Count the 'Unknown' rows of both columns before printing anything
    unknown_counts = {column: (df[column] == 'Unknown').sum() for _, column in fields}

    print(f"Total incidents processed: {total_incidents}")
    for heading, column in fields:
        missing = unknown_counts[column]
        print(f"'{heading}' unknown: {missing} ({missing/total_incidents*100:.1f}%)")

    # Ten most frequent categories per column, each with its share of all rows
    top_categories = {}
    for heading, column in fields:
        print(f"\nTop 10 '{heading}' categories:")
        ranking = df[column].value_counts().head(10)
        top_categories[column] = ranking
        for category, count in ranking.items():
            print(f"  {category}: {count} ({count/total_incidents*100:.1f}%)")

    # Visualizations
    create_visualizations(df)

    return top_categories['how broke'], top_categories['how resolved']

def create_visualizations(df: pd.DataFrame):
    """
    Create visualizations for the extraction results

    Draws six panels on one 2x3 figure: pie charts and horizontal bar charts
    of the ten most frequent 'how broke' / 'how resolved' values, the
    percentage of 'Unknown' values per 'Service Rep', and a histogram of the
    word counts of both columns ('Unknown' counts as zero words).

    Args:
        df: Must contain the 'how broke', 'how resolved' and 'Service Rep'
            columns.

    Every panel is drawn on an explicit Axes. DataFrame.plot opens a brand-new
    figure when it is not given an ``ax``, which would leave the fifth panel
    empty and redirect the following pyplot calls to that new figure.
    """
    def _shorten(label, limit=30):
        # Keep pie labels readable by truncating long category names
        return label[:limit] + '...' if len(label) > limit else label

    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    ax_pie_broke, ax_pie_resolved, ax_bar_broke, ax_bar_resolved, ax_rep, ax_words = axes.flat

    top_broke = df['how broke'].value_counts().head(10)
    top_resolved = df['how resolved'].value_counts().head(10)

    # Plot 1: How broke distribution
    ax_pie_broke.pie(top_broke.values, labels=[_shorten(label) for label in top_broke.index],
                     autopct='%1.1f%%', startangle=90)
    ax_pie_broke.set_title('Top 10 "How Broke" Categories Distribution')

    # Plot 2: How resolved distribution
    ax_pie_resolved.pie(top_resolved.values, labels=[_shorten(label) for label in top_resolved.index],
                        autopct='%1.1f%%', startangle=90)
    ax_pie_resolved.set_title('Top 10 "How Resolved" Categories Distribution')

    # Plot 3: How broke frequency
    top_broke.plot(kind='barh', ax=ax_bar_broke)
    ax_bar_broke.set_title('Frequency of Top 10 "How Broke" Categories')
    ax_bar_broke.set_xlabel('Number of Incidents')

    # Plot 4: How resolved frequency
    top_resolved.plot(kind='barh', ax=ax_bar_resolved)
    ax_bar_resolved.set_title('Frequency of Top 10 "How Resolved" Categories')
    ax_bar_resolved.set_xlabel('Number of Incidents')

    # Plot 5: Unknown percentage by service rep
    rep_stats = df.groupby('Service Rep').agg({
        'how broke': lambda x: (x == 'Unknown').sum() / len(x) * 100,
        'how resolved': lambda x: (x == 'Unknown').sum() / len(x) * 100
    })
    # ax= keeps the chart inside the grid; rot=45 rotates the x tick labels
    rep_stats.plot(kind='bar', ax=ax_rep, rot=45)
    ax_rep.set_title('Unknown % by Service Rep')
    ax_rep.set_ylabel('Percentage Unknown')
    ax_rep.legend(['How Broke', 'How Resolved'])

    # Plot 6: Word count distribution
    word_counts_broke = df['how broke'].apply(lambda x: len(x.split()) if x != 'Unknown' else 0)
    word_counts_resolved = df['how resolved'].apply(lambda x: len(x.split()) if x != 'Unknown' else 0)

    ax_words.hist([word_counts_broke, word_counts_resolved], bins=15, alpha=0.7,
                  label=['How Broke', 'How Resolved'])
    ax_words.set_title('Word Count Distribution')
    ax_words.set_xlabel('Number of Words')
    ax_words.set_ylabel('Frequency')
    ax_words.legend()

    fig.tight_layout()
    plt.show()

# =============================================================================
# STEP 7: EXPORT FUNCTIONS
# =============================================================================

def export_results(df: pd.DataFrame, output_path: str = 'incident_extraction_results'):
    """
    Export results to CSV and generate summary report

    Writes ``<output_path>.csv`` with the full frame and
    ``<output_path>_summary.json`` with the total row count, the number of
    'Unknown' values per extracted column, the ten most frequent values of
    each extracted column, and the 'Unknown' counts per 'Service Rep'.

    Args:
        df: Must contain the 'how broke', 'how resolved' and 'Service Rep'
            columns.
        output_path: Path prefix (without extension) for both output files.
    """
    def _native_counts(series):
        # pandas hands back numpy integers, which json.dump rejects with a
        # TypeError; convert every count to a plain int first.
        return {key: int(value) for key, value in series.items()}

    # Export main dataset
    df.to_csv(f'{output_path}.csv', index=False)

    rep_unknowns = df.groupby('Service Rep').agg({
        'how broke': lambda x: (x == 'Unknown').sum(),
        'how resolved': lambda x: (x == 'Unknown').sum()
    })

    # Create summary report
    summary_report = {
        'total_incidents': len(df),
        'how_broke_unknown_count': int((df['how broke'] == 'Unknown').sum()),
        'how_resolved_unknown_count': int((df['how resolved'] == 'Unknown').sum()),
        'top_how_broke': _native_counts(df['how broke'].value_counts().head(10)),
        'top_how_resolved': _native_counts(df['how resolved'].value_counts().head(10)),
        'service_rep_stats': {
            column: _native_counts(rep_unknowns[column]) for column in rep_unknowns.columns
        },
    }

    with open(f'{output_path}_summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary_report, f, indent=2)

    print(f"Results exported to {output_path}.csv and {output_path}_summary.json")

# =============================================================================
# STEP 8: MAIN EXECUTION
# =============================================================================

def your_chat_function(prompt: str) -> str:
    """
    Placeholder chat function used by the pipeline; replace with a real LLM call.

    As written it simply forwards the prompt to mock_chat_function and returns
    that function's reply unchanged.

    Example integrations:
    - return chat(prompt)
    - return openai_client.chat.completions.create(...)
    - return anthropic_client.messages.create(...)

    NOTE(review): mock_chat_function returns a plain string, so a replacement
    presumably has to return the reply text as a str rather than a raw client
    response object - confirm against extract_how_broke_and_resolved.
    """
    return mock_chat_function(prompt)

# Kick off the end-to-end extraction run
print("Starting the incident notes extraction pipeline...")

# Demonstration run: pass your real chat function instead of the placeholder,
# and drop sample_size to process every row
df_final = process_incident_notes(df, your_chat_function, sample_size=50)

# =============================================================================
# STEP 9: DISPLAY RESULTS
# =============================================================================

print("\n" + "=" * 80, "FINAL RESULTS SAMPLE", "=" * 80, sep="\n")

# Show the first ten processed incidents, one short summary per ticket
sample_results = df_final[['Ticket', 'Service Rep', 'close_notes', 'how broke', 'how resolved']].head(10)
print("\nSample of extracted information:")
for _, row in sample_results.iterrows():
    summary_lines = [
        f"\nTicket: {row['Ticket']}",
        f"Notes: {row['close_notes'][:100]}...",  # first 100 characters only
        f"How broke: {row['how broke']}",
        f"How resolved: {row['how resolved']}",
    ]
    print("\n".join(summary_lines))

# Run analysis
print("\n" + "="*80)
print("RUNNING ANALYSIS")
print("="*80)

top_broke, top_resolved = analyze_extraction_results(df_final)

# Export results
export_results(df_final)

print("\n" + "="*80)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print("="*80)
print("\nTo use with your actual data:")
print("1. Replace 'your_chat_function' with your actual LLM chat function")
print("2. Load your CSV file: df = pd.read_csv('your_file.csv')")
# process_incident_notes rejects frames without 'close_notes', and the
# analysis/export steps group by 'Service Rep', so name both columns here.
print("3. Ensure your data has a 'close_notes' column ('Service Rep' is also needed for analysis/export)")
print("4. Run: df_result = process_incident_notes(df, your_chat_function)")
print("5. Analyze results and export as needed")

print("\nNew columns added:")
print("- 'how broke': Technical description of what failed (max 15 words)")
print("- 'how resolved': Description of how issue was fixed (max 15 words)")