# KWIC Analysis Example

## Keyword-in-Context (KWIC) Analysis

### Example Sentence:
**"The dog barked loudly at the stranger."**

If we want to do KWIC analysis on the word **"dog"**, we'd look at how it’s used in context.

### KWIC Output for "dog":


```
...The dog barked loudly...
...dog barked loudly at...
```

Here’s how this helps:

1. **Context of "dog"**: In the first window, "dog" appears in the phrase "The dog barked loudly"; in the second, it appears in "dog barked loudly at". Seeing these windows side by side shows how **"dog"** behaves in the sentence: it is the subject that performs the action "barked".

This is KWIC! It shows the word **"dog"** in the context where it's used, helping us understand its role in the sentence.

In [2]:
import warnings
warnings.filterwarnings('ignore')  # Suppress warning messages

import json
import pandas as pd
import numpy as np
from nltk import word_tokenize
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def auto_kwic_analysis(text, window_size=5, num_keywords=15):
    """
    Full automated KWIC pipeline with keyword detection.

    Keywords are chosen by fitting a TF-IDF vectorizer (unigrams and bigrams,
    English stop words removed) on ``text``. Every token of ``text`` that
    contains one of those keywords as a case-insensitive substring yields one
    row holding the token, its left/right context windows and its position.

    :param text: Input text (cleaned/summarized)
    :param window_size: Context words around keywords
    :param num_keywords: Number of keywords to auto-detect
    :return: KWIC DataFrame with the columns ``keyword``, ``left_context``,
        ``right_context``, ``position`` and ``full_context``. The frame is
        empty (but keeps these columns) when ``text`` is empty or when the
        analysis fails; failures are printed, not raised.
    """
    columns = ['keyword', 'left_context', 'right_context', 'position', 'full_context']

    try:
        # Nothing to analyse: return early instead of letting the vectorizer
        # fail on a document that has no words.
        if not text or not text.strip():
            return pd.DataFrame(columns=columns)

        # 1. Auto-detect keywords using TF-IDF with n-grams
        tfidf = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=num_keywords,
            stop_words='english'
        )
        # Only the learned vocabulary is needed, not the TF-IDF matrix.
        tfidf.fit([text])
        # Lower-case once here rather than once per token inside the loop.
        keywords = [kw.lower() for kw in tfidf.get_feature_names_out()]
        # NOTE(review): two-word keywords contain a space, so they can only
        # match a token that itself contains a space - presumably never the
        # case for word_tokenize output. Confirm before relying on bigrams.

        # 2. Process text for KWIC
        tokens = word_tokenize(text)
        concordance = {name: [] for name in columns}

        # 3. Find all keyword occurrences
        for i, word in enumerate(tokens):
            current_word = word.lower()
            # Substring match: a keyword also hits longer tokens that embed it
            # (e.g. hyphenated compounds).
            if not any(kw in current_word for kw in keywords):
                continue

            left = ' '.join(tokens[max(0, i - window_size):i])
            right = ' '.join(tokens[i + 1:i + 1 + window_size])

            concordance['keyword'].append(word)
            concordance['left_context'].append(left)
            concordance['right_context'].append(right)
            concordance['position'].append(i)
            concordance['full_context'].append(f"{left} {word} {right}")

        return pd.DataFrame(concordance)

    except Exception as e:
        # Best-effort by design: report the problem and hand back an empty
        # frame with the usual columns so callers can still check `.empty`.
        print(f"Error in KWIC analysis: {e}")
        return pd.DataFrame(columns=columns)

def full_pipeline(
    input_path='data/json_files/transcript_data.json',
    csv_path='data/csv_files/kwic_results.csv',
    json_path='data/json_files/kwic_results.json',
):
    """Complete automated analysis pipeline.

    Loads the ``summary`` field from a JSON file, runs
    ``auto_kwic_analysis`` on it and, when any rows are produced, writes them
    to CSV and JSON and prints the first five contexts.

    :param input_path: JSON file holding the processed transcript data
    :param csv_path: Destination of the CSV export
    :param json_path: Destination of the JSON export (one record per row)
    :return: KWIC DataFrame; an empty DataFrame if any step raises (the
        error is printed, not re-raised)
    """
    # Local import so this function does not depend on a file-level import.
    from pathlib import Path

    try:
        # 1. Load processed data. JSON is UTF-8 encoded, so decode it as such
        #    instead of relying on the platform's default encoding.
        with open(input_path, encoding='utf-8') as f:
            data = json.load(f)
        summary = data.get('summary', '')

        # 2. Run auto KWIC
        kwic_df = auto_kwic_analysis(summary)

        if not kwic_df.empty:
            # 3. Save results, creating the output folders on first use so a
            #    missing directory does not discard the computed results.
            for target in (csv_path, json_path):
                Path(target).parent.mkdir(parents=True, exist_ok=True)
            kwic_df.to_csv(csv_path, index=False)
            kwic_df.to_json(json_path, orient='records')

            # 4. Print sample
            print("Top 5 Key Contexts:")
            print(kwic_df[['keyword', 'full_context']].head().to_string(index=False))

        return kwic_df

    except Exception as e:
        # Best-effort by design: report the failure and return an empty frame.
        print(f"Error in pipeline: {e}")
        return pd.DataFrame()

# Execute the full pipeline only when this file is run as a script (not when
# it is imported); the DataFrame returned by full_pipeline() is kept in
# `final_results`.
if __name__ == "__main__":
    final_results = full_pipeline()

Top 5 Key Contexts:
         keyword                                                                        full_context
         Mueller                                             Bert Mueller , a 35-year-old American ,
     35-year-old                  Bert Mueller , a 35-year-old American , founded California Burrito
Mexican-inspired      California Burrito , a successful Mexican-inspired restaurant chain in India .
           chain             , a successful Mexican-inspired restaurant chain in India . He overcame
           India successful Mexican-inspired restaurant chain in India . He overcame challenges like
