# Interview Question Analysis

Analyze common interview questions from extracted interview data.

In [None]:
import json
import re
from collections import Counter
from pathlib import Path

# Location of the extracted-interviews JSON dump, relative to this notebook
DATA_PATH = "../output_20260127_014850/extracted_interviews_20260127_033819.json"

# Read the whole document into `data`; the following cells query it via .get()
with Path(DATA_PATH).open("r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Quick sanity check of what was loaded
group_total = len(data.get('extractions', []))
summary_info = data.get('summary', {})
print(f"Total groups: {group_total}")
print(f"Summary: {summary_info}")

## 1. Extract All Question Topics from `cross_post_insights`

In [None]:
# Gather every 相关题目汇总 entry and every non-empty 综合观察 note
# found under each group's cross_post_insights.
all_topics = []
all_observations = []

for entry in data.get('extractions', []):
    cross_post = entry.get('cross_post_insights', {})

    # Topics are accumulated as one flat list across all groups
    all_topics += cross_post.get('相关题目汇总', [])

    # Empty observations are dropped
    note = cross_post.get('综合观察', '')
    if note:
        all_observations.append(note)

print(f"Total topic entries: {len(all_topics)}")
print(f"Total observations: {len(all_observations)}")

## 2. Clean and Normalize Topic Names

In [None]:
def clean_topic(topic: str) -> str:
    """Normalize a raw topic string so equivalent topics compare equal.

    Removes parenthesised post references such as ``（帖子3、4）`` (full- or
    half-width brackets), collapses whitespace runs left behind by that
    removal, and trims surrounding whitespace together with any trailing
    colons (``:`` or ``：``).

    Args:
        topic: Raw topic text.

    Returns:
        The cleaned topic. May be an empty string when nothing but a post
        reference, whitespace or colons was present.
    """
    # Remove post references like （帖子3、4）
    cleaned = re.sub(r'[（(][^）)]*帖子[^）)]*[）)]', '', topic)
    # A reference removed from the middle leaves two adjacent spaces behind;
    # collapse them so "A （帖子1） B" and "A B" count as the same topic.
    cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()
    # Trailing colons and whitespace can be interleaved ("DB ：", "DB: :"),
    # so strip both in one pass instead of whitespace first, colons second.
    return re.sub(r'[\s：:]+$', '', cleaned)

# Clean all topics
# Each topic is normalized exactly once; results that end up empty are dropped.
cleaned_topics = [cleaned for cleaned in map(clean_topic, all_topics) if cleaned]
print(f"Cleaned topics: {len(cleaned_topics)}")
print("\nSample cleaned topics:")
for t in cleaned_topics[:10]:
    print(f"  - {t}")

## 3. Count Topic Frequency

In [None]:
# Frequency of each cleaned topic string (identical strings only)
topic_counter = Counter(cleaned_topics)

print("Top 30 Most Common Topics (exact match):")
print("=" * 60)
# most_common() yields (topic, count) pairs, highest count first
for name, hits in topic_counter.most_common(30):
    print(f"{hits:3d} | {name}")

## 4. Group Similar Topics by Keywords

In [None]:
# Define keyword categories
# Maps a category label to the keywords that place a topic in that category.
# Matching is case-insensitive; see categorize_topic for the exact rule.
KEYWORD_CATEGORIES = {
    "System Design": ["system design", "系统设计", "SD", "Inference API", "distributed", "分布式"],
    "In-memory DB/Cache": ["in-memory", "cache", "key-value", "KV", "DB"],
    "LeetCode/Coding": ["leetcode", "coding", "算法", "backtrack", "dp", "bfs", "dfs"],
    "ML/AI": ["ml", "machine learning", "rl", "reinforcement", "transformer", "attention", "llm"],
    "Web/API": ["web", "api", "crawler", "http", "rest"],
    "Take-home": ["take-home", "take home", "assignment"],
    "BQ/Behavioral": ["bq", "behavioral", "行为", "culture"],
}


def _keyword_regex(keyword: str):
    """Compile *keyword* into a case-insensitive search pattern.

    Keywords that begin with an ASCII letter or digit must start at the
    beginning of an ASCII word, so "rl" no longer matches inside "url" and
    "ml" no longer matches inside "html". Keywords starting with any other
    character (e.g. Chinese) keep plain substring semantics.
    """
    pattern = re.escape(keyword)
    first = keyword[:1]
    if first.isascii() and first.isalnum():
        # ASCII-only lookbehind: a preceding CJK character must not block the
        # match (e.g. "系统设计SD"), which a Unicode-aware \b would do.
        pattern = r"(?<![A-Za-z0-9])" + pattern
    return re.compile(pattern, re.IGNORECASE)


def categorize_topic(topic: str) -> list:
    """Return every category whose keyword list matches *topic*.

    Args:
        topic: Topic text to classify.

    Returns:
        Category labels in ``KEYWORD_CATEGORIES`` order, each at most once,
        or ``["Other"]`` when no keyword matches.
    """
    # KEYWORD_CATEGORIES is read at call time so edits to it take effect
    # without redefining this function; `re` caches the compiled patterns.
    matches = [
        category
        for category, keywords in KEYWORD_CATEGORIES.items()
        if any(_keyword_regex(kw).search(topic) for kw in keywords)
    ]
    return matches or ["Other"]

# Tally topics per category and remember which topics fell into each one.
# A topic matching several categories is counted once in each of them.
category_counter = Counter()
category_topics = {}

for item in cleaned_topics:
    for label in categorize_topic(item):
        category_counter[label] += 1
        category_topics.setdefault(label, []).append(item)

print("Topics by Category:")
print("=" * 60)
for label, total in category_counter.most_common():
    print(f"{total:3d} | {label}")

## 5. Extract Question Descriptions from Individual Posts

In [None]:
# Extract interview_info from all posts
# Builds `all_questions`: one dict per post that has a usable stage and a
# usable question description. A key that is present with a JSON null value
# bypasses dict.get's default, so every lookup below falls back explicitly.
all_questions = []

for group in data.get('extractions') or []:
    for post in group.get('posts') or []:
        interview_info = post.get('interview_info') or {}

        # A null stage is treated like a missing one (i.e. unknown)
        stage = post.get('interview_stage', '未知')
        if stage is None:
            stage = '未知'

        # Skip filtered stages
        if stage in ('未知', 'N/A', '无有效信息'):
            continue

        # Skip posts whose description is missing or a placeholder value
        description = interview_info.get('题目描述') or ''
        if description in ('', '无有效信息', 'N/A'):
            continue

        all_questions.append({
            'stage': stage,
            'type': interview_info.get('题目类型') or '',
            'description': description,
            'requirements': interview_info.get('具体要求') or [],
            'focus_areas': interview_info.get('考察重点') or [],
            'url': post.get('source_url') or '',
        })

print(f"Total valid question descriptions: {len(all_questions)}")

In [None]:
# How many collected questions carry each question-type label
type_counter = Counter([question['type'] for question in all_questions])

print("Questions by Type:")
print("=" * 60)
for label, total in type_counter.most_common():
    print(f"{total:3d} | {label}")

In [None]:
# How many collected questions belong to each interview stage
stage_counter = Counter([question['stage'] for question in all_questions])

print("Questions by Interview Stage:")
print("=" * 60)
# Only the 20 most frequent stages are listed
for label, total in stage_counter.most_common(20):
    print(f"{total:3d} | {label}")

## 6. Find Common Question Patterns

In [None]:
# Extract key patterns from descriptions
# Each entry is (regex, label). ASCII keywords are anchored to the start of an
# ASCII word so short tokens do not fire inside unrelated words ("ml" in
# "html", "api" in "rapid", "code" in "decode"). Chinese keywords stay plain
# substrings because they have no word boundaries to anchor to.
_WORD_START = r'(?<![a-z0-9])'
COMMON_PATTERNS = [
    (_WORD_START + r'(?:in-memory|key.?value|cache|db)', 'In-Memory DB/Cache'),
    (_WORD_START + r'(?:system design|sd)|设计', 'System Design'),
    (_WORD_START + r'(?:inference|api|middleware)', 'Inference/API Design'),
    (_WORD_START + r'(?:leetcode|lc)|力扣', 'LeetCode'),
    (_WORD_START + r'crawler|爬虫', 'Web Crawler'),
    (_WORD_START + r'(?:ml|machine learning)|模型', 'ML/AI'),
    (_WORD_START + r'(?:bq|behavioral)|行为', 'Behavioral'),
    (_WORD_START + r'(?:coding|code)', 'Coding'),
    (_WORD_START + r'distributed|分布式', 'Distributed Systems'),
    (_WORD_START + r'(?:chat|chatbox)|聊天', 'Chat Service'),
]

pattern_counter = Counter()
pattern_examples = {}

for q in all_questions:
    desc = q['description']
    # Patterns are tried in list order and only the first hit is counted, so
    # earlier entries take precedence when a description matches several.
    for pattern, name in COMMON_PATTERNS:
        # re.IGNORECASE also applies to the lookbehind character class
        if re.search(pattern, desc, re.IGNORECASE):
            pattern_counter[name] += 1
            examples = pattern_examples.setdefault(name, [])
            # Keep at most three examples per label, truncated to 100 chars
            if len(examples) < 3:
                examples.append(desc[:100])
            break
    else:
        # No pattern matched this description
        pattern_counter['Other'] += 1

print("Common Question Patterns:")
print("=" * 60)
for pattern, count in pattern_counter.most_common():
    print(f"{count:3d} | {pattern}")

In [None]:
# Print the stored example descriptions under each pattern label
print("Examples by Pattern:")
print("=" * 80)
for label in pattern_examples:
    print(f"\n### {label}")
    # Examples are numbered from 1
    for idx, snippet in enumerate(pattern_examples[label], start=1):
        print(f"  {idx}. {snippet}...")

## 7. Detailed Analysis of OA (Online Assessment) Questions

In [None]:
# Filter OA-stage questions
# "OA" must appear as a standalone token (case-insensitive): a bare substring
# test also matches stages such as "Whiteboard" or "Onboarding". Only ASCII
# letters count as neighbours, so "OA轮" and "OA1" still match.
_OA_STAGE = re.compile(r'(?<![A-Za-z])OA(?![A-Za-z])', re.IGNORECASE)

# `or ''` keeps a null stage from raising; it simply does not match
oa_questions = [q for q in all_questions if _OA_STAGE.search(q['stage'] or '')]

print(f"Total OA questions: {len(oa_questions)}")
print("\nOA Question Descriptions:")
print("=" * 80)
# Show at most 20 questions; descriptions are cut to 200 characters and only
# the first three requirements / focus areas are listed.
for i, q in enumerate(oa_questions[:20], 1):
    print(f"\n{i}. [{q['type']}] {q['description'][:200]}")
    if q['requirements']:
        print(f"   Requirements: {', '.join(q['requirements'][:3])}")
    if q['focus_areas']:
        print(f"   Focus: {', '.join(q['focus_areas'][:3])}")

## 8. Export Summary

In [None]:
# Assemble the headline numbers computed by the earlier cells
summary = dict(
    total_topics=len(cleaned_topics),
    total_questions=len(all_questions),
    top_topics=topic_counter.most_common(20),
    topics_by_category=dict(category_counter),
    questions_by_type=dict(type_counter),
    questions_by_stage=dict(stage_counter.most_common(15)),
    pattern_counts=dict(pattern_counter),
)

# Write the report as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text
# readable in the file instead of \u escapes
output_path = "interview_question_analysis_summary.json"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(summary, ensure_ascii=False, indent=2))

print(f"Summary saved to: {output_path}")
rule = "=" * 60
print("\n" + rule)
print("ANALYSIS COMPLETE")
print(rule)