# MCP Tool Categorization Analysis

이 노트북은 MCP 도구 분류 결과를 분석하고 시각화합니다.

## 분류 방법
- **Keyword**: 규칙 기반 키워드 매칭 (1,395개)
- **OpenAI (GPT-4o)**: LLM 기반 분류 (1,467개)
- **Upstage (Solar Pro 2)**: LLM 기반 분류 (1,464개)
- **Google (Gemini 2.5 Flash)**: LLM 기반 분류 (1,121개)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Plot styling applied to every figure created after this point.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 10,
    'figure.dpi': 150,
    'figure.figsize': (12, 8),
})

# Directory the result CSVs are read from (and the summary tables written to).
RESULTS_DIR = Path('results')

## 1. 데이터 로드

In [None]:
# Result file for each classification method, keyed by the display label
# that the tables and plots below use.
result_files = {
    'Keyword': 'keyword.csv',
    'OpenAI (GPT-4o)': 'llm_openai.csv',
    'Upstage (Solar Pro 2)': 'llm_upstage.csv',
    'Google (Gemini 2.5)': 'llm_google.csv',
}

# Load every result file once; insertion order is the order used everywhere below.
datasets = {
    label: pd.read_csv(RESULTS_DIR / filename)
    for label, filename in result_files.items()
}

# Per-method frames under their own names, for cells that address them directly.
keyword_df = datasets['Keyword']
openai_df = datasets['OpenAI (GPT-4o)']
upstage_df = datasets['Upstage (Solar Pro 2)']
google_df = datasets['Google (Gemini 2.5)']

# Row count per method as a quick sanity check.
print('=== 데이터셋 정보 ===')
for label, frame in datasets.items():
    print(f'{label}: {len(frame)} servers')

## 2. 대분류 (Major Category) 분석

### 2.1 대분류 분포표

In [None]:
# Count and percentage of each major category, per classification method.
# Percentages are relative to the full row count of that method's frame.
major_distributions = {}
for label, frame in datasets.items():
    counts = frame['category_major'].value_counts()
    major_distributions[label] = pd.DataFrame({
        'Count': counts,
        'Percentage': (counts / len(frame) * 100).round(2),
    })

# Combined table indexed by the union of categories seen by any method.
all_categories = sorted({
    category
    for table in major_distributions.values()
    for category in table.index
})
comparison_df = pd.DataFrame(index=all_categories)

# A category a method never produced has no entry for that method, so it is
# filled with 0 (counts are cast back to integers after the fill).
for label, table in major_distributions.items():
    aligned = table.reindex(comparison_df.index)
    comparison_df[f'{label}_Count'] = aligned['Count'].fillna(0).astype(int)
    comparison_df[f'{label}_Pct'] = aligned['Percentage'].fillna(0)

# Order rows by how often the keyword method assigned each category.
comparison_df = comparison_df.sort_values('Keyword_Count', ascending=False)

print('=== 대분류 분포 비교 ===')
display(comparison_df)

### 2.2 대분류 Bar Chart (4개 방식 비교)

In [None]:
# Grouped bar chart: one group per major category, one bar per method.
fig, ax = plt.subplots(figsize=(18, 10))

categories = comparison_df.index.tolist()
x = np.arange(len(categories))
width = 0.2

# (method label, slot within the group in units of bar width, bar color)
bar_specs = [
    ('Keyword', -1.5, '#66BB6A'),
    ('OpenAI (GPT-4o)', -0.5, '#42A5F5'),
    ('Upstage (Solar Pro 2)', 0.5, '#FF7043'),
    ('Google (Gemini 2.5)', 1.5, '#AB47BC'),
]
bar_groups = [
    ax.bar(x + slot * width, comparison_df[f'{label}_Count'], width, label=label, color=color)
    for label, slot, color in bar_specs
]

ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('MCP Tool Classification: Major Category Comparison (4 Methods)', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=45, ha='right', fontsize=10)
ax.legend(loc='upper right', fontsize=10)

# Write the count above every non-empty bar; zero-height bars stay unlabeled.
for group in bar_groups:
    for rect in group:
        bar_height = rect.get_height()
        if not bar_height > 0:
            continue
        ax.annotate(
            f'{int(bar_height)}',
            xy=(rect.get_x() + rect.get_width() / 2, bar_height),
            xytext=(0, 2),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=6,
        )

plt.tight_layout()
plt.show()

### 2.3 대분류 Pie Chart (4개 방식)

In [None]:
# 2x2 grid of pies, one per classification method.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
axes = axes.flatten()

# Set3 sampled at 14 evenly spaced points; each pie uses a prefix of this palette.
colors = plt.cm.Set3(np.linspace(0, 1, 14))

for ax, (label, frame) in zip(axes, datasets.items()):
    dist = frame['category_major'].value_counts()
    # Fraction of the pie taken by each wedge, in wedge order.
    shares = dist.values / dist.sum()

    wedges, texts, autotexts = ax.pie(
        dist.values,
        labels=dist.index,
        autopct='%1.1f%%',
        colors=colors[:len(dist)],
        startangle=90,
        pctdistance=0.75,
    )
    ax.set_title(f'{label}\n({len(frame)} servers)', fontsize=12, fontweight='bold')

    # Blank the percentage text on wedges under 5%; shrink it on the rest.
    for pct_text, share in zip(autotexts, shares):
        if share < 0.05:
            pct_text.set_text('')
        else:
            pct_text.set_fontsize(8)
    for label_text in texts:
        label_text.set_fontsize(8)

plt.suptitle('MCP Tool Classification: Major Category Distribution', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

### 2.4 개별 대분류 Pie Chart

In [None]:
# One standalone major-category pie per classification method.
for label, frame in datasets.items():
    fig, ax = plt.subplots(figsize=(10, 8))

    dist = frame['category_major'].value_counts()
    # Sample Set3 with exactly one point per wedge.
    palette = plt.cm.Set3(np.linspace(0, 1, len(dist)))

    wedges, texts, autotexts = ax.pie(
        dist.values,
        labels=dist.index,
        autopct='%1.1f%%',
        colors=palette,
        startangle=90,
        pctdistance=0.8,
    )

    ax.set_title(
        f'{label} - Major Category Distribution\n({len(frame)} servers)',
        fontsize=14,
        fontweight='bold',
    )

    # Same font size for percentage texts and category labels.
    for text_obj in [*autotexts, *texts]:
        text_obj.set_fontsize(9)

    plt.tight_layout()
    plt.show()

## 3. 소분류 (Minor Category) 분석

### 3.1 소분류 분포표

In [None]:
# 소분류 분포 계산
for name, df in datasets.items():
    print(f'\n=== {name} - Minor Category Distribution ===')
    dist = df['category_minor'].value_counts()
    pct = (dist / len(df) * 100).round(2)
    minor_df = pd.DataFrame({
        'Count': dist,
        'Percentage': pct
    })
    display(minor_df)

### 3.2 소분류 Bar Chart (4개 방식 비교)

In [None]:
# 모든 소분류 합집합
all_minor_cats = sorted(set().union(*[set(df['category_minor'].unique()) for df in datasets.values()]))

fig, ax = plt.subplots(figsize=(20, 18))

y = np.arange(len(all_minor_cats))
height = 0.2

keyword_vals = [keyword_df['category_minor'].value_counts().get(c, 0) for c in all_minor_cats]
openai_vals = [openai_df['category_minor'].value_counts().get(c, 0) for c in all_minor_cats]
upstage_vals = [upstage_df['category_minor'].value_counts().get(c, 0) for c in all_minor_cats]
google_vals = [google_df['category_minor'].value_counts().get(c, 0) for c in all_minor_cats]

bars1 = ax.barh(y - 1.5*height, keyword_vals, height, label='Keyword', color='#66BB6A')
bars2 = ax.barh(y - 0.5*height, openai_vals, height, label='OpenAI (GPT-4o)', color='#42A5F5')
bars3 = ax.barh(y + 0.5*height, upstage_vals, height, label='Upstage (Solar Pro 2)', color='#FF7043')
bars4 = ax.barh(y + 1.5*height, google_vals, height, label='Google (Gemini 2.5)', color='#AB47BC')

ax.set_yticks(y)
ax.set_yticklabels(all_minor_cats, fontsize=8)
ax.set_xlabel('Count', fontsize=12)
ax.set_title(f'All Minor Categories Comparison ({len(all_minor_cats)} categories)', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)

plt.tight_layout()
plt.show()

### 3.3 소분류 Pie Chart (4개 방식)

In [None]:
# 2x2 grid of minor-category pies, one per classification method.
fig, axes = plt.subplots(2, 2, figsize=(24, 20))
axes = axes.flatten()

# Number of individual wedges to keep; everything past it is merged.
top_n = 15

for ax, (label, frame) in zip(axes, datasets.items()):
    dist = frame['category_minor'].value_counts()

    # Keep the top_n most frequent categories and fold the remainder into a
    # single 'Others' wedge appended at the end.
    if len(dist) > top_n:
        leading, remainder = dist.iloc[:top_n], dist.iloc[top_n:]
        leading['Others'] = remainder.sum()
        dist = leading

    # Sample tab20 with one point per wedge.
    palette = plt.cm.tab20(np.linspace(0, 1, len(dist)))

    wedges, texts, autotexts = ax.pie(
        dist.values,
        labels=dist.index,
        autopct='%1.1f%%',
        colors=palette,
        startangle=90,
        pctdistance=0.8,
    )
    ax.set_title(f'{label}\nMinor Categories (Top {top_n})', fontsize=12, fontweight='bold')

    # Same small font for percentage texts and category labels.
    for text_obj in [*autotexts, *texts]:
        text_obj.set_fontsize(7)

plt.suptitle('MCP Tool Classification: Minor Category Distribution', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

### 3.4 개별 소분류 Bar Chart

In [None]:
# Colormap used for each method's standalone minor-category chart.
color_schemes = dict([
    ('Keyword', 'viridis'),
    ('OpenAI (GPT-4o)', 'plasma'),
    ('Upstage (Solar Pro 2)', 'inferno'),
    ('Google (Gemini 2.5)', 'cividis'),
])

for label, frame in datasets.items():
    fig, ax = plt.subplots(figsize=(14, 16))

    minor_dist = frame['category_minor'].value_counts()
    # Sample the colormap between 0.1 and 0.9, one point per bar.
    cmap = plt.colormaps[color_schemes[label]]
    bar_colors = cmap(np.linspace(0.1, 0.9, len(minor_dist)))

    positions = np.arange(len(minor_dist))
    bars = ax.barh(positions, minor_dist.values, color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(minor_dist.index, fontsize=8)
    # Flip the axis so the first (most frequent) category is drawn at the top.
    ax.invert_yaxis()
    ax.set_xlabel('Count', fontsize=12)
    ax.set_title(
        f'{label} - All Minor Categories ({len(minor_dist)} categories)',
        fontsize=14,
        fontweight='bold',
    )

    # Write each count just to the right of the end of its bar.
    for rect, count in zip(bars, minor_dist.values):
        ax.annotate(
            f'{count}',
            xy=(count, rect.get_y() + rect.get_height() / 2),
            xytext=(3, 0),
            textcoords='offset points',
            ha='left',
            va='center',
            fontsize=7,
        )

    plt.tight_layout()
    plt.show()

## 4. 비율 분석

### 4.1 대분류 비율표

In [None]:
print('=== 대분류 비율 비교 (%) ===')

# Percentage of each major category per method, relative to that method's
# full row count.
#
# The table is built in one step from a dict of Series so that its row index
# is the union of every method's categories. Assigning the Series one column
# at a time to an empty DataFrame pins the index to the first method's
# categories, and any category that only a later method produced is silently
# dropped by the alignment.
major_pct_df = pd.DataFrame({
    label: (frame['category_major'].value_counts() / len(frame) * 100).round(2)
    for label, frame in datasets.items()
})

# A category a method never produced counts as 0%; rows are ordered by the
# first method's percentage.
first_method = list(datasets.keys())[0]
major_pct_df = major_pct_df.fillna(0).sort_values(first_method, ascending=False)
display(major_pct_df)

### 4.2 소분류 비율표 (Top 30)

In [None]:
print('=== 소분류 비율 비교 (Top 30) ===')

# Percentage of each minor category per method, relative to that method's
# full row count.
#
# The table is built in one step from a dict of Series so that its row index
# is the union of every method's categories. Assigning the Series one column
# at a time to an empty DataFrame pins the index to the first method's
# categories, and any minor category that only a later method produced is
# silently dropped by the alignment.
minor_pct_df = pd.DataFrame({
    label: (frame['category_minor'].value_counts() / len(frame) * 100).round(2)
    for label, frame in datasets.items()
})

# A category a method never produced counts as 0%.
minor_pct_df = minor_pct_df.fillna(0)

# Order rows by the first method's percentage and show the top 30.
first_method = list(datasets.keys())[0]
minor_pct_df = minor_pct_df.sort_values(first_method, ascending=False)
display(minor_pct_df.head(30))

### 4.3 비율 Heatmap

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))

# Major-category percentage matrix: one row per category, one column per method.
heatmap_data = major_pct_df.values
im = ax.imshow(heatmap_data, cmap='YlOrRd', aspect='auto')

# One tick per method on x and per category on y, labeled from the table.
ax.set_xticks(np.arange(len(major_pct_df.columns)))
ax.set_yticks(np.arange(len(major_pct_df.index)))
ax.set_xticklabels(major_pct_df.columns, fontsize=9, rotation=30, ha='right')
ax.set_yticklabels(major_pct_df.index, fontsize=10)

# Cells above half of the largest value get white text, the rest black
# (presumably to keep the numbers readable on the darker cells). The cutoff
# is the same for every cell, so it is computed once; the size check keeps
# max() from raising on an empty table.
text_cutoff = heatmap_data.max() / 2 if heatmap_data.size else 0
for (row, col), val in np.ndenumerate(heatmap_data):
    text_color = 'white' if val > text_cutoff else 'black'
    ax.text(col, row, f'{val:.1f}%', ha='center', va='center', color=text_color, fontsize=8)

ax.set_title('Major Category Distribution Heatmap (%)', fontsize=14, fontweight='bold')
plt.colorbar(im, ax=ax, label='Percentage')
plt.tight_layout()
plt.show()

## 5. 결과 저장

In [None]:
# Write both percentage tables to CSV inside the results directory.
output_tables = {
    'major_category_percentage.csv': major_pct_df,
    'minor_category_percentage.csv': minor_pct_df,
}
for filename, table in output_tables.items():
    table.to_csv(RESULTS_DIR / filename)

# List the files that were just written.
print('저장된 파일:')
for filename in output_tables:
    print(f'  - {filename}')

## 6. 연결 타입 (Connection Types) 분석

In [None]:
# Load the connection-type table.
conn_df = pd.read_csv(RESULTS_DIR / 'connection_types.csv')

print('=== 연결 타입 분포 (전체) ===')
print(conn_df['connection_category'].value_counts())
print()

# Restrict to rows whose source is 'smithery' (per the original note, these
# are the servers that carry connection info).
smithery_conn = conn_df[conn_df['source'] == 'smithery']
smithery_total = len(smithery_conn)
# Counted once and reused for both the distribution and the percentages.
smithery_counts = smithery_conn['connection_category'].value_counts()

print(f'=== 연결 타입 분포 (Smithery Only: {smithery_total} servers) ===')
print(smithery_counts)
print()

# Share of each connection category among the Smithery rows.
print('=== 연결 타입 비율 (Smithery Only) ===')
for cat in ['stdio', 'http', 'both']:
    count = int(smithery_counts.get(cat, 0))
    # With no Smithery rows at all, report 0% instead of dividing by zero.
    pct = count / smithery_total * 100 if smithery_total else 0.0
    print(f'{cat}: {count} ({pct:.1f}%)')