# CLINC150 — Exploratory Data Analysis


In [None]:
import sys
sys.path.insert(0, '../src')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from dataset import load_clinc150

# Load the dataset splits together with the label-name -> id mapping
splits, label2id = load_clinc150()

# Inverse mapping (id -> label name) for turning ids back into names
id2label = {idx: name for name, idx in label2id.items()}

print(f'Number of intent classes: {len(label2id)}')

In [None]:
# Per-split counts of in-domain vs out-of-domain (OOD) samples.
# The third field of each sample is used as the OOD flag.
for split, samples in splits.items():
    in_domain, ood = [], []
    for sample in samples:
        # Route each sample into exactly one bucket based on its OOD flag
        (ood if sample[2] else in_domain).append(sample)
    print(f'{split:5s}: {len(in_domain):5d} in-domain | {len(ood):4d} OOD')

In [None]:
# Samples-per-class summary for the training split (OOD samples excluded)
train_labels = [
    intent
    for _text, intent, ood_flag in splits['train']
    if not ood_flag
]
label_counts = Counter(train_labels)

# Smallest, largest and average class size
print(f'Min samples per class: {min(label_counts.values())}')
print(f'Max samples per class: {max(label_counts.values())}')
print(f'Mean samples per class: {sum(label_counts.values())/len(label_counts):.1f}')

In [None]:
# Utterance length (whitespace-separated word count) histograms:
# left panel  - every utterance in the training split
# right panel - test split, in-domain overlaid with OOD
all_texts = [utterance for utterance, _, _ in splits['train']]
lengths = [len(utterance.split()) for utterance in all_texts]

# Test-split word counts, separated by the OOD flag
in_lengths = [len(utt.split()) for utt, _, flag in splits['test'] if not flag]
ood_lengths = [len(utt.split()) for utt, _, flag in splits['test'] if flag]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
train_ax, test_ax = axes

train_ax.hist(lengths, bins=30, color='steelblue', edgecolor='white')
train_ax.set(
    xlabel='Word count',
    ylabel='Frequency',
    title='Utterance Length Distribution (Train)',
)

# Semi-transparent bars so the two overlaid distributions both stay visible
for word_counts, series_name, colour in (
    (in_lengths, 'In-domain', 'steelblue'),
    (ood_lengths, 'OOD', 'tomato'),
):
    test_ax.hist(word_counts, bins=25, alpha=0.6, label=series_name, color=colour)
test_ax.set(
    xlabel='Word count',
    ylabel='Frequency',
    title='Utterance Length: In-domain vs OOD (Test)',
)
test_ax.legend()

plt.tight_layout()
# Save before show() so the written file contains the rendered figure
plt.savefig('../report/eda_lengths.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Print the first few out-of-domain (OOD) utterances from the test split.
#
# A running counter decides when to stop. The previous stop condition
# re-counted the OOD samples in a slice of the list on every iteration and
# found the current position with list.index(), which was quadratic, returned
# the wrong position for duplicated utterances (index() finds the first equal
# element), and would raise ValueError if samples are stored as lists.
max_ood_samples = 10

print('Sample OOD utterances from test set:')
ood_shown = 0
for text, _, is_ood in splits['test']:
    if not is_ood:
        continue
    print(f'  - {text}')
    ood_shown += 1
    if ood_shown >= max_ood_samples:
        break