# 01 - Exploratory Data Analysis

This notebook explores the Twitter emotion dataset to understand:
- Class distribution
- Text characteristics
- Data quality issues

In [1]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

from src.config import DATASET_NAME, EMOTION_LABELS

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [None]:
# Load the dataset identified by DATASET_NAME (imported from src.config) and
# print the returned object to show which splits it contains.
# NOTE(review): trust_remote_code=True permits a dataset loading script from
# the dataset repository to execute locally — confirm DATASET_NAME refers to a
# trusted source and that the installed `datasets` version accepts this flag.
dataset = load_dataset(DATASET_NAME, trust_remote_code=True)
print(f"Dataset splits: {dataset}")

In [None]:
# Combine all splits into a single DataFrame for analysis.
#
# Each split is converted with Dataset.to_pandas() rather than sliced with
# `[:]`: the slice materialises every column of the split as Python objects on
# each access (six times in the previous version), and np.concatenate on
# Python strings produces a fixed-width unicode array padded to the longest
# text. Row order is unchanged: train, then validation, then test.
SPLIT_NAMES = ('train', 'validation', 'test')

df = pd.concat(
    [dataset[split].to_pandas()[['text', 'label']] for split in SPLIT_NAMES],
    ignore_index=True,
)

# Kept as NumPy arrays in case other cells use the raw texts/labels directly.
X_all = df['text'].to_numpy()
y_all = df['label'].to_numpy()

# Translate the integer label into its emotion name via EMOTION_LABELS.
df['emotion'] = df['label'].map(EMOTION_LABELS)

# Series.map yields NaN for any label that has no entry in the mapping, and
# value_counts() drops NaN by default, so surface such rows here instead of
# letting them vanish from the later per-emotion statistics.
n_unmapped = int(df['emotion'].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} samples have a label missing from EMOTION_LABELS")

print(f"Total samples: {len(df)}")
df.head()

## Class Distribution

In [None]:
# Number of samples per emotion, in value_counts() order (most frequent first).
class_counts = df['emotion'].value_counts()
print("Class Distribution:", class_counts, sep="\n")

# Bar chart of the same counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Emotion Class Distribution')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
# Tilt the emotion names so neighbouring tick labels do not collide.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Text Length Analysis

In [None]:
# Derive two size measures for every text: number of characters, and number
# of whitespace-separated tokens.
texts = df['text']
df['text_length'] = texts.str.len()
df['word_count'] = texts.str.split().str.len()

# Summary statistics (count, mean, std, quartiles, min/max) for both measures.
length_stats = df[['text_length', 'word_count']].describe()
print("Text Length Statistics:")
print(length_stats)

In [None]:
# Side-by-side histograms of the character-count and word-count columns.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# One entry per panel: (column, bar colour, title, x-axis label).
panels = [
    ('text_length', 'steelblue', 'Distribution of Text Length (characters)', 'Character Count'),
    ('word_count', 'coral', 'Distribution of Word Count', 'Word Count'),
]

for ax, (column, colour, title, x_label) in zip(axes, panels):
    ax.hist(df[column], bins=50, color=colour, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Sample Texts by Emotion

In [None]:
# Print up to `n_examples` randomly chosen texts for every emotion name in
# EMOTION_LABELS. The fixed random_state keeps the selection reproducible.
n_examples = 3

for emotion in EMOTION_LABELS.values():
    print(f"\n=== {emotion.upper()} ===")
    emotion_texts = df.loc[df['emotion'] == emotion, 'text']
    # Series.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the rows available; an
    # emotion with no rows then simply prints its header and nothing else.
    n_to_draw = min(n_examples, len(emotion_texts))
    samples = emotion_texts.sample(n_to_draw, random_state=42)
    for i, text in enumerate(samples, 1):
        print(f"{i}. {text}")