# Dataset Exploration

This notebook explores the emotion-labelled text dataset (train and test splits) in the following sections:
1. Introduction
2. Dataset Description
3. Number of Samples
4. Emotion Classes
5. Class Distribution
6. Text Length Analysis
7. Summary of Findings

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Folder that holds the raw split files. The path is relative, so it is
# resolved against the working directory the notebook is run from.
DATA_DIR = Path("../data")


def load_dataset(file_name: str) -> pd.DataFrame:
    """Read one dataset split into a two-column DataFrame.

    The file is parsed as a header-less, semicolon-separated table whose
    two columns are named ``text`` and ``emotion``.

    Args:
        file_name: Name of the file inside ``DATA_DIR`` (e.g. ``"train.txt"``).

    Returns:
        A DataFrame with the columns ``text`` and ``emotion``.
    """
    column_names = ["text", "emotion"]
    return pd.read_csv(
        DATA_DIR / file_name,
        sep=";",
        header=None,
        names=column_names,
    )


# Read both splits with the shared loader, then stack them (with a fresh
# 0..n-1 index) so corpus-wide statistics can be computed on a single frame.
train_df, test_df = (load_dataset(name) for name in ("train.txt", "test.txt"))
full_df = pd.concat((train_df, test_df), ignore_index=True)

# Preview the first rows of the training split.
train_df.head(5)

## 1. Introduction

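The train and test splits loaded above are semicolon-separated text files with a `text` field and an `emotion` field per row. In the sections below we look at both splits, then summarize sample counts, emotion labels, and text lengths.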

## 2. Dataset Description

In [None]:
# Report the (rows, columns) shape of each split and the column names.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"Columns: {list(train_df.columns)}",
    sep="\n",
)

# Fixed seed so the five previewed rows are the same on every run.
train_df.sample(n=5, random_state=42)

## 3. Number of Samples

In [None]:
# Number of rows per split; "total" is the size of the concatenated frame.
counts = dict(train=len(train_df), test=len(test_df), total=len(full_df))

print(
    f"Train samples: {counts['train']}",
    f"Test samples: {counts['test']}",
    f"Total samples: {counts['total']}",
    sep="\n",
)

## 4. Emotion Classes

In [None]:
# Distinct labels found across both splits, in sorted order.
classes = sorted(set(full_df["emotion"]))

print("Classes:", classes)
print("Number of classes:", len(classes))

## 5. Class Distribution

In [None]:
# Per-class sample counts for each split, ordered by label (not by frequency)
# so both tables and the bar chart list the classes alphabetically.
train_class_counts = train_df["emotion"].value_counts().sort_index()
test_class_counts = test_df["emotion"].value_counts().sort_index()

# sep="\n" puts each table on its own lines. Passing "...:\n" as a separate
# argument with the default separator makes print() insert a space in front
# of the table's first line, which misaligns it.
print("Train distribution:", train_class_counts, sep="\n")
print("\nTest distribution:", test_class_counts, sep="\n")

fig, ax = plt.subplots(figsize=(6, 4))
train_class_counts.plot(kind="bar", ax=ax, title="Train Class Distribution")
ax.set_xlabel("Emotion")
ax.set_ylabel("Count")
# Restyle the existing tick labels in place instead of re-setting their text.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()

## 6. Text Length Analysis

In [None]:
def add_length_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with two text-length columns added.

    Args:
        df: Frame with a string column named ``text``. It is not modified.

    Returns:
        A new DataFrame with the original columns plus:
        ``char_len`` - number of characters in ``text``;
        ``word_len`` - number of whitespace-separated tokens in ``text``.
    """
    text = df["text"].str
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(
        char_len=text.len(),
        word_len=text.split().str.len(),
    )


# Rebinding full_df is safe to re-run: add_length_features works on a copy and
# simply overwrites char_len / word_len if they already exist.
full_df = add_length_features(full_df)

# sep="\n" keeps each describe() table on its own lines. Passing "...:\n" as a
# separate argument with the default separator makes print() insert a space in
# front of the table's first line, which misaligns it.
print("Character length stats:", full_df["char_len"].describe(), sep="\n")
print("\nWord length stats:", full_df["word_len"].describe(), sep="\n")

# Side-by-side histograms: characters per text (left), words per text (right).
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
full_df["char_len"].hist(bins=30, ax=axes[0])
axes[0].set_title("Char Length")
axes[0].set_xlabel("Characters")
axes[0].set_ylabel("Count")

full_df["word_len"].hist(bins=30, ax=axes[1])
axes[1].set_title("Word Length")
axes[1].set_xlabel("Words")
axes[1].set_ylabel("Count")

plt.tight_layout()
plt.show()

## 7. Summary of Findings

In [None]:
# Labels with the highest and lowest sample count in the training split.
most_common_class = train_class_counts.idxmax()
least_common_class = train_class_counts.idxmin()

# One human-readable line per finding; counts are looked up by label.
summary_lines = [
    f"Total samples: {counts['total']} (train={counts['train']}, test={counts['test']})",
    f"Number of classes: {len(classes)} -> {classes}",
    f"Most common train class: {most_common_class} ({train_class_counts.loc[most_common_class]})",
    f"Least common train class: {least_common_class} ({train_class_counts.loc[least_common_class]})",
    f"Median word length: {full_df['word_len'].median():.1f}",
    f"Median char length: {full_df['char_len'].median():.1f}",
]

# Print the findings as a dash-prefixed bullet list.
print("Summary:")
for line in summary_lines:
    print("-", line)