# Data Exploration

This notebook provides an initial exploratory analysis of the available CSV datasets located in the `Data/` directory.

We will: 
1. Inspect directory structure and file sizes.
2. Load training / trial / test splits for `comments`, `task1`, and `task2`.
3. Show basic schema (shape, column names), head/tail samples, and descriptive statistics for numeric columns.
4. Check missing values.
5. (Placeholder) Outline next analytical steps.

> You can adapt and extend this notebook for modeling, feature engineering, or reporting.

In [7]:
# Imports
import os, json, math, textwrap
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display  # Added to fix NameError for display()

# Display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 120)
sns.set_theme(context='notebook', style='whitegrid')

DATA_ROOT = Path('Data')  # Adjust if the relative path changes
SPLITS = ['training data', 'trial data', 'test data']
FILES = ['comments.csv', 'task1.csv', 'task2.csv']

def file_summary(path: Path) -> dict:
    """Describe a single expected data file.

    Returns a dict with three keys: ``path`` (the path as a string),
    ``exists`` (whether the path is present on disk) and ``size_kb`` (size in
    units of 1024 bytes rounded to two decimals, or ``None`` when missing).
    """
    present = path.exists()
    size_kb = round(path.stat().st_size / 1024, 2) if present else None
    return {'path': str(path), 'exists': present, 'size_kb': size_kb}

# One summary record per expected (split, file) pair, ordered by SPLITS then FILES.
summaries = [
    file_summary(DATA_ROOT / split / fname)
    for split in SPLITS
    for fname in FILES
]
pd.DataFrame(summaries)

Unnamed: 0,path,exists,size_kb
0,Data/training data/comments.csv,False,
1,Data/training data/task1.csv,False,
2,Data/training data/task2.csv,False,
3,Data/trial data/comments.csv,False,
4,Data/trial data/task1.csv,False,
5,Data/trial data/task2.csv,False,
6,Data/test data/comments.csv,False,
7,Data/test data/task1.csv,False,
8,Data/test data/task2.csv,False,


## Helper Functions

In [8]:
def load_split(split: str, files=None, root=None) -> dict:
    """Load every available CSV of one split into a dict of DataFrames.

    Parameters
    ----------
    split : str
        Sub-directory name under the data root (normally one of ``SPLITS``).
    files : iterable of str, optional
        File names to look for inside the split directory. Defaults to the
        module-level ``FILES``.
    root : str or Path, optional
        Directory that contains the split folders. Defaults to the
        module-level ``DATA_ROOT``.

    Returns
    -------
    dict
        Maps each file name without its trailing ``.csv`` (for example
        ``'comments.csv'`` -> ``'comments'``) to the loaded DataFrame. Files
        that are missing or cannot be read are reported with a printed message
        and left out, so the result may be empty.
    """
    file_names = FILES if files is None else files
    split_dir = Path(DATA_ROOT if root is None else root) / split

    dfs = {}
    for fname in file_names:
        path = split_dir / fname
        if not path.exists():
            print(f'WARNING: Missing {path}')
            continue
        try:
            df = pd.read_csv(path)
        except Exception as e:
            # Best-effort on purpose: one unreadable file is reported and
            # skipped so the remaining files of the split still load.
            print(f'ERROR reading {path}: {e}')
            continue
        # Strip only a trailing '.csv'. str.replace('.csv', '') would also
        # remove a '.csv' that appears in the middle of a file name.
        key = fname[:-len('.csv')] if fname.endswith('.csv') else fname
        dfs[key] = df
    return dfs

def quick_profile(df: pd.DataFrame, name: str):
    """Print and display a compact overview of ``df`` under the label ``name``.

    Shows the shape, the column names, the first and last three rows, and a
    per-column count of missing values (only columns that have any).
    """
    print(f'--- {name} ---')
    print('Shape:', df.shape)
    print('Columns:', list(df.columns))

    # First three rows, then last three rows.
    for preview in (df.head(3), df.tail(3)):
        display(preview)

    null_counts = df.isna().sum()
    null_counts = null_counts[null_counts > 0]
    if null_counts.empty:
        print('No missing values.')
    else:
        print('Missing values:')
        display(null_counts.to_frame('missing'))
    print()

def describe_numeric(df: pd.DataFrame):
    """Display transposed ``describe()`` statistics for the numeric columns.

    Prints a short notice instead when the numeric selection is empty.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    if numeric_part.empty:
        print('No numeric columns.')
        return
    summary = numeric_part.describe()
    display(summary.T)

def plot_numeric_distributions(df: pd.DataFrame, max_cols: int = 10):
    """Draw 30-bin histograms for up to ``max_cols`` numeric columns of ``df``.

    Does nothing when the numeric selection is empty. The figure is created
    but not shown; the caller decides when to render it.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    if not numeric_part.empty:
        selected = numeric_part.columns[:max_cols]
        # Three units of width per plotted column, capped at 18.
        fig_width = min(3 * len(selected), 18)
        numeric_part[selected].hist(figsize=(fig_width, 4), bins=30)
        plt.tight_layout()

# Printed confirmation that this cell ran and the helper functions above are defined.
print('Helper functions ready.')

Helper functions ready.


## Load All Splits

In [10]:
# Load each split once, in SPLITS order; a split with no readable files maps to an empty dict.
data = dict(zip(SPLITS, map(load_split, SPLITS)))
# Show keys structure
{split_name: list(frames) for split_name, frames in data.items()}



{'training data': [], 'trial data': [], 'test data': []}

## Quick Profiles (Training Split)

In [None]:
# Profile every dataset of the training split: overview, numeric summary, histograms.
train = data.get('training data', {})
if not train:
    # Without this notice the cell finishes silently when nothing was loaded.
    print('No training datasets loaded; nothing to profile.')
for name, df in train.items():
    quick_profile(df, f'training/{name}')
    describe_numeric(df)
    plot_numeric_distributions(df)
    # Render this dataset's histograms here so they appear next to its
    # profile instead of all together at the end of the cell output.
    plt.show()

## Compare Row Counts Across Splits

In [None]:
# One record per loaded (split, dataset) pair.
rows = [
    {'split': split, 'dataset': name, 'rows': len(df)}
    for split, dfs in data.items()
    for name, df in dfs.items()
]
# Explicit columns keep the frame well-formed even when no dataset was loaded;
# a column-less frame makes pivot(index='dataset', ...) raise KeyError.
row_counts = pd.DataFrame(rows, columns=['split', 'dataset', 'rows'])

if row_counts.empty:
    # Nothing to pivot or plot; keep the name defined for any later cell.
    row_counts_pivot = pd.DataFrame()
    print('No datasets loaded; skipping row-count comparison.')
else:
    row_counts_pivot = row_counts.pivot(index='dataset', columns='split', values='rows')
    display(row_counts_pivot)

    fig, ax = plt.subplots()
    sns.barplot(data=row_counts, x='dataset', y='rows', hue='split', ax=ax)
    ax.set_title('Row Counts by Split')
    fig.tight_layout()
    plt.show()

## Next Steps
- Add domain-specific feature engineering.
- Target variable exploration (if applicable).
- Text preprocessing for comments (tokenization, normalization).
- Train/validation split inside training data if needed.
- Baseline model experiments.

Feel free to insert new cells below this point to continue your analysis.

In [None]:
# Environment/version check
# Reports the interpreter version and the version of each library this
# notebook relies on. A library that cannot be imported is listed as
# 'not installed' instead of aborting the whole report.
import importlib
import sys

mods = ['pandas','numpy','matplotlib','seaborn','IPython']

versions = {}
for mod_name in mods:
    try:
        # Import each module once.
        module = importlib.import_module(mod_name)
    except ImportError:
        versions[mod_name] = 'not installed'
    else:
        # Modules without a __version__ attribute are reported as 'n/a'.
        versions[mod_name] = getattr(module, '__version__', 'n/a')

print('Python', sys.version)
print('Library versions:')
for k, v in versions.items():
    print(f'  {k}: {v}')