In [1]:
import zipfile
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Directory that receives every figure saved by this notebook.
OUT = Path("plots")
# parents=True keeps this line working if OUT is later pointed at a nested path.
OUT.mkdir(parents=True, exist_ok=True)

# Parquet file holding the training data; the path is relative to the
# current working directory.
train_path = Path("QEvasion/data/train-00000-of-00001.parquet")

# Fail early with a short, readable message when the file is missing.
if not train_path.exists():
    raise FileNotFoundError(f"File not found: {train_path}")

train = pd.read_parquet(train_path)

# If the frame only carries a generic 'label' column, expose it under the
# name 'clarity_label', which is what the plotting cells below index.
if 'label' in train.columns and 'clarity_label' not in train.columns:
    train = train.rename(columns={'label': 'clarity_label'})

# The clarity plot reads train['clarity_label'] unconditionally, so stop here
# with an explicit message rather than a bare KeyError in a later cell.
if 'clarity_label' not in train.columns:
    raise KeyError(
        f"Expected a 'clarity_label' (or 'label') column in {train_path}; "
        f"found columns: {list(train.columns)}"
    )

In [2]:
# Clarity label distribution: count every label (missing values are shown
# as their own "NULL" bar) and save the bar chart as a PDF under OUT.
clarity_counts = train['clarity_label'].fillna("NULL").value_counts()

# Draw on an explicit figure/axes pair instead of the implicit current figure.
clarity_fig, ax = plt.subplots(figsize=(7, 4))
clarity_counts.plot.bar(ax=ax)
ax.set(title="Train: Clarity label distribution", ylabel="Count")

clarity_fig.tight_layout()
clarity_fig.savefig(OUT / "train_clarity_distribution.pdf")
# Close the figure once it is on disk so it is not rendered inline or kept in memory.
plt.close(clarity_fig)


In [3]:
# Evasion label distribution. The column is optional, so the whole plot is
# skipped when the loaded frame does not have it.
if 'evasion_label' in train.columns:
    # Missing values get their own "NULL" bar instead of being dropped.
    ev_counts = train['evasion_label'].fillna("NULL").value_counts()

    # Explicit figure/axes pair so saving and closing target this figure only.
    ev_fig, ev_ax = plt.subplots(figsize=(9, 4))
    ev_counts.plot.bar(ax=ev_ax)
    ev_ax.set_title("Train: Evasion distribution")
    # Label the y-axis like the clarity chart does, so the saved PDF can be
    # read on its own.
    ev_ax.set_ylabel("Count")

    ev_fig.tight_layout()
    ev_fig.savefig(OUT / "train_evasion_distribution.pdf")
    # Close the figure once it is on disk so it is not rendered inline or kept in memory.
    plt.close(ev_fig)
