In [1]:
"""
01_explore_raw_data.py

Exploratory notebook (script form) for:
- Inspecting clean_Rigveda / clean_Yajurveda / clean_Samveda CSVs
- Understanding Chanda distributions
- Sanity-checking key columns and potential anomalies

Run from project root with:
    python -m notebooks.01_explore_raw_data
or open as a notebook and run cell-by-cell.
"""

'\n01_explore_raw_data.py\n\nExploratory notebook (script form) for:\n- Inspecting clean_Rigveda / clean_Yajurveda / clean_Samveda CSVs\n- Understanding Chanda distributions\n- Sanity-checking key columns and potential anomalies\n\nRun from project root with:\n    python -m notebooks.01_explore_raw_data\nor open as a notebook and run cell-by-cell.\n'

In [6]:
import os

import pandas as pd

# 1. Get the current directory (which is 'notebooks')
NOTEBOOK_DIR = os.getcwd() 

# 2. Go up one level to the project root ('vedic-chandas')
BASE_DIR = os.path.dirname(NOTEBOOK_DIR)

# 3. Construct the path to the 'data/raw' folder inside the root
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

# 4. Define the full paths to your files
rig_path = os.path.join(DATA_DIR, "clean_Rigveda.csv")
yaj_path = os.path.join(DATA_DIR, "clean_Yajurveda.csv")
sam_path = os.path.join(DATA_DIR, "clean_Samveda.csv")

In [7]:
print("Rigveda path:", rig_path)
print("Yajurveda path:", yaj_path)
print("Samaveda path:", sam_path)

Rigveda path: E:\After School Years\Hackathons\SIH 2025\final-mvp\vedic-chandas\data\raw\clean_Rigveda.csv
Yajurveda path: E:\After School Years\Hackathons\SIH 2025\final-mvp\vedic-chandas\data\raw\clean_Yajurveda.csv
Samaveda path: E:\After School Years\Hackathons\SIH 2025\final-mvp\vedic-chandas\data\raw\clean_Samaveda.csv


In [8]:
rig = pd.read_csv(rig_path)
yaj = pd.read_csv(yaj_path)
sam = pd.read_csv(sam_path)

FileNotFoundError: [Errno 2] No such file or directory: 'E:\\After School Years\\Hackathons\\SIH 2025\\final-mvp\\vedic-chandas\\data\\raw\\clean_Rigveda.csv'

In [None]:
print("=== Rigveda head ===")
display(rig.head())  # in script mode you can use print(rig.head())
print("=== Yajurveda head ===")
display(yaj.head())
print("=== Samaveda head ===")
display(sam.head())

In [None]:
print("=== Rigveda columns ===")
print(rig.columns.tolist())
print("=== Yajurveda columns ===")
print(yaj.columns.tolist())
print("=== Samaveda columns ===")
print(sam.columns.tolist())

In [None]:
print("Rigveda row count:", len(rig))
print("Yajurveda row count:", len(yaj))
print("Samaveda row count:", len(sam))

In [None]:
print("Rigveda row count:", len(rig))
print("Yajurveda row count:", len(yaj))
print("Samaveda row count:", len(sam))

In [None]:
# Look at some heterometric / comma-separated Chanda labels
print("=== Example heterometric chanda labels (Rigveda) ===")
print(rig[rig["Chanda"].astype(str).str.contains(",")]["Chanda"].head(20))

print("=== Example heterometric chanda labels (Yajurveda) ===")
print(yaj[yaj["Chanda"].astype(str).str.contains(",")]["Chanda"].head(20))

print("=== Example heterometric chanda labels (Samaveda) ===")
print(sam[sam["Chanda"].astype(str).str.contains(",")]["Chanda"].head(20))

In [None]:
# Check presence of Padpath / Transliteration columns
print("Rigveda Padpath non-null:", rig["Padpath"].notna().mean())
print("Rigveda Transliteration non-null:", rig["Transliteration"].notna().mean())
print("Yajurveda Padpath non-null:", yaj["Padpath"].notna().mean())
print("Samaveda Padpath non-null:", sam["Padpath"].notna().mean())

In [None]:
# Inspect some famous mantras by ID to ensure they look right.
# Example: RV 1.1.1
rv_111 = rig[
    (rig["Mandal"] == 1) & (rig["Sukta"] == 1) & (rig["Mantra Number"] == 1)
]
print("=== RV 1.1.1 row ===")
display(rv_111)

In [None]:
# Save small debug subsets for prototyping and tests if desired
OUT_DIR = os.path.join(BASE_DIR, "data", "interim")
os.makedirs(OUT_DIR, exist_ok=True)

rig.head(50).to_csv(os.path.join(OUT_DIR, "rigveda_sample_50.csv"), index=False)
yaj.head(50).to_csv(os.path.join(OUT_DIR, "yajurveda_sample_50.csv"), index=False)
sam.head(50).to_csv(os.path.join(OUT_DIR, "samaveda_sample_50.csv"), index=False)

print("Wrote sample CSVs to", OUT_DIR)