In [20]:
# Imports and versions
import os, sys, platform, subprocess
from pathlib import Path
import pandas as pd
import plotly
import plotly.express as px

print('Python:', sys.version.splitlines()[0])
print('Pandas:', pd.__version__)
print('Plotly:', plotly.__version__)

Python: 3.12.11 (main, Jul  1 2025, 05:29:09) [GCC 12.2.0]
Pandas: 2.3.3
Plotly: 6.5.0


In [21]:
# Paths to key CSVs (using absolute paths from repo root)
from pathlib import Path
REPO_ROOT = Path('/workspaces/WjecDocumentScraper')
DATA_FILES = REPO_ROOT / 'notebooks/cleansed_data/document_stats-files.csv'
DATA_LLM_PROOFREADER = REPO_ROOT / 'notebooks/cleansed_data/llm_proofreader_cleansed_data.csv'
DATA_CATEGORISED = REPO_ROOT / 'notebooks/cleansed_data/language_check_categorised.csv'

# Load (these are small enough to load into memory in this repo)
files = pd.read_csv(DATA_FILES)
#lang = pd.read_csv(DATA_LANG)
llm_proofreader = pd.read_csv(DATA_LLM_PROOFREADER)

files.shape, llm_proofreader.shape

((239, 3), (15673, 11))

In [22]:
# Light-mode guard for the categorised dataset
LIGHT_MODE = os.getenv('DOCS_LIGHT', '0') == '1'
categorised_available = DATA_CATEGORISED.exists()

if LIGHT_MODE and not categorised_available:
    print('DOCS_LIGHT=1 and language_check_categorised.csv not found; skipping categorised analyses.')
elif not categorised_available:
    print(f'language_check_categorised.csv not found at {DATA_CATEGORISED}; add it to enable categorised analyses.')
else:
    print(f'Found categorised dataset at {DATA_CATEGORISED}')

Found categorised dataset at /workspaces/WjecDocumentScraper/notebooks/cleansed_data/language_check_categorised.csv


In [23]:
# Quick heads and basic stats
display(docs.head())
display(docs.describe(include='all'))

display(files.head())
display(files['Pages'].describe())

display(lang.head())
print('Language-check issues by type:')
print(lang['Type'].value_counts().head(10))

NameError: name 'docs' is not defined

In [None]:
# Example plot: PDFs per subject (from document_stats.csv)
fig = px.bar(docs, x='Subject', y='PDFs', title='PDFs per subject')
fig.update_layout(xaxis={'categoryorder':'total descending'}, height=500)
fig.show()

In [None]:
# Distribution of pages per file (document_stats-files.csv)
fig = px.histogram(files, x='Pages', nbins=30, title='Distribution of pages per file')
fig.show()

In [None]:
# Join example: verify per-subject page totals
files_by_subject = files.groupby('Subject', as_index=False)['Pages'].sum().rename(columns={'Pages':'Pages_files'})
merged = pd.merge(docs, files_by_subject, on='Subject', how='left')
merged['Pages_diff'] = merged['Pages'] - merged['Pages_files']
display(merged[['Subject','Pages','Pages_files','Pages_diff']].sort_values('Pages_diff', key=abs, ascending=False).head(15))

In [None]:
# Top subjects by language-check issues (quick summary)
top_issues = lang.groupby('Subject').size().reset_index(name='n').sort_values('n', ascending=False).head(25)
fig = px.bar(top_issues, x='Subject', y='n', title='Top subjects by language-check issues')
fig.update_layout(xaxis={'categoryorder':'total descending'}, height=500)
fig.show()

In [None]:
# Save a small processed summary for quick reference
import os
os.makedirs('notebooks', exist_ok=True)
merged.to_csv('notebooks/processed_summary.csv', index=False)
print('Wrote notebooks/processed_summary.csv')

In [None]:
# Reproducibility: git commit and environment
try:
    sha = subprocess.check_output(['git','rev-parse','--short','HEAD']).decode().strip()
except Exception:
    sha = '<not available>'
print('Git commit:', sha)
print('Platform:', platform.platform())

## Next steps

- Add categorised dataset analyses when language_check_categorised.csv is available.
- Add more focused visualisations (issue types over time, heatmaps by rule ID, per-file issue density).
- Keep CI notebook execution and MkDocs publishing green as new notebooks are added.