# Stats Scratchpad

This notebook is where I explore the data before putting it into another notebook for public consumption.

In [15]:
# Shared setup: execute the shared setup script via the IPython %run magic.
# NOTE(review): later cells read an `issues` DataFrame that is never defined in
# this notebook — presumably this script creates it; confirm against the script.
%run _shared/setup_data_overview.py

In [None]:
# Query: output every deduplicated "Spelling" issue
#
# Reads the `issues` DataFrame (not defined in this cell) and leaves the result
# in `spelling`; `cols_lower`, `cat_col` and `issue_col` are also left in the
# notebook namespace.

# Map lower-cased column names to their real labels so lookups are
# case-insensitive. str() guards against non-string column labels.
cols_lower = {str(c).lower(): c for c in issues.columns}
if 'category' in cols_lower:
    cat_col = cols_lower['category']
elif 'issue category' in cols_lower:
    cat_col = cols_lower['issue category']
else:
    raise KeyError(f"No category-like column found. Columns: {issues.columns.tolist()}")

# Select spelling rows (case-insensitive match) and make a copy so `issues`
# itself is never modified.
spelling = issues[issues[cat_col].astype(str).str.casefold() == 'spelling'].copy()

# Deduplicate by Issue token (case-insensitive, whitespace-normalised)
issue_col = cols_lower.get('issue')
if issue_col is not None:
    # split() + join(' ') trims both ends AND collapses internal runs of
    # whitespace, so "hand  written" and " hand written" count as one token.
    # (A plain strip() would only trim the ends.)
    issue_norm = (
        spelling[issue_col]
        .astype(str)
        .str.casefold()
        .str.split()
        .str.join(' ')
    )
    # Keep the first row per normalised token. Using a boolean mask avoids
    # adding (and possibly clobbering) a temporary column on the frame.
    spelling = spelling.loc[~issue_norm.duplicated()].reset_index(drop=True)
else:
    # fallback to full-row deduplication if no Issue column exists
    spelling = spelling.drop_duplicates().reset_index(drop=True)

display(spelling)

Unnamed: 0,Subject,File Name,Document Name,Issue ID,Page Number,Issue,Context,Pass Code,Issue Category,Confidence Score,Reasoning
0,Art and Design,gcse-art-and-design---guidance-for-teaching.csv,Guidance For Teaching,18,29,Paul Gaugin,...riate way). #### Examples could be: - **P...,LTC,Spelling,100,The artist's surname is misspelled; it should ...
1,Art and Design,gcse-art-and-design---guidance-for-teaching.csv,Guidance For Teaching,32,39,Womens,...elationships for young people.<br>Welsh **W...,LTC,Spelling,100,The proper noun 'Welsh Women's Aid' requires a...
2,Art and Design,gcse-art-and-design---guidance-for-teaching.csv,Guidance For Teaching,57,50,tryptichs,...of stories from the Bible (sometimes as **t...,LTC,Spelling,100,The correct spelling of the word is 'triptychs...
3,Art and Design,gcse-art-and-design---guidance-for-teaching.csv,Guidance For Teaching,75,70,wellinformed,"...es, presenting work that is meaningful, **w...",LTC,Spelling,90,'Well-informed' is a compound adjective that r...
4,Art and Design,wjec-gcse-art-and-design-specification.csv,Wjec Gcse Art And Design Specification,4,20,1 st,...released on the WJEC Portal on December **1...,LTC,Spelling,90,The ordinal indicator 'st' should be directly ...
5,Business,gcse-business---guidance-for-teaching-unit-2.csv,Guidance For Teaching Unit 2,0,7,hand written,...ust be submitted digitally (they may be **h...,LTC,Spelling,95,The compound adjective should be spelled as a ...
6,Business,gcse-business---guidance-for-teaching-unit-2.csv,Guidance For Teaching Unit 2,9,14,McDonalds,...s throughout the UK. Another example is **M...,LTC,Spelling,100,The proper noun should be spelled 'McDonald's'...
7,Business,gcse-business---guidance-for-teaching-unit-2.csv,Guidance For Teaching Unit 2,24,26,McDonalds,...o behaving in a sustainable manner: • [**M...,LTC,Spelling,95,The proper noun for the company is 'McDonald's...
8,Business,gcse-business---guidance-for-teaching-unit-2.csv,Guidance For Teaching Unit 2,25,26,McDonalds,...://www.youtube.com/watch?v=r-ctYi_eipk)[**M...,LTC,Spelling,95,The proper noun for the company is 'McDonald's...
9,Business,gcse-business---guidance-for-teaching-unit-2.csv,Guidance For Teaching Unit 2,33,37,aging,"...nging population structures, like an<br>**a...",LTC,Spelling,90,The British English spelling for the word is '...


In [None]:
# Filter deduped Spelling issues for Confidence Score >= 95 and show Issue, Context, and Confidence Score columns
#
# Result is displayed and also exposed as `spelling_conf95`.
import pandas as pd

# Minimum Confidence Score (inclusive) a row needs to be kept.
CONFIDENCE_THRESHOLD = 95

# Ensure `spelling` exists and is deduplicated by Issue token.
# An existing `spelling` in the kernel namespace is reused as-is; it is only
# rebuilt from `issues` when the name is missing (e.g. this cell is run alone).
if 'spelling' not in globals():
    cols_lower = {str(c).lower(): c for c in issues.columns}
    if 'category' in cols_lower:
        cat_col = cols_lower['category']
    elif 'issue category' in cols_lower:
        cat_col = cols_lower['issue category']
    else:
        raise KeyError(f"No category-like column found. Columns: {issues.columns.tolist()}")
    spelling = issues[issues[cat_col].astype(str).str.casefold() == 'spelling'].copy()
    issue_col = cols_lower.get('issue')
    if issue_col is not None:
        # Case-insensitive, whitespace-normalised token: split() + join(' ')
        # trims the ends and collapses internal whitespace runs.
        issue_norm = (
            spelling[issue_col]
            .astype(str)
            .str.casefold()
            .str.split()
            .str.join(' ')
        )
        # Keep the first row per token without adding a temporary column.
        spelling = spelling.loc[~issue_norm.duplicated()].reset_index(drop=True)
    else:
        spelling = spelling.drop_duplicates().reset_index(drop=True)

# Locate column names (case-insensitive)
cols_lower = {str(c).lower(): c for c in spelling.columns}
issue_col = cols_lower.get('issue')
context_col = cols_lower.get('context')
conf_col = cols_lower.get('confidence score') or cols_lower.get('confidence')

missing = [name for name, val in [('Issue', issue_col), ('Context', context_col), ('Confidence', conf_col)] if val is None]
if missing:
    raise KeyError(f"Missing expected columns: {missing}. Available columns: {spelling.columns.tolist()}")

# Filter and select the three columns in a single .loc call.
# NOTE(review): `spelling` is deduplicated (first row per Issue token) before
# this filter runs, so a token whose first row scores below the threshold is
# dropped even if a later duplicate scored higher — confirm this is intended.
is_high_conf = spelling[conf_col].astype(float) >= CONFIDENCE_THRESHOLD
filtered = spelling.loc[is_high_conf, [issue_col, context_col, conf_col]].reset_index(drop=True)

print(f"Displaying {len(filtered)} rows with columns: {filtered.columns.tolist()}")
# Show every row for this one display only; option_context restores the
# previous max_rows afterwards instead of changing it for the whole kernel.
with pd.option_context('display.max_rows', None):
    display(filtered)

# expose for further use
spelling_conf95 = filtered

Displaying 1395 rows with columns: ['Issue', 'Context', 'Confidence Score']


Unnamed: 0,Issue,Context,Confidence Score
0,Paul Gaugin,...riate way). #### Examples could be: - **P...,100
1,Womens,...elationships for young people.<br>Welsh **W...,100
2,tryptichs,...of stories from the Bible (sometimes as **t...,100
3,hand written,...ust be submitted digitally (they may be **h...,95
4,McDonalds,...s throughout the UK. Another example is **M...,100
5,McDonalds,...o behaving in a sustainable manner: • [**M...,95
6,McDonalds,...://www.youtube.com/watch?v=r-ctYi_eipk)[**M...,95
7,decisionmaking,...ectives and feelings of others in their **d...,95
8,McDonalds,"...t to others.<br>For example, discussing **M...",100
9,proof-reading,"... punctuation and grammar, as<br>well as **p...",95
