In [38]:
# Shared setup: execute the common setup script; `%run` makes the names it
# defines available to the cells below.
# NOTE(review): presumably this is what defines `issues`, `files` and `pd`,
# which later cells use without defining them first — confirm against the script.
%run _shared/setup_data_overview.py

In [42]:
from IPython.display import HTML, display

# Keep every issue whose 'Reasoning' text mentions "comma", ignoring case.
# Rows with a missing 'Reasoning' count as non-matches (na=False).
regex = "comma"
mentions_comma = issues["Reasoning"].str.contains(regex, case=False, na=False)
issues_with_comma = issues[mentions_comma]

# Centred headline block reporting how many issues matched.
comma_count = len(issues_with_comma)
commas_text = (
    '\n<div style="text-align:center">\n'
    "  <h2>They can't use commas</h2>\n"
    f"  <p><em>They got that wrong <strong>{comma_count}</strong> times</em></p>\n"
    "</div>\n"
)

display(HTML(commas_text))

In [45]:
from IPython.display import HTML, display

# Select the issues whose 'Reasoning' text mentions "apostrophe"
# (case-insensitive); a missing 'Reasoning' is treated as no match.
regex = "apostrophe"
mentions_apostrophe = issues["Reasoning"].str.contains(regex, case=False, na=False)
issues_with_apostrophes = issues.loc[mentions_apostrophe]

# Centred headline block showing the number of matching issues.
apostrophe_count = len(issues_with_apostrophes)
apostrophes_text = "\n".join([
    "",
    '<div style="text-align:center">',
    "  <h2>They aren't much better with apostrophes</h2>",
    f"  <p><em>They got that wrong <strong>{apostrophe_count}</strong> times</em></p>",
    "</div>",
    "",
])

display(HTML(apostrophes_text))

In [40]:
# Imported here so the cell does not depend on another cell having been run first.
from IPython.display import Markdown, display

# Number of 'Spelling' issues recorded under the 'LTC' pass code
language_tool_spelling_mistakes = issues.query(
    "`Issue Category` == 'Spelling' and `Pass Code` == 'LTC'"
).shape[0]

# Number of 'Grammar' issues recorded under the 'LTC' pass code
language_tool_grammar_issues = issues.query(
    "`Issue Category` == 'Grammar' and `Pass Code` == 'LTC'"
).shape[0]

# Count issues excluding False Positives and Parsing Errors
total_issues = issues[~issues['Issue Category'].isin(['False Positive', 'Parsing Error'])].shape[0]



# Define the Key Findings text as an f-string variable so it can be reused elsewhere
key_findings_text = f"""# Key Findings

The WJEC are not performing any meaningful quality assurance on their work.

Before we get into the nitty-gritty of the **{total_issues}** issues discovered by the entire process, let's take a look at the issues that would have been highlighted by Microsoft Word's spell-checker.

I found **{language_tool_spelling_mistakes}** spelling mistakes, and **{language_tool_grammar_issues}** grammatical errors.

These are issues that would have been *highlighted* with red and blue squiggles in 
Word. 

Somehow, 4-5 people, all educated to post-graduate level, managed to miss them.

Some subjects are better than others when it comes to spotting squiggly lines in a Word document:
"""

display(Markdown(key_findings_text))

# Key Findings

The WJEC are not performing any meaningful quality assurance on their work.

Before we get into the nitty-gritty of the **15942** issues discovered by the entire process, let's take a look at the issues that would have been highlighted by Microsoft Word's spell-checker.

I found **329** spelling mistakes, and **351** grammatical errors.

These are issues that would have been *highlighted* with red and blue squiggles in 
Word. 

Somehow, 4-5 people, all educated to post-graduate level, managed to miss them.

Some subjects are better than others when it comes to spotting squiggly lines in a Word document:


In [41]:
from IPython.display import HTML, display
import plotly.express as px

# Spelling and Grammar issues recorded under Pass Code 'LTC'. The .copy() keeps
# the Subject rewrite below from touching `issues`.
ltc_spelling_grammar = issues.query("`Issue Category` in ['Spelling','Grammar'] and `Pass Code` == 'LTC'").copy()
# Normalise subject names (hyphens -> spaces) so they match the normalised
# `files` subjects used in the merge further down
ltc_spelling_grammar['Subject'] = ltc_spelling_grammar['Subject'].str.replace('-', ' ')

# Count errors per document
doc_counts = (
    ltc_spelling_grammar.groupby(['Subject', 'Document Name'])
    .size()
    .reset_index(name='Errors')
)

# Average errors per document by subject.
# Only documents with at least one matching issue row appear in `doc_counts`,
# so documents with zero such issues are not part of this mean.
avg_errors = (
    doc_counts.groupby('Subject')['Errors']
    .mean()
    .reset_index(name='Errors Per Document')
)

# Ensure all subjects in `files` are present (fill missing with 0)
subjects_all = files['Subject'].str.replace('-', ' ').unique()
avg_errors = (
    pd.DataFrame({'Subject': subjects_all})
    .merge(avg_errors, on='Subject', how='left')
    .fillna({'Errors Per Document': 0})
)

avg_errors['Errors Per Document'] = avg_errors['Errors Per Document'].round(2)
# Ascending sort: in a horizontal bar chart the last row is drawn at the top
chart_data = avg_errors.sort_values('Errors Per Document', ascending=True)

fig = px.bar(
    chart_data,
    x='Errors Per Document',
    y='Subject',
    orientation='h',
    title='Average number of red and blue squiggly lines missed per document, by subject',
    labels={'Errors Per Document': 'Errors Per Document', 'Subject': 'Subject'},
    color='Errors Per Document',
    # Custom sequential scale: green (low) -> yellow (mid) -> red (high)
    color_continuous_scale=['#2ca02c', '#ffffbf', '#d62728'],
    text='Errors Per Document'
)

# Update layout for consistent appearance with other charts
fig.update_traces(textposition='outside', marker_showscale=False)
fig.update_layout(
    height=600,
    showlegend=False,
    hovermode='y unified',
    coloraxis_showscale=False
)

# Emit as HTML so mkdocs-jupyter/MkDocs can render it
# NOTE(review): the saved output of this cell is a FileNotFoundError for plotly's
# bundled plotly.min.js — presumably an incomplete plotly install in that
# environment; TODO confirm and re-run so the chart is actually rendered.
display(HTML(fig.to_html(full_html=False, include_plotlyjs='cdn')))


FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/WjecDocumentScraper/.venv/lib/python3.12/site-packages/plotly/package_data/plotly.min.js'

In [None]:
### Total Issues

In [None]:
from IPython.display import Markdown, display

# Restrict to the four reported categories and to rows whose confidence score
# is at least 80.
in_reported_category = issues['Issue Category'].isin(
    ['Spelling', 'Grammar', 'Factual', 'Consistency'])
is_confident = issues['Confidence Score'] >= 80
filtered_issues = issues[in_reported_category & is_confident]

# Per-category tallies, largest first, laid out as a two-column table.
issue_counts = (
    filtered_issues['Issue Category']
    .value_counts()
    .sort_values(ascending=False)
)
summary_table = (
    issue_counts
    .rename_axis('Issue Category')
    .reset_index(name='Count')
)

display(summary_table)

total_line = f"**Total issues (with confidence ≥ 80):** {len(filtered_issues)}"
display(Markdown(total_line))

Unnamed: 0,Issue Category,Count
0,Grammar,6757
1,Consistency,2152
2,Spelling,1613
3,Factual,306


**Total issues (with confidence ≥ 80):** 10828

In [None]:
# Interactive Plotly bar chart of the per-category counts in `summary_table`
import plotly.express as px
from IPython.display import HTML, display

# One vertical bar per category; height, colour and label all come from 'Count'.
fig = px.bar(
    data_frame=summary_table,
    x='Issue Category',
    y='Count',
    text='Count',
    color='Count',
    color_continuous_scale='Viridis',
    labels={'Count': 'Number of Issues', 'Issue Category': 'Category'},
    title='Issue Count by Category (Confidence Score ≥ 80)',
)

# Presentation tweaks: fixed height, slanted tick labels, labels above the bars.
fig.update_layout(
    hovermode='x unified',
    xaxis_tickangle=-45,
    showlegend=False,
    height=500,
)
fig.update_traces(textposition='outside')

# Rendered as an HTML fragment so mkdocs-jupyter/MkDocs can display it
category_chart_html = fig.to_html(full_html=False, include_plotlyjs='cdn')
display(HTML(category_chart_html))

### Average Issues Per Page (Entire Corpus):

In [None]:
from IPython.display import Markdown, display

# Corpus-wide rate: number of rows in `filtered_issues` divided by the page
# count summed over every row of `files`.
# NOTE: this rebinds `total_issues`, which the Key Findings cell also assigns
# using a different filter.
total_issues = len(filtered_issues)
total_pages = files['Pages'].sum()
avg_issues_per_page_overall = total_issues / total_pages

# Three-item bullet list; the empty first and last entries reproduce the
# leading and trailing newline of the rendered block.
overall_summary = "\n".join([
    "",
    f"- **Total Issues**: {total_issues}",
    f"- **Total Pages**: {total_pages}",
    f"- **Average Issues Per Page**: {avg_issues_per_page_overall:.4f}",
    "",
])
display(Markdown(overall_summary))


- **Total Issues**: 10828
- **Total Pages**: 7776
- **Average Issues Per Page**: 1.3925


## Average Issues Per Page by Subject


In [None]:
from IPython.display import Markdown, display

# Issue count per subject. Hyphens in subject names are replaced with spaces on
# this side as well, so the merge keys below agree with the normalised `files`
# subjects. .assign() works on a copy, so `filtered_issues` is left unchanged.
subject_issues = (
    filtered_issues
    .assign(Subject=filtered_issues['Subject'].str.replace('-', ' '))
    .groupby('Subject')
    .size()
    .reset_index(name='Issue Count')
)

# Normalize subject names in files (replace hyphens with spaces)
files_normalized = files.copy()
files_normalized['Subject'] = files_normalized['Subject'].str.replace('-', ' ')

# Total page count per subject
subject_pages = files_normalized.groupby(
    'Subject')['Pages'].sum().reset_index(name='Total Pages')

# Left merge keeps every subject that has issues; a subject with no matching
# row in `files` ends up with NaN for 'Total Pages' and 'Issues Per Page'.
subject_stats = subject_issues.merge(subject_pages, on='Subject', how='left')
subject_stats['Issues Per Page'] = (
    subject_stats['Issue Count'] / subject_stats['Total Pages']).round(2)
subject_stats = subject_stats.sort_values('Issues Per Page', ascending=False)

display(subject_stats[['Subject', 'Issue Count',
        'Total Pages', 'Issues Per Page']])

Unnamed: 0,Subject,Issue Count,Total Pages,Issues Per Page
3,Dance,153,34,4.5
4,Design and Technology,211,53,3.98
2,Computer Science,1243,344,3.61
6,Digital Technology,213,62,3.44
1,Business,1339,422,3.17
5,Digital Media and Film,88,39,2.26
0,Art and Design,453,202,2.24
7,Drama,1210,539,2.24
15,Integrated Science Single Award,67,53,1.26
20,Religious Studies,938,788,1.19


In [None]:
# Horizontal bar chart of issues per page, one bar per subject
import plotly.express as px
from IPython.display import HTML, display

# Ascending order puts the largest value last, which a horizontal bar chart
# draws at the top.
chart_data = subject_stats.sort_values(by='Issues Per Page')

fig_subject = px.bar(
    data_frame=chart_data,
    x='Issues Per Page',
    y='Subject',
    orientation='h',
    text='Issues Per Page',
    color='Issues Per Page',
    color_continuous_scale='Viridis',
    labels={'Subject': 'Subject', 'Issues Per Page': 'Issues Per Page'},
    title='Average Issues Per Page by Subject (Highest to Lowest)',
)

# Presentation tweaks: taller canvas, no legend, value labels outside the bars.
fig_subject.update_layout(
    hovermode='y unified',
    showlegend=False,
    height=700,
)
fig_subject.update_traces(textposition='outside')

# Rendered as an HTML fragment so mkdocs-jupyter/MkDocs can display it
subject_chart_html = fig_subject.to_html(full_html=False, include_plotlyjs='cdn')
display(HTML(subject_chart_html))

## Sample Issues by Category


In [None]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

# Independent copy of the rows in the four reported categories whose
# confidence score is at least 80.
category_mask = issues['Issue Category'].isin(
    ['Spelling', 'Grammar', 'Factual', 'Consistency'])
confidence_mask = issues['Confidence Score'] >= 80
filtered_issues_sample = issues[category_mask & confidence_mask].copy()

# Up to two rows per category; the fixed seed keeps the selection repeatable.
sample_issues = []
for category in ['Spelling', 'Grammar', 'Factual', 'Consistency']:
    category_issues = filtered_issues_sample.loc[
        filtered_issues_sample['Issue Category'] == category
    ]
    # Categories with fewer than two rows are taken whole, without sampling.
    sampled = category_issues
    if len(category_issues) >= 2:
        sampled = category_issues.sample(n=2, random_state=42)
    sample_issues.append(sampled)

# Stack the per-category picks into one frame with a fresh 0..n-1 index.
sample_df = pd.concat(sample_issues, ignore_index=True)


def _md_escape_table_cell(value: object) -> str:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "N/A"
    text = str(value)
    text = text.replace("|", "\\|")
    text = " ".join(text.splitlines()).strip()
    return text


# Build the table as Markdown text and display it as a Markdown output (rather
# than printing to stdout) so MkDocs can render it
md_lines: list[str] = [
    "## Two Random Issues from Each Category",
    "",
    "| Subject | Document | Pg | Context | Issue | Confidence | Reasoning      |",
    "|---------|----------|----|---------|-------|------------|----------------|",
]

# Source columns of `sample_df`, listed in the order of the header above.
table_columns = (
    'Subject',
    'Document Name',
    'Page Number',
    'Context',
    'Issue Category',
    'Confidence Score',
    'Reasoning',
)

for _, row in sample_df.iterrows():
    # row.get() yields None for an absent column; the escaper shows that as "N/A".
    cells = [_md_escape_table_cell(row.get(column)) for column in table_columns]
    md_lines.append("| " + " | ".join(cells) + " |")

md_lines.extend(["", f"Total issues displayed: {len(sample_df)}"])

display(Markdown("\n".join(md_lines)))

## Two Random Issues from Each Category

| Subject | Document | Pg | Context | Issue | Confidence | Reasoning      |
|---------|----------|----|---------|-------|------------|----------------|
| Computer Science | Unit 1 Understanding Computer Science | 27 | ...wing:<br>2<br>2<br>•<br>Error: total is **iteger**<br>•<br>total is integer<br>Change: \|  ... | Spelling | 100 | "iteger" is a misspelling of the word "integer" as confirmed by the provided correction in the context. |
| Mathematics and Numeracy | January 2025 | 1 | ...hese events will be led by WJEC Subject **Advisors** and our expert presenting teams. \| \| Ex... | Spelling | 90 | The British English spelling for a person who gives advice is 'advisers', not 'advisors', as confirmed by OED and Collins. |
| Religious Studies | Unit 4 Non Exam Assessment Pack | 17 | Markers are highly experienced subject **specialists they** may have read the source you are using or even marked the work you have copied from! | Grammar | 98 | This is a comma splice, joining two independent clauses without appropriate punctuation or a conjunction. |
| Computer Science | Guidance For Teaching Unit 1 | 124 | which shares a common communication path. **Ranges from**<br>5km to 50km. | Grammar | 90 | This is a sentence fragment; it should be integrated into the preceding sentence (e.g., 'and ranges from...') or rephrased as a complete sentence. |
| Geography | Guidance For Teaching Unit 3 | 29 | Low pressure events could be tropical storms, hurricanes, cyclones, **monsoon**. | Factual | 95 | A monsoon is a seasonal wind pattern, not an extreme low-pressure weather event in the same category as tropical storms, hurricanes, or cyclones. |
| Food and Nutrition | Guidance For Teaching Unit 2 | 30 | **A liquid which is dispersed into a solid.** | Factual | 98 | A gel is a colloid where a liquid is dispersed *within* a solid medium, not a liquid dispersed *into* a solid. |
| History | Guidance For Teaching Unit 1 Option 1.5 | 21 | Book(s): Royle, **E and Lockyer**, R. 1997. *Chartism* , Chapters 1 | Consistency | 85 | For consistency with other author initial formatting (e.g., 'Williams, D.'), the initial 'E' should be followed by a full stop. |
| History | Guidance For Teaching Unit 3 Option 3.6 | 9 | or How did forced relocations affect<br>**Native communities**? | Consistency | 85 | The document capitalises 'Indigenous Peoples' and 'Native Americans'. For consistency when referring to indigenous groups, 'Native communities' should also be capitalised. |

Total issues displayed: 8