In [35]:
# Shared setup
%run _shared/setup_data_overview.py

# Proofreader Data Overview

## Significant issues count across the whole dataset

This count of issues shows a count of all spelling, grammar, factual accuracy and consistency issues found across the entire corpus with a confidence score of 0.8 or higher. This gives us a total count of the issues that the LLM is most confident are real issues.

In [36]:
# Filter issues by category and confidence score >= 80
filtered_issues = issues[
    (issues['Issue Category'].isin(['Spelling', 'Grammar', 'Factual', 'Consistency'])) &
    (issues['Confidence Score'] >= 80)
]

# Count issues by category
issue_counts = filtered_issues['Issue Category'].value_counts().sort_values(ascending=False)
summary_table = pd.DataFrame({
    'Issue Category': issue_counts.index,
    'Count': issue_counts.values
})

print(summary_table)
print(f"\nTotal issues (with confidence >= 80): {len(filtered_issues)}")

  Issue Category  Count
0        Grammar   6757
1    Consistency   2152
2       Spelling   1613
3        Factual    306

Total issues (with confidence >= 80): 10828


In [37]:
# Create an interactive bar chart using Plotly
import plotly.express as px

fig = px.bar(
    summary_table,
    x='Issue Category',
    y='Count',
    title='Issue Count by Category (Confidence Score ≥ 80)',
    labels={'Issue Category': 'Category', 'Count': 'Number of Issues'},
    color='Count',
    color_continuous_scale='Viridis',
    text='Count'
)

# Update layout for better appearance
fig.update_traces(textposition='outside')
fig.update_layout(
    height=500,
    showlegend=False,
    xaxis_tickangle=-45,
    hovermode='x unified'
)

fig.show()

In [38]:
# Check available columns in issues dataframe
print("Available columns in issues dataframe:")
print(issues.columns.tolist())
print("\nFirst few rows:")
print(issues.head(2))


Available columns in issues dataframe:
['Subject', 'File Name', 'Document Name', 'Issue ID', 'Page Number', 'Issue', 'Context', 'Pass Code', 'Issue Category', 'Confidence Score', 'Reasoning']

First few rows:
          Subject                                 File Name   Document Name  \
0  Art and Design  gcse-art-and-design---delivery-guide.csv  Delivery Guide   
1  Art and Design  gcse-art-and-design---delivery-guide.csv  Delivery Guide   

   Issue ID  Page Number Issue  \
0         0           11   and   
1         1           14     (   

                                             Context Pass Code Issue Category  \
0  ...pected to demonstrate in their responses** ...       LTC      Stylistic   
1  ...---------| | First assessment for Unit 1**(...       LTC  Parsing Error   

   Confidence Score                                          Reasoning  
0                80  The comma before 'and' joining two independent...  
1                95  A space is missing between 'Unit 1' and

## Issues Per Page by Document

In [39]:
# Calculate average issues per page across entire corpus
total_issues = len(filtered_issues)
total_pages = files['Pages'].sum()
avg_issues_per_page_overall = total_issues / total_pages

print("Average Issues Per Page (Entire Corpus):")
print(f"Total Issues: {total_issues}")
print(f"Total Pages: {total_pages}")
print(f"Average Issues Per Page: {avg_issues_per_page_overall:.4f}")
print()

Average Issues Per Page (Entire Corpus):
Total Issues: 10828
Total Pages: 7776
Average Issues Per Page: 1.3925



## Average Issues Per Page by Subject

In [40]:
# Calculate average issues per page by subject
subject_issues = filtered_issues.groupby('Subject').size().reset_index(name='Issue Count')

# Normalize subject names in files (replace hyphens with spaces)
files_normalized = files.copy()
files_normalized['Subject'] = files_normalized['Subject'].str.replace('-', ' ')

subject_pages = files_normalized.groupby('Subject')['Pages'].sum().reset_index(name='Total Pages')

subject_stats = subject_issues.merge(subject_pages, on='Subject', how='left')
subject_stats['Issues Per Page'] = (subject_stats['Issue Count'] / subject_stats['Total Pages']).round(2)
subject_stats = subject_stats.sort_values('Issues Per Page', ascending=False)

print("Average Issues Per Page by Subject (Highest to Lowest):")
print(subject_stats[['Subject', 'Issue Count', 'Total Pages', 'Issues Per Page']])
print()

Average Issues Per Page by Subject (Highest to Lowest):
                                 Subject  Issue Count  Total Pages  \
3                                  Dance          153           34   
4                  Design and Technology          211           53   
2                       Computer Science         1243          344   
6                     Digital Technology          213           62   
1                               Business         1339          422   
5                 Digital Media and Film           88           39   
0                         Art and Design          453          202   
7                                  Drama         1210          539   
15       Integrated Science Single Award           67           53   
20                     Religious Studies          938          788   
9                     Food and Nutrition          545          484   
13  Health and Social Care and Childcare           50           45   
11                             Geo

In [41]:
# Create a horizontal bar chart sorted by issues per page
# Reverse sort for chart display (ascending) so highest values appear at top
chart_data = subject_stats.sort_values('Issues Per Page', ascending=True)

fig_subject = px.bar(
    chart_data,
    x='Issues Per Page',
    y='Subject',
    orientation='h',
    title='Average Issues Per Page by Subject (Highest to Lowest)',
    labels={'Issues Per Page': 'Issues Per Page', 'Subject': 'Subject'},
    color='Issues Per Page',
    color_continuous_scale='Viridis',
    text='Issues Per Page'
)

# Update layout for better appearance
fig_subject.update_traces(textposition='outside')
fig_subject.update_layout(
    height=700,
    showlegend=False,
    hovermode='y unified'
)

fig_subject.show()

## Sample Issues by Category

In [42]:
import numpy as np
import pandas as pd

# Filter issues by category and confidence score >= 80
filtered_issues_sample = issues[
    (issues['Issue Category'].isin(['Spelling', 'Grammar', 'Factual', 'Consistency'])) &
    (issues['Confidence Score'] >= 80)
].copy()

# Sample 2 issues from each category
sample_issues = []
for category in ['Spelling', 'Grammar', 'Factual', 'Consistency']:
    category_issues = filtered_issues_sample[filtered_issues_sample['Issue Category'] == category]
    if len(category_issues) >= 2:
        sampled = category_issues.sample(n=min(2, len(category_issues)), random_state=42)
    else:
        sampled = category_issues
    sample_issues.append(sampled)

sample_df = pd.concat(sample_issues, ignore_index=True)

# Create a formatted markdown table
print("## Two Random Issues from Each Category\n")
print("| Subject | Document Name | Page Number | Context | Issue Category | Confidence Score | Reasoning |")
print("|---------|---------------|------------|---------|----------------|------------------|-----------|")

for idx, row in sample_df.iterrows():
    # Format context as markdown - wrap in backticks
    context = f"`{row['Context']}`" if pd.notna(row['Context']) else "N/A"
    
    # Clean up text for table display
    subject = str(row['Subject']).replace('|', '\\|') if pd.notna(row['Subject']) else "N/A"
    doc_name = str(row['Document Name']).replace('|', '\\|') if pd.notna(row['Document Name']) else "N/A"
    page_num = str(row['Page Number']) if pd.notna(row['Page Number']) else "N/A"
    category = str(row['Issue Category']).replace('|', '\\|') if pd.notna(row['Issue Category']) else "N/A"
    conf_score = str(row['Confidence Score']) if pd.notna(row['Confidence Score']) else "N/A"
    reasoning = str(row['Reasoning']).replace('|', '\\|') if pd.notna(row['Reasoning']) else "N/A"
    
    print(f"| {subject} | {doc_name} | {page_num} | {context} | {category} | {conf_score} | {reasoning} |")

print(f"\nTotal issues displayed: {len(sample_df)}")


## Two Random Issues from Each Category

| Subject | Document Name | Page Number | Context | Issue Category | Confidence Score | Reasoning |
|---------|---------------|------------|---------|----------------|------------------|-----------|
| Computer Science | Unit 1 Understanding Computer Science | 27 | `...wing:<br>2<br>2<br>•<br>Error: total is **iteger**<br>•<br>total is integer<br>Change: |  ...` | Spelling | 100 | "iteger" is a misspelling of the word "integer" as confirmed by the provided correction in the context. |
| Mathematics and Numeracy | January 2025 | 1 | `...hese events will be led by WJEC Subject **Advisors** and our expert presenting teams. | | Ex...` | Spelling | 90 | The British English spelling for a person who gives advice is 'advisers', not 'advisors', as confirmed by OED and Collins. |
| Religious Studies | Unit 4 Non Exam Assessment Pack | 17 | `Markers are highly experienced subject **specialists they** may have read the source you are using or even marked th