---
hide:
  - navigation
  - toc
---

In [None]:
# Shared setup
# NOTE(review): the cells below use `issues`, `files` and `pd` before any cell
# in this notebook defines them, so this script presumably provides them —
# confirm against _shared/setup_data_overview.py.
%run _shared/setup_data_overview.py

# Proofreader Data Overview

This overview counts all spelling, grammar, factual accuracy and consistency issues found across the entire corpus with a confidence score of 80 or higher. This gives us a total count of the issues that the LLM is most confident are real issues.

In [None]:
### Total Issues

In [None]:
from IPython.display import Markdown, display

# Keep only the four reported categories whose confidence score is 80 or above.
in_reported_category = issues['Issue Category'].isin(
    ['Spelling', 'Grammar', 'Factual', 'Consistency'])
is_high_confidence = issues['Confidence Score'] >= 80
filtered_issues = issues[in_reported_category & is_high_confidence]

# Tally the surviving issues per category (largest first) and lay the tallies
# out as a two-column table for display and for the chart in the next cell.
category_tally = filtered_issues['Issue Category'].value_counts()
issue_counts = category_tally.sort_values(ascending=False)
summary_table = pd.DataFrame({
    'Issue Category': issue_counts.index,
    'Count': issue_counts.values,
})

total_line = f"**Total issues (with confidence ≥ 80):** {len(filtered_issues)}"
display(summary_table)
display(Markdown(total_line))

In [None]:
# Create an interactive bar chart using Plotly
import plotly.express as px
from IPython.display import HTML, display

# Bar chart of the per-category counts; bars are coloured and labelled by count.
category_bar_options = dict(
    x='Issue Category',
    y='Count',
    title='Issue Count by Category (Confidence Score ≥ 80)',
    labels={'Issue Category': 'Category', 'Count': 'Number of Issues'},
    color='Count',
    color_continuous_scale='Viridis',
    text='Count',
)
fig = px.bar(summary_table, **category_bar_options)

# Cosmetic tweaks: value labels above the bars, slanted category ticks.
fig.update_traces(textposition='outside')
category_layout = {
    'height': 500,
    'showlegend': False,
    'xaxis_tickangle': -45,
    'hovermode': 'x unified',
}
fig.update_layout(**category_layout)

# Emit the figure as an HTML fragment so mkdocs-jupyter/MkDocs can render it.
category_chart_html = fig.to_html(full_html=False, include_plotlyjs='cdn')
display(HTML(category_chart_html))

## Issues Per Page Across the Corpus


In [None]:
from IPython.display import Markdown, display

# Corpus-wide ratio: filtered issue count divided by the summed 'Pages' column.
total_pages = files['Pages'].sum()
total_issues = len(filtered_issues)
avg_issues_per_page_overall = total_issues / total_pages

# Build the Markdown summary line by line; the trailing empty entry keeps the
# final newline of the rendered block.
overall_summary_lines = [
    "### Average Issues Per Page (Entire Corpus):",
    "",
    f"- **Total Issues**: {total_issues}",
    f"- **Total Pages**: {total_pages}",
    f"- **Average Issues Per Page**: {avg_issues_per_page_overall:.4f}",
    "",
]
display(Markdown("\n".join(overall_summary_lines)))

## Average Issues Per Page by Subject


In [None]:
from IPython.display import Markdown, display

# Issue tallies per subject, taken from the already-filtered issues.
subject_issues = (
    filtered_issues
    .groupby('Subject')
    .size()
    .reset_index(name='Issue Count')
)

# Page totals per subject. Hyphens in the files' subject names are turned into
# spaces first so the names can line up with the issues' subjects in the merge.
files_normalized = files.assign(
    Subject=files['Subject'].str.replace('-', ' '))
subject_pages = (
    files_normalized
    .groupby('Subject')['Pages']
    .sum()
    .reset_index(name='Total Pages')
)

# Left join keeps every subject that has issues; the ratio is rounded to two
# decimals and the table is ordered from highest to lowest ratio.
subject_merged = subject_issues.merge(subject_pages, on='Subject', how='left')
issues_per_page = subject_merged['Issue Count'] / subject_merged['Total Pages']
subject_stats = (
    subject_merged
    .assign(**{'Issues Per Page': issues_per_page.round(2)})
    .sort_values('Issues Per Page', ascending=False)
)

subject_columns = ['Subject', 'Issue Count', 'Total Pages', 'Issues Per Page']
display(subject_stats[subject_columns])

In [None]:
# Create a horizontal bar chart sorted by issues per page
# Reverse sort for chart display (ascending) so highest values appear at top
import plotly.express as px
from IPython.display import HTML, display

# Ascending order here puts the largest ratio at the top of the horizontal chart.
chart_data = subject_stats.sort_values('Issues Per Page', ascending=True)

subject_bar_options = dict(
    x='Issues Per Page',
    y='Subject',
    orientation='h',
    title='Average Issues Per Page by Subject (Highest to Lowest)',
    labels={'Issues Per Page': 'Issues Per Page', 'Subject': 'Subject'},
    color='Issues Per Page',
    color_continuous_scale='Viridis',
    text='Issues Per Page',
)
fig_subject = px.bar(chart_data, **subject_bar_options)

# Cosmetic tweaks: value labels beside the bars, taller canvas for many subjects.
fig_subject.update_traces(textposition='outside')
subject_layout = {
    'height': 700,
    'showlegend': False,
    'hovermode': 'y unified',
}
fig_subject.update_layout(**subject_layout)

# Emit the figure as an HTML fragment so mkdocs-jupyter/MkDocs can render it.
subject_chart_html = fig_subject.to_html(
    full_html=False, include_plotlyjs='cdn')
display(HTML(subject_chart_html))

## Sample Issues by Category


In [None]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

# Same category / confidence (>= 80) filter as the totals above, copied so the
# sample works on its own frame.
sample_categories = ['Spelling', 'Grammar', 'Factual', 'Consistency']
category_mask = issues['Issue Category'].isin(sample_categories)
confidence_mask = issues['Confidence Score'] >= 80
filtered_issues_sample = issues[category_mask & confidence_mask].copy()


def _pick_up_to_two(rows):
    """Return two seeded-random rows, or `rows` unchanged when it has fewer than two."""
    if len(rows) < 2:
        return rows
    return rows.sample(n=2, random_state=42)


# One (at most two-row) frame per category, in the fixed category order.
sample_issues = [
    _pick_up_to_two(
        filtered_issues_sample[filtered_issues_sample['Issue Category'] == name])
    for name in sample_categories
]

sample_df = pd.concat(sample_issues, ignore_index=True)


def _md_escape_table_cell(value: object) -> str:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "N/A"
    text = str(value)
    text = text.replace("|", "\\|")
    text = " ".join(text.splitlines()).strip()
    return text


# Render the sample as a Markdown display output (rather than stdout) so MkDocs
# can turn it into a real table.
table_columns = (
    'Subject',
    'Document Name',
    'Page Number',
    'Context',
    'Issue Category',
    'Confidence Score',
    'Reasoning',
)

md_lines: list[str] = [
    "## Two Random Issues from Each Category",
    "",
    "| Subject | Document | Pg | Context | Issue | Confidence | Reasoning      |",
    "|---------|----------|----|---------|-------|------------|----------------|",
]

# One table row per sampled issue; every cell is escaped before being joined.
md_lines.extend(
    "| "
    + " | ".join(_md_escape_table_cell(row.get(column))
                 for column in table_columns)
    + " |"
    for _, row in sample_df.iterrows()
)

md_lines += ["", f"Total issues displayed: {len(sample_df)}"]

display(Markdown("\n".join(md_lines)))