# Stats Scratchpad

This notebook is where I explore the data before putting it into another notebook for public consumption.

In [21]:
# Shared setup: run the common setup script with IPython's %run magic.
# NOTE(review): the analysis cell below reads `issues_all` without defining it,
# so this script presumably creates that DataFrame -- confirm, and make sure this
# cell is executed first on a fresh kernel.
%run _shared/setup_data_overview.py

In [20]:
# Improved: Find and count 'leaner' or 'leaners' misspellings by Issue Category in issues_all
import pandas as pd

# Build a row mask: True where the trimmed 'Issue' text contains "leaner" or
# "leaners" anywhere (case-insensitive); missing values count as non-matches.
leaner_mask = (
    issues_all['Issue']
    .str.strip()
    .str.contains(r'leaners?', case=False, na=False)
)
df_leaners_all = issues_all.loc[leaner_mask]

# Report how many rows matched and preview the first ten of them.
print('Total rows with "leaner" or "leaners":', df_leaners_all.shape[0])
print('Sample matches:')
display(df_leaners_all.loc[:, ['Issue', 'Issue Category']].head(10))

# Tally each distinct (Issue, Issue Category) pair, most frequent first.
pair_sizes = df_leaners_all.groupby(['Issue', 'Issue Category']).size()
leaner_counts_all = pair_sizes.reset_index(name='Count').sort_values('Count', ascending=False)

display(leaner_counts_all)
# Print the grand total of the grouped counts for comparison with the row count above.
print('Sum of grouped counts:', leaner_counts_all['Count'].sum())

# Additional: Count the number of times "leaner" or "leaners" appears in each column of issues_all
# `re` is imported before the helper that references it, so the function is
# usable as soon as it is defined (the original imported it after the def).
import re


def count_leaners_in_column(col):
    """Count occurrences of "leaner"/"leaners" in one column of ``issues_all``.

    Parameters
    ----------
    col : label
        Name of the column in the notebook-level ``issues_all`` frame to scan.

    Returns
    -------
    The per-row match counts of the case-insensitive pattern ``leaners?``,
    summed over the whole column.
    """
    # Values are cast to str so that non-text columns can be searched too.
    # NOTE(review): the pattern has no word boundaries, so it also matches
    # inside longer words such as "cleaner"; use r'\bleaners?\b' if only the
    # standalone misspelling should be counted -- confirm intent.
    return issues_all[col].astype(str).str.count(r'leaners?', flags=re.IGNORECASE).sum()


# Print one total per column of issues_all.
columns_to_check = issues_all.columns
for col in columns_to_check:
    count = count_leaners_in_column(col)
    print(f'Occurrences in column "{col}":', count)

Total rows with "leaner" or "leaners": 20
Sample matches:


Unnamed: 0,Issue,Issue Category
9352,leaners',Spelling
12606,Leaners,Spelling
12635,Leaners,Spelling
12693,leaners,Spelling
12717,Leaners,Spelling
12755,Leaners,Spelling
12804,leaners,Spelling
12820,Leaners,Spelling
12858,Leaners,Spelling
14158,Leaners,Spelling


Unnamed: 0,Issue,Issue Category,Count
0,Leaners,Spelling,13
1,leaners,Spelling,6
2,leaners',Spelling,1


Sum of grouped counts: 20
Occurrences in column "Subject": 0
Occurrences in column "File Name": 0
Occurrences in column "Document Name": 0
Occurrences in column "Issue ID": 0
Occurrences in column "Page Number": 0
Occurrences in column "Issue": 20
Occurrences in column "Context": 24
Occurrences in column "Pass Code": 0
Occurrences in column "Issue Category": 0
Occurrences in column "Confidence Score": 0
Occurrences in column "Reasoning": 15
