In [6]:
import fitz  # PyMuPDF
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import math
import os
from scipy import stats as scipy_stats

## Extract text from pdf
pdf_path = "Results.pdf"

# Opening the document as a context manager closes it even when extracting a
# page raises, which the previous open()/close() pair did not guarantee.
with fitz.open(pdf_path) as doc:
    # Every page's plain text is followed by a newline so that the last line of
    # one page never fuses with the first line of the next.
    text_content = "".join(page.get_text("text") + "\n" for page in doc)


## Parse into dataframe which is structured
# Spacing variants of the table header row as it appears in the extracted text.
header_patterns = [
    "Name Roll No Which slot are you attending for the examination ? (SELECT ONE)Marks Obtained",
    "Name Roll No Which slot are you attending for the examination ? (SELECT ONE) Marks Obtained",
    "Name Roll No Which slot are you attending for the examination? (SELECT ONE) Marks Obtained",
]

# Trim each extracted line, then keep it only when it is non-blank and is not
# one of the header rows listed above.
lines = []
for raw_line in text_content.splitlines():
    trimmed = raw_line.strip()
    if trimmed and trimmed not in header_patterns:
        lines.append(trimmed)

# Merge broken lines
# A record may be split over several extracted lines. Lines are accumulated in
# `buffer` until the accumulated text looks like a complete record, i.e. it
# contains a run of 4+ digits, a "September 24th/26th" slot, and ends with a
# 1-3 digit number or the word "absent"/"remote".
roll_no_re = re.compile(r"\b\d{4,}\b")
slot_re = re.compile(r"September\s+(24th|26th)\b")
record_end_re = re.compile(r"(absent|remote|\b\d{1,3}\b)\s*$", re.IGNORECASE)

merged = []
buffer = ""
for ln in lines:
    candidate = buffer + " " + ln if buffer else ln
    if roll_no_re.search(candidate) and slot_re.search(candidate) and record_end_re.search(candidate):
        merged.append(candidate)
        buffer = ""
    else:
        # No separate test of `ln` on its own is needed: `candidate` ends with
        # `ln` (after a space), so whenever `ln` alone satisfies the three
        # searches, `candidate` does as well and the branch above is taken.
        buffer = candidate

# Keep a trailing incomplete fragment only if it at least holds a 4+ digit run.
if buffer and roll_no_re.search(buffer):
    merged.append(buffer)

# Layout of one complete record: name, roll number (4+ digits), the exam slot
# ("September 24th" or "September 26th"), and finally either a 1-3 digit mark
# or the word "absent"/"remote". Matching is case-insensitive.
pattern = re.compile(
    r"^(?P<name>.+?)\s+(?P<roll>\d{4,})\s+September\s+(?P<day>24th|26th)\s+(?P<last>\d{1,3}|absent|remote)\s*$",
    re.IGNORECASE
)

records = []
for row_text in merged:
    match = pattern.search(row_text)
    if match is None:
        # Merged text that does not fit the record layout is skipped.
        continue

    slot_day = match.group("day").lower()
    outcome = match.group("last").lower()
    # A numeric last field is the mark; otherwise it is the absent/remote flag,
    # which becomes the status while the mark stays empty.
    has_mark = outcome.isdigit()
    records.append({
        "Name": match.group("name").strip(),
        "RollNo": match.group("roll").strip(),
        "Slot": f"September {slot_day}",
        "Marks": int(outcome) if has_mark else None,
        "Status": "present" if has_mark else outcome
    })


# One row per parsed record.
df = pd.DataFrame(records)

# "Attended" is True exactly for the rows whose status is "present".
df["Attended"] = df["Status"] == "present"

# Working copy restricted to attendees that also have a mark recorded.
attended_with_mark = df["Attended"] & df["Marks"].notna()
df_numeric = df.loc[attended_with_mark].copy()

# Do not save clean_results.csv as requested

## Summary Statistics and Plots
# describe() over all numeric marks, and again per exam slot.
numeric_marks = df_numeric["Marks"]
summary_overall = numeric_marks.describe()
summary_by_slot = df_numeric.groupby("Slot")["Marks"].describe()

# Print each table under its banner, overall first.
for banner, table in (
    ("\n=== Overall Summary ===", summary_overall),
    ("\n=== Summary by Slot ===", summary_by_slot),
):
    print(banner)
    print(table)

# Extended summary by slot (additional metrics) and export
os.makedirs("plots", exist_ok=True)


def _slot_summary(slot, group):
    """Build one extended-summary row (a dict) for a single exam slot.

    ``slot`` is the group key and ``group`` the matching rows of ``df``; the
    "Attended" and "Marks" columns are read. Attendance figures use every row
    of the group, while the distribution metrics use only the rows whose
    "Marks" value is not missing. Metrics that cannot be computed for the
    available number of marks are reported as NaN.
    """
    total = len(group)
    attended = int(group["Attended"].sum())

    marks = group["Marks"].dropna().astype(float)
    n_numeric = len(marks)

    # Defaults cover a slot without any numeric mark, so the row is still complete.
    mean = std = median = q1 = q3 = iqr = mn = mx = mad = np.nan
    skewness = kurt = np.nan
    outlier_count = 0

    if n_numeric > 0:
        mean = marks.mean()
        median = marks.median()
        q1 = marks.quantile(0.25)
        q3 = marks.quantile(0.75)
        iqr = q3 - q1
        mn = marks.min()
        mx = marks.max()
        # Median Absolute Deviation (MAD): median distance from the median.
        mad = np.median(np.abs(marks.values - median))
        # Outliers (IQR method): values beyond 1.5 * IQR outside the quartiles.
        lo = q1 - 1.5 * iqr
        hi = q3 + 1.5 * iqr
        outlier_count = ((marks < lo) | (marks > hi)).sum()

    # Sample-size thresholds below which these metrics are left as NaN.
    if n_numeric > 1:
        std = marks.std(ddof=1)
    if n_numeric > 2:
        skewness = marks.skew()
    if n_numeric > 3:
        kurt = marks.kurtosis()

    return {
        "Slot": slot,
        "Total": total,
        "Attended": attended,
        "Absent_Remote": total - attended,
        "Attendance_Rate": attended / total if total > 0 else np.nan,
        "N_numeric": n_numeric,
        "Mean": mean,
        "Std": std,
        "Median": median,
        "Q1": q1,
        "Q3": q3,
        "IQR": iqr,
        "Min": mn,
        "Max": mx,
        "Skewness": skewness,
        "Kurtosis": kurt,
        "MAD": mad,
        "N_missing_marks": int(group["Marks"].isna().sum()),
        "Outlier_count_IQR": int(outlier_count)
    }


summary_rows = []
for slot, group in df.groupby("Slot"):
    summary_rows.append(_slot_summary(slot, group))

summary_by_slot_extended = pd.DataFrame(summary_rows).sort_values("Slot")
summary_by_slot_extended.to_csv("summary_by_slot_extended.csv", index=False)

print("\nSaved extended summary by slot to summary_by_slot_extended.csv")



=== Overall Summary ===
count    144.000000
mean      22.673611
std        5.990470
min        5.000000
25%       18.750000
50%       23.000000
75%       27.000000
max       35.000000
Name: Marks, dtype: float64

=== Summary by Slot ===
                count       mean       std  min   25%   50%   75%   max
Slot                                                                   
September 24th   45.0  22.066667  6.311605  8.0  17.0  22.0  27.0  33.0
September 26th   99.0  22.949495  5.850822  5.0  19.0  23.0  27.0  35.0

Saved extended summary by slot to summary_by_slot_extended.csv
