In [2]:
import nfl_data_py as nfl
import pandas as pd
import os
import traceback

base_dir = r"C:\Repos\NFL_Draft\data"

def save_parquet(df, folder_name, file_name):
    """Write ``df`` to ``<base_dir>/<folder_name>/<file_name>.parquet``.

    Before writing, object columns that contain at least one digit-only string
    are converted to a numeric dtype, but only when every non-null value in the
    column converts cleanly. Columns that mix numeric and non-numeric text are
    left untouched so that no value is silently replaced by NaN.

    The caller's DataFrame is never modified; the conversion happens on a
    shallow copy.

    Args:
        df: DataFrame to persist.
        folder_name: Sub-folder of ``base_dir`` (created if missing).
        file_name: File stem; ``.parquet`` is appended.

    Returns:
        None. Any exception is caught, reported on stdout together with its
        traceback, and not re-raised.
    """
    try:
        # Create folder if needed
        folder_path = os.path.join(base_dir, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Shallow copy: replacing a column below must not leak into the caller's frame
        out = df.copy(deep=False)

        # Coerce object-type numeric columns to avoid Arrow errors
        for col in out.columns:
            series = out[col]
            if series.dtype != "object":
                continue

            # isinstance guard instead of the .str accessor, which raises on
            # object columns holding only non-string values (ints, dates, ...)
            has_digit_text = series.map(
                lambda value: isinstance(value, str) and value.isnumeric()
            ).any()
            if not has_digit_text:
                continue

            converted = pd.to_numeric(series, errors="coerce")
            # Only accept the conversion when it is lossless: no previously
            # non-null value may have been coerced to NaN
            if converted.notna().sum() == series.notna().sum():
                out[col] = converted

        file_path = os.path.join(folder_path, f"{file_name}.parquet")
        out.to_parquet(file_path, index=False)
        print(f"✅ Saved: {file_name} ({out.shape[0]:,} rows, {out.shape[1]} columns)")

    except Exception as e:
        print(f"❌ Failed to save {file_name}: {e}")
        traceback.print_exc()

# Define range of seasons: 1999 through 2024 inclusive (range end is exclusive)
years = list(range(1999, 2025))

# ----------- Main Extraction -----------
# Each dataset is fetched and saved inside its own try/except so that a failure
# for one dataset is printed and the remaining datasets are still attempted.
# The fetched frame stays bound to a module-level name after the block runs.

# Weekly stats for the selected seasons
try:
    weekly_stats = nfl.import_weekly_data(years)
    save_parquet(weekly_stats, "weekly_data", "weekly_stats")
except Exception as e:
    print("❌ Error downloading weekly_stats:", e)

# Seasonal stats for the selected seasons
try:
    seasonal_stats = nfl.import_seasonal_data(years)
    save_parquet(seasonal_stats, "seasonal_data", "seasonal_stats")
except Exception as e:
    print("❌ Error downloading seasonal_stats:", e)

# Seasonal rosters for the selected seasons
try:
    seasonal_rosters = nfl.import_seasonal_rosters(years)
    save_parquet(seasonal_rosters, "seasonal_rosters", "seasonal_rosters")
except Exception as e:
    print("❌ Error downloading seasonal_rosters:", e)

# Combine data (called without a season filter)
try:
    combine_data = nfl.import_combine_data()
    save_parquet(combine_data, "combine_data", "combine_data")
except Exception as e:
    print("❌ Error downloading combine_data:", e)

# Draft picks for the selected seasons
try:
    draft_picks = nfl.import_draft_picks(years)
    save_parquet(draft_picks, "draft_picks", "draft_picks")
except Exception as e:
    print("❌ Error downloading draft_picks:", e)

# Team descriptions (called without arguments)
try:
    team_info = nfl.import_team_desc()
    save_parquet(team_info, "team_info", "team_info")
except Exception as e:
    print("❌ Error downloading team_info:", e)

# Draft value charts (called without arguments)
try:
    draft_values = nfl.import_draft_values()
    save_parquet(draft_values, "draft_values", "draft_values")
except Exception as e:
    print("❌ Error downloading draft_values:", e)

# Player ID mappings (called without arguments)
try:
    id_mappings = nfl.import_ids()
    save_parquet(id_mappings, "id_mappings", "id_mappings")
except Exception as e:
    print("❌ Error downloading id_mappings:", e)


Downcasting floats.
✅ Saved: weekly_stats (134,470 rows, 53 columns)
✅ Saved: seasonal_stats (15,102 rows, 58 columns)
✅ Saved: seasonal_rosters (63,323 rows, 37 columns)
✅ Saved: combine_data (8,649 rows, 18 columns)
✅ Saved: draft_picks (6,640 rows, 36 columns)
✅ Saved: team_info (36 rows, 16 columns)
✅ Saved: draft_values (262 rows, 6 columns)
✅ Saved: id_mappings (12,023 rows, 35 columns)


In [4]:
import os
import pandas as pd

base_dir = r"C:\Repos\NFL_Draft\data"

def summarize_parquet(path):
    """Profile every column of the Parquet file at ``path``.

    Returns a DataFrame with one row per column of the file and the fields
    ``table`` (file name without ``.parquet``), ``column``, ``dtype``,
    ``nullable`` (any missing value), ``null_pct`` (fraction missing, 0-1),
    ``unique_vals`` (distinct non-null values) and ``example`` (first
    non-null value, or None when the column is entirely missing).
    """
    frame = pd.read_parquet(path)

    dtype_names, has_missing, missing_share, distinct_counts, samples = [], [], [], [], []
    for name in frame.columns:
        series = frame[name]
        missing = series.isnull()
        present = series.dropna()

        dtype_names.append(str(series.dtype))
        has_missing.append(missing.any())
        missing_share.append(missing.mean())
        distinct_counts.append(series.nunique(dropna=True))
        samples.append(None if present.empty else present.iloc[0])

    summary = pd.DataFrame({
        "column": frame.columns,
        "dtype": dtype_names,
        "nullable": has_missing,
        "null_pct": missing_share,
        "unique_vals": distinct_counts,
        "example": samples,
    })

    # Label every row with the source file's stem
    table_label = os.path.basename(path).replace(".parquet", "")
    summary.insert(0, "table", table_label)
    return summary

# Walk the data directory and gather the path of every .parquet file
parquet_files = []
for root, dirs, files in os.walk(base_dir):
    parquet_files.extend(
        os.path.join(root, file) for file in files if file.endswith(".parquet")
    )

# With nothing to profile just warn; otherwise stack the per-file summaries
# into one table and write it next to the data
if not parquet_files:
    print("⚠️ No .parquet files found.")
else:
    per_file_summaries = [summarize_parquet(f) for f in parquet_files]
    summaries = pd.concat(per_file_summaries, ignore_index=True)
    summaries.to_csv(os.path.join(base_dir, "parquet_summary.csv"), index=False)
    print("✅ Summary saved to parquet_summary.csv")


✅ Summary saved to parquet_summary.csv


In [None]:
# --- Add project root to sys.path if running from subfolder ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import COMBINE_DIR


# --- Cleaning Function ---
def clean_combine_data(df):
    """Return a cleaned copy of the combine table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``draft_year``, ``draft_round``, ``draft_ovr`` and ``jersey_number``
        are coerced to numeric (unparseable values become NaN);
      * ``season`` is coerced to the nullable ``Int64`` dtype;
      * ``draft_team`` and ``position`` are converted to stripped strings,
        while missing values stay missing.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder strings
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Coerce numeric columns
    for col in ["draft_year", "draft_round", "draft_ovr", "jersey_number"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Season to nullable integer
    if "season" in df.columns:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")

    # Clean categorical text columns
    for col in ["draft_team", "position"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values into the literal
            # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in COMBINE_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
combine_path = os.path.join(COMBINE_DIR, "combine_data.parquet")
combine_cleaned_path = os.path.join(COMBINE_DIR, "combine_data_cleaned.parquet")

# Read the raw table, run it through clean_combine_data, write the result
df_combine = pd.read_parquet(combine_path)
df_combine_cleaned = clean_combine_data(df_combine)
df_combine_cleaned.to_parquet(combine_cleaned_path, index=False)

print("✅ combine_data cleaned and saved.")

✅ combine_data cleaned and saved.


In [11]:
# --- Add project root to sys.path if running from misc/ ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import DRAFT_DIR


# --- Cleaning Function ---
def clean_draft_picks(df):
    """Return a cleaned copy of the draft-picks table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``draft_round``, ``draft_pick``, ``overall_pick`` and ``season`` are
        coerced to the nullable ``Int64`` dtype (unparseable values become
        missing);
      * ``player_name``, ``position``, ``team`` and ``college`` are converted
        to stripped strings, while missing values stay missing.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder strings
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Coerce integer-like floats to Int64
    for col in ["draft_round", "draft_pick", "overall_pick", "season"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    # Strip text fields
    for col in ["player_name", "position", "team", "college"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values into the literal
            # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in DRAFT_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
draft_path = os.path.join(DRAFT_DIR, "draft_picks.parquet")
draft_cleaned_path = os.path.join(DRAFT_DIR, "draft_picks_cleaned.parquet")

# Read the raw table, run it through clean_draft_picks, write the result
df_draft = pd.read_parquet(draft_path)
df_draft_cleaned = clean_draft_picks(df_draft)
df_draft_cleaned.to_parquet(draft_cleaned_path, index=False)

print("✅ draft_picks cleaned and saved.")


✅ draft_picks cleaned and saved.


In [12]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import DRAFT_VALUE_DIR


# --- Cleaning Function ---
def clean_draft_values(df):
    """Return a cleaned copy of the draft-value table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``pick`` is coerced to the nullable ``Int64`` dtype;
      * ``chart`` is converted to stripped strings, while missing values stay
        missing.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Clean placeholder values
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Cast pick to Int64 if it's a float with integer values
    if "pick" in df.columns:
        df["pick"] = pd.to_numeric(df["pick"], errors="coerce").astype("Int64")

    # Ensure chart column is clean string
    if "chart" in df.columns:
        text = df["chart"]
        # A bare astype(str) would turn missing values into the literal
        # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
        df["chart"] = text.where(text.isna(), text.astype(str).str.strip())

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in DRAFT_VALUE_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
values_path = os.path.join(DRAFT_VALUE_DIR, "draft_values.parquet")
values_cleaned_path = os.path.join(DRAFT_VALUE_DIR, "draft_values_cleaned.parquet")

# Read the raw table, run it through clean_draft_values, write the result
df_values = pd.read_parquet(values_path)
df_values_cleaned = clean_draft_values(df_values)
df_values_cleaned.to_parquet(values_cleaned_path, index=False)

print("✅ draft_values cleaned and saved.")


✅ draft_values cleaned and saved.


In [13]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import ID_MAP_DIR


# --- Cleaning Function ---
def clean_id_mappings(df):
    """Return a cleaned copy of the ID-mapping table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * every object-dtype column is converted to stripped strings, while
        missing values stay missing.
    """
    df = df.copy()

    # Standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder values
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Strip all string/object columns
    for col in df.select_dtypes(include="object").columns:
        text = df[col]
        # A bare astype(str) would turn missing IDs into the literal strings
        # "None"/"nan"/"<NA>"; only rewrite the non-missing entries
        df[col] = text.where(text.isna(), text.astype(str).str.strip())

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in ID_MAP_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
id_path = os.path.join(ID_MAP_DIR, "id_mappings.parquet")
id_cleaned_path = os.path.join(ID_MAP_DIR, "id_mappings_cleaned.parquet")

# Read the raw table, run it through clean_id_mappings, write the result
df_ids = pd.read_parquet(id_path)
df_ids_cleaned = clean_id_mappings(df_ids)
df_ids_cleaned.to_parquet(id_cleaned_path, index=False)

print("✅ id_mappings cleaned and saved.")


✅ id_mappings cleaned and saved.


In [14]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import SEASONAL_STATS_DIR


# --- Cleaning Function ---
def clean_seasonal_data(df):
    """Return a cleaned copy of the seasonal-stats table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``float64`` columns whose non-null values are all whole numbers are
        cast to the nullable ``Int64`` dtype;
      * ``player_id``, ``player_name``, ``team`` and ``position`` are converted
        to stripped strings, while missing values stay missing;
      * ``season`` is coerced to ``Int64``.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder values
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Convert float-to-int where appropriate
    for col in df.select_dtypes(include="float64").columns:
        present = df[col].dropna()
        # Vectorised whole-number test; inf % 1 is NaN, so infinities fail it
        if (present % 1 == 0).all():
            df[col] = df[col].astype("Int64")

    # Clean string columns
    for col in ["player_id", "player_name", "team", "position"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values into the literal
            # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    # Coerce season to Int64
    if "season" in df.columns:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in SEASONAL_STATS_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
seasonal_path = os.path.join(SEASONAL_STATS_DIR, "seasonal_stats.parquet")
seasonal_cleaned_path = os.path.join(SEASONAL_STATS_DIR, "seasonal_stats_cleaned.parquet")

# Read the raw table, run it through clean_seasonal_data, write the result
df_seasonal = pd.read_parquet(seasonal_path)
df_seasonal_cleaned = clean_seasonal_data(df_seasonal)
df_seasonal_cleaned.to_parquet(seasonal_cleaned_path, index=False)

print("✅ seasonal_stats cleaned and saved.")


✅ seasonal_stats cleaned and saved.


In [15]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import ROSTERS_DIR


# --- Cleaning Function ---
def clean_seasonal_rosters(df):
    """Return a cleaned copy of the seasonal-rosters table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``jersey_number``, ``height`` and ``weight`` are coerced to numeric
        (unparseable values become NaN);
      * ``birth_date`` is parsed to datetime (unparseable values become NaT);
      * ``player_name``, ``position``, ``team`` and ``status`` are converted to
        stripped strings, while missing values stay missing;
      * ``season`` is coerced to the nullable ``Int64`` dtype.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder strings
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Coerce numeric columns
    for col in ["jersey_number", "height", "weight"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Parse birth_date safely
    if "birth_date" in df.columns:
        df["birth_date"] = pd.to_datetime(df["birth_date"], errors="coerce")

    # Clean categorical/text fields
    for col in ["player_name", "position", "team", "status"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values into the literal
            # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    # Coerce season
    if "season" in df.columns:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in ROSTERS_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
rosters_path = os.path.join(ROSTERS_DIR, "seasonal_rosters.parquet")
rosters_cleaned_path = os.path.join(ROSTERS_DIR, "seasonal_rosters_cleaned.parquet")

# Read the raw table, run it through clean_seasonal_rosters, write the result
df_rosters = pd.read_parquet(rosters_path)
df_rosters_cleaned = clean_seasonal_rosters(df_rosters)
df_rosters_cleaned.to_parquet(rosters_cleaned_path, index=False)

print("✅ seasonal_rosters cleaned and saved.")


✅ seasonal_rosters cleaned and saved.


In [16]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import TEAM_DIR


# --- Cleaning Function ---
def clean_team_info(df):
    """Return a cleaned copy of the team-info table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * every object-dtype column is converted to stripped strings, while
        missing values stay missing;
      * ``season`` and ``team_id`` (when present) are coerced to the nullable
        ``Int64`` dtype, with unparseable values becoming missing.
    """
    df = df.copy()

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder strings
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Clean text fields
    for col in df.select_dtypes(include="object").columns:
        text = df[col]
        # A bare astype(str) would turn missing values into the literal
        # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
        df[col] = text.where(text.isna(), text.astype(str).str.strip())

    # Coerce numeric columns if applicable
    for col in ["season", "team_id"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in TEAM_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
team_path = os.path.join(TEAM_DIR, "team_info.parquet")
team_cleaned_path = os.path.join(TEAM_DIR, "team_info_cleaned.parquet")

# Read the raw table, run it through clean_team_info, write the result
df_team = pd.read_parquet(team_path)
df_team_cleaned = clean_team_info(df_team)
df_team_cleaned.to_parquet(team_cleaned_path, index=False)

print("✅ team_info cleaned and saved.")


✅ team_info cleaned and saved.


In [17]:
# --- Add project root if needed ---
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from config import WEEKLY_STATS_DIR


# --- Cleaning Function ---
def clean_weekly_data(df):
    """Return a cleaned copy of the weekly-stats table; the input is not modified.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A" and "" become missing values;
      * ``float64`` columns whose non-null values are all whole numbers are
        cast to the nullable ``Int64`` dtype;
      * ``season`` and ``week`` are coerced to ``Int64``;
      * ``player_id``, ``player_name``, ``team``, ``opponent`` and ``position``
        are converted to stripped strings, while missing values stay missing.

    Columns that are absent from ``df`` are skipped.
    """
    df = df.copy()

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

    # Replace placeholder strings
    df = df.replace(["--", "N/A", ""], pd.NA)

    # Coerce float-to-int where appropriate
    for col in df.select_dtypes(include="float64").columns:
        present = df[col].dropna()
        # Vectorised whole-number test; inf % 1 is NaN, so infinities fail it
        if (present % 1 == 0).all():
            df[col] = df[col].astype("Int64")

    # Ensure season/week are integers
    for col in ["season", "week"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    # Clean text columns
    for col in ["player_id", "player_name", "team", "opponent", "position"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values into the literal
            # strings "None"/"nan"/"<NA>"; only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    return df


# --- Load, Clean, Save ---
# Raw input and cleaned output both live in WEEKLY_STATS_DIR (imported from config);
# the cleaned table goes to a separate file, so the raw file is not overwritten.
weekly_path = os.path.join(WEEKLY_STATS_DIR, "weekly_stats.parquet")
weekly_cleaned_path = os.path.join(WEEKLY_STATS_DIR, "weekly_stats_cleaned.parquet")

# Read the raw table, run it through clean_weekly_data, write the result
df_weekly = pd.read_parquet(weekly_path)
df_weekly_cleaned = clean_weekly_data(df_weekly)
df_weekly_cleaned.to_parquet(weekly_cleaned_path, index=False)

print("✅ weekly_stats cleaned and saved.")


✅ weekly_stats cleaned and saved.


In [18]:
import os
import pandas as pd
import numpy as np

# Replace this with your actual import if using config.py
from config import PBP_DIR

# Step 1: Gather the raw per-season .parquet files.
# The cleaned outputs (pbp_<year>_cleaned.parquet, pbp_all_cleaned.parquet) are
# written into this same directory, so they must be excluded here; otherwise a
# re-run would treat them as inputs, duplicate rows in the combined file, and
# fail when parsing "all" as a season year.
pbp_files = sorted(
    os.path.join(PBP_DIR, f)
    for f in os.listdir(PBP_DIR)
    if f.endswith(".parquet") and not f.endswith("_cleaned.parquet")
)

# Step 2: Build a unified column set.
# Names are normalised the same way the cleaner normalises them (strip,
# lower-case, spaces -> underscores) so the union matches the cleaned frames.
column_union = set()
for path in pbp_files:
    df = pd.read_parquet(path)
    column_union.update(df.columns.str.strip().str.lower().str.replace(" ", "_"))
column_union = sorted(column_union)

# Step 3: Define cleaner
def clean_pbp(df):
    """Return a cleaned copy of one play-by-play frame, aligned to ``column_union``.

    Steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * the placeholder strings "--", "N/A", "" and "NaN" become missing values;
      * every column named in the module-level ``column_union`` that is absent
        from the frame is added, filled with ``pd.NA``;
      * ``yards_gained``, ``epa``, ``down``, ``quarter`` and
        ``game_seconds_remaining`` are coerced to numeric;
      * ``game_id``, ``play_id``, ``posteam``, ``defteam`` and ``play_type``
        are converted to stripped strings, while missing values stay missing.

    Returns:
        The frame restricted to, and ordered by, ``column_union``.
    """
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    df = df.replace(["--", "N/A", "", "NaN"], pd.NA)

    # Pad with all-missing columns so every season shares the same schema
    for col in column_union:
        if col not in df.columns:
            df[col] = pd.NA

    numeric_cols = ["yards_gained", "epa", "down", "quarter", "game_seconds_remaining"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    for col in ["game_id", "play_id", "posteam", "defteam", "play_type"]:
        if col in df.columns:
            text = df[col]
            # A bare astype(str) would turn missing values (including the
            # padding added above) into the literal strings "None"/"nan"/"<NA>";
            # only rewrite the non-missing entries
            df[col] = text.where(text.isna(), text.astype(str).str.strip())

    return df[column_union]

# Step 4: Clean and save each season
# Every cleaned frame is also kept in memory so it can be concatenated in Step 5.
cleaned_dfs = []
for path in pbp_files:
    # File names are expected to look like "pbp_<year>.parquet": take the token
    # after the first underscore and drop the extension.
    season_year = os.path.basename(path).split("_")[1].split(".")[0]
    print(f"🧹 Cleaning {season_year}...")

    df_raw = pd.read_parquet(path)
    df_cleaned = clean_pbp(df_raw)
    # Season is set from the file name; int() raises if the token is not numeric
    df_cleaned["season"] = int(season_year)

    # The cleaned file is written next to the raw one, in PBP_DIR
    cleaned_path = os.path.join(PBP_DIR, f"pbp_{season_year}_cleaned.parquet")
    df_cleaned.to_parquet(cleaned_path, index=False)
    cleaned_dfs.append(df_cleaned)

# Step 5: Save full combined file (optional)
df_all = pd.concat(cleaned_dfs, ignore_index=True)
df_all.to_parquet(os.path.join(PBP_DIR, "pbp_all_cleaned.parquet"), index=False)

print("✅ All seasons cleaned and saved.")


🧹 Cleaning 1999...
🧹 Cleaning 2000...
🧹 Cleaning 2001...
🧹 Cleaning 2002...
🧹 Cleaning 2003...
🧹 Cleaning 2004...
🧹 Cleaning 2005...
🧹 Cleaning 2006...
🧹 Cleaning 2007...
🧹 Cleaning 2008...
🧹 Cleaning 2009...
🧹 Cleaning 2010...
🧹 Cleaning 2011...
🧹 Cleaning 2012...
🧹 Cleaning 2013...
🧹 Cleaning 2014...
🧹 Cleaning 2015...
🧹 Cleaning 2016...
🧹 Cleaning 2017...
🧹 Cleaning 2018...
🧹 Cleaning 2019...
🧹 Cleaning 2020...
🧹 Cleaning 2021...
🧹 Cleaning 2022...
🧹 Cleaning 2023...
🧹 Cleaning 2024...
✅ All seasons cleaned and saved.


In [21]:
import os
import pandas as pd
from config import (
    COMBINE_DIR, DRAFT_DIR, DRAFT_VALUE_DIR, ID_MAP_DIR,
    SEASONAL_STATS_DIR, ROSTERS_DIR, TEAM_DIR, WEEKLY_STATS_DIR, PBP_DIR, DATA_DIR
)

# Cleaned Parquet outputs to profile, keyed by the table name used in the report
cleaned_files = {
    "combine_data": os.path.join(COMBINE_DIR, "combine_data_cleaned.parquet"),
    "draft_picks": os.path.join(DRAFT_DIR, "draft_picks_cleaned.parquet"),
    "draft_values": os.path.join(DRAFT_VALUE_DIR, "draft_values_cleaned.parquet"),
    "id_mappings": os.path.join(ID_MAP_DIR, "id_mappings_cleaned.parquet"),
    "seasonal_stats": os.path.join(SEASONAL_STATS_DIR, "seasonal_stats_cleaned.parquet"),
    "seasonal_rosters": os.path.join(ROSTERS_DIR, "seasonal_rosters_cleaned.parquet"),
    "team_info": os.path.join(TEAM_DIR, "team_info_cleaned.parquet"),
    "weekly_stats": os.path.join(WEEKLY_STATS_DIR, "weekly_stats_cleaned.parquet"),
    "pbp_all": os.path.join(PBP_DIR, "pbp_all_cleaned.parquet")
}

# One record per (table, column) is collected here
results = []

for name, path in cleaned_files.items():
    # Missing files are reported and skipped
    if not os.path.exists(path):
        print(f"⚠️ File not found: {name}")
        continue

    df = pd.read_parquet(path)
    row_count, col_count = df.shape

    for col in df.columns:
        series = df[col]
        is_unique = series.is_unique
        all_nonnull = series.notnull().all()

        # Longest string representation, measured for object columns only
        max_len = None
        if series.dtype == "object":
            max_len = series.astype(str).str.len().max()

        results.append({
            "table": name,
            "column": col,
            "dtype": str(series.dtype),
            "null_pct": round(series.isnull().mean(), 3),
            "max_text_len": max_len,
            "is_unique": is_unique,
            "all_nonnull": all_nonnull,
            # A primary-key candidate is a column that is both unique and fully populated
            "pk_candidate": is_unique and all_nonnull,
            "row_count": row_count,
            "col_count": col_count
        })

# Write the collected records to a CSV in DATA_DIR
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(DATA_DIR, "sql_compatibility_summary.csv"), index=False)

print("✅ SQL compatibility summary saved to: data/sql_compatibility_summary.csv")




✅ SQL compatibility summary saved to: data/sql_compatibility_summary.csv


In [22]:
import os

# Print the full path of every .parquet file found under the data directory
root = r"C:\Repos\NFL_Draft\data"
for dirpath, _, filenames in os.walk(root):
    parquet_names = [name for name in filenames if name.endswith(".parquet")]
    for name in parquet_names:
        print(os.path.join(dirpath, name))


C:\Repos\NFL_Draft\data\combine_data\combine_data.parquet
C:\Repos\NFL_Draft\data\combine_data\combine_data_cleaned.parquet
C:\Repos\NFL_Draft\data\draft_picks\draft_picks.parquet
C:\Repos\NFL_Draft\data\draft_picks\draft_picks_cleaned.parquet
C:\Repos\NFL_Draft\data\draft_values\draft_values.parquet
C:\Repos\NFL_Draft\data\draft_values\draft_values_cleaned.parquet
C:\Repos\NFL_Draft\data\id_mappings\id_mappings.parquet
C:\Repos\NFL_Draft\data\id_mappings\id_mappings_cleaned.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_1999.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_1999_cleaned.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2000.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2000_cleaned.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2001.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2001_cleaned.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2002.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pbp_2002_cleaned.parquet
C:\Repos\NFL_Draft\data\pbp_by_season\pb

In [40]:
import os
import pandas as pd
from sqlalchemy import create_engine

# ---------------------------
# 1. SQL Server Connection
# ---------------------------
# SQLAlchemy URL for the NFL_Analytics database on the RAMSEY_BOLTON\SQLEXPRESS
# instance, using pyodbc with "ODBC Driver 17 for SQL Server". No user/password
# is given; trusted_connection=yes is passed instead.
conn_str = (
    "mssql+pyodbc://@RAMSEY_BOLTON\\SQLEXPRESS/NFL_Analytics"
    "?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
)
# fast_executemany=True is forwarded to the pyodbc dialect
# NOTE(review): presumably enabled to speed up the bulk inserts below — confirm
engine = create_engine(conn_str, fast_executemany=True)

# ---------------------------
# 2. Cleaning Helpers
# ---------------------------
def truncate_string_columns(df, max_len=510):
    """Truncate the values of every object-dtype column to ``max_len`` characters.

    Non-missing values are converted to ``str`` and sliced to their first
    ``max_len`` characters. Missing values are left missing rather than being
    converted to the literal strings "None"/"nan"/"<NA>", so they can still be
    stored as NULL downstream.

    ``df`` is modified in place and also returned.
    """
    for col in df.select_dtypes(include='object'):
        text = df[col]
        # Keep nulls as nulls; only non-missing entries are stringified and cut
        df[col] = text.where(text.isna(), text.astype(str).str.slice(0, max_len))
    return df

def clean_and_round_float_columns(df, decimal_places=6):
    """Round every numeric column of ``df`` to ``decimal_places`` decimals.

    Despite the name, all columns selected by ``include='number'`` are
    processed, not only floats. Each one is passed through ``pd.to_numeric``
    with ``errors='coerce'`` and then rounded. ``df`` is modified in place and
    also returned.
    """
    for name in df.select_dtypes(include='number').columns:
        as_numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = as_numeric.round(decimal_places)
    return df

def coerce_nulls(df):
    """Return a copy of ``df`` in which every missing cell is replaced with ``None``."""
    keep = df.notna()
    return df.where(keep, None)

def clean_seasonal_stats(df):
    """Prepare the seasonal-stats frame for insertion into SQL.

    Applies string truncation and 6-decimal rounding, then blanks out (sets to
    missing, does not clamp) every numeric value above 1e5 or below -1e5, and
    finally passes the frame through ``coerce_nulls``.
    """
    df = clean_and_round_float_columns(truncate_string_columns(df), decimal_places=6)

    # Out-of-range magnitudes are nulled to avoid SQL precision/scale issues
    for name in df.select_dtypes(include='number').columns:
        too_large = df[name] > 1e5
        too_small = df[name] < -1e5
        df[name] = df[name].mask(too_large | too_small, None)

    return coerce_nulls(df)

# ---------------------------
# 3. File Paths
# ---------------------------
# Maps each destination SQL table name to the cleaned Parquet file it is loaded
# from. Dict order is the load order.
parquet_paths = {
    "combine_data": r"C:\Repos\NFL_Draft\data\combine_data\combine_data_cleaned.parquet",
    "draft_picks": r"C:\Repos\NFL_Draft\data\draft_picks\draft_picks_cleaned.parquet",
    "draft_values": r"C:\Repos\NFL_Draft\data\draft_values\draft_values_cleaned.parquet",
    "id_mappings": r"C:\Repos\NFL_Draft\data\id_mappings\id_mappings_cleaned.parquet",
    "seasonal_stats": r"C:\Repos\NFL_Draft\data\seasonal_data\seasonal_stats_cleaned.parquet",
    "seasonal_rosters": r"C:\Repos\NFL_Draft\data\seasonal_rosters\seasonal_rosters_cleaned.parquet",
    "team_info": r"C:\Repos\NFL_Draft\data\team_info\team_info_cleaned.parquet",
    "weekly_stats": r"C:\Repos\NFL_Draft\data\weekly_data\weekly_stats_cleaned.parquet"
}

# Tables that are inserted in chunks; all other tables are written in one call
batch_tables = {"seasonal_stats", "seasonal_rosters", "weekly_stats"}

# ---------------------------
# 4. Load Logic
# ---------------------------
def load_parquet_to_sql(table_name, file_path, default_chunksize=10000, if_exists="append"):
    """Load one cleaned Parquet file into the SQL table ``table_name``.

    The frame is prepared with ``clean_seasonal_stats`` for the
    ``seasonal_stats`` table and with the generic truncate / round /
    null-coercion helpers for every other table, then written through
    ``DataFrame.to_sql`` on the module-level ``engine``.

    Args:
        table_name: Destination table.
        file_path: Path of the Parquet file to read.
        default_chunksize: Rows per insert batch for tables listed in
            ``batch_tables``; other tables are written in a single call.
        if_exists: Forwarded to ``DataFrame.to_sql``. The default "append"
            keeps the original behaviour, which means re-running the loader
            against an already populated table inserts every row again. Pass
            "replace" or empty the table first when reloading.

    Returns:
        None. Failures are caught and reported on stdout, not raised.
    """
    print(f"🔄 Loading {table_name}...")

    try:
        df = pd.read_parquet(file_path)

        if table_name == "seasonal_stats":
            df = clean_seasonal_stats(df)
        else:
            df = truncate_string_columns(df)
            df = clean_and_round_float_columns(df)
            df = coerce_nulls(df)

        # Only the large tables are inserted in batches
        chunksize = default_chunksize if table_name in batch_tables else None

        df.to_sql(
            table_name,
            con=engine,
            if_exists=if_exists,
            index=False,
            chunksize=chunksize
        )

        print(f"✅ {table_name}: {len(df):,} rows inserted.")

    except Exception as e:
        print(f"❌ Failed to insert {table_name} → {e}")

# ---------------------------
# 5. Run Loader
# ---------------------------
# Load every table listed in parquet_paths, in dict order. Each call reports
# its own success or failure, so one failing table does not stop the rest.
if __name__ == "__main__":
    for table, path in parquet_paths.items():
        load_parquet_to_sql(table, path)




🔄 Loading combine_data...
✅ combine_data: 8,649 rows inserted.
🔄 Loading draft_picks...
✅ draft_picks: 6,640 rows inserted.
🔄 Loading draft_values...
✅ draft_values: 262 rows inserted.
🔄 Loading id_mappings...
✅ id_mappings: 12,023 rows inserted.
🔄 Loading seasonal_stats...
✅ seasonal_stats: 15,102 rows inserted.
🔄 Loading seasonal_rosters...
✅ seasonal_rosters: 63,323 rows inserted.
🔄 Loading team_info...
✅ team_info: 36 rows inserted.
🔄 Loading weekly_stats...
✅ weekly_stats: 134,470 rows inserted.


In [45]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from IPython.display import display

# --- Connection ---
# SQLAlchemy URL for the NFL_Analytics database on the RAMSEY_BOLTON\SQLEXPRESS
# instance, using pyodbc with "ODBC Driver 17 for SQL Server" and
# trusted_connection=yes (no user/password in the URL).
conn_str = (
    "mssql+pyodbc://@RAMSEY_BOLTON\\SQLEXPRESS/NFL_Analytics"
    "?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
)
engine = create_engine(conn_str)
# Inspector is used below to look up which tables and columns exist
inspector = inspect(engine)

# --- Initialize ---
# Accumulates one dict per verification check; turned into a DataFrame at the end
results = []

def log(check_type, table, column, passed, detail=""):
    """Append the outcome of one verification check to the module-level ``results`` list.

    Args:
        check_type: Label of the check (stored under "Check").
        table: Table the check ran against.
        column: Column the check ran against ("" for table-level checks).
        passed: Whether the check succeeded.
        detail: Optional free-text explanation; defaults to "".
    """
    record = dict(
        Check=check_type,
        Table=table,
        Column=column,
        Passed=passed,
        Detail=detail,
    )
    results.append(record)

# --- A. Table + Column Existence ---
# Table name -> columns that must be present. An empty list means only the
# table's existence is checked.
expected_columns = {
    "combine_data": [],
    "draft_picks": [],
    "draft_values": [],
    "id_mappings": ["player_id"],
    "team_info": ["team_id", "team_name"],
    "seasonal_stats": ["player_id", "season"],
    "weekly_stats": ["player_id", "season"],
    "seasonal_rosters": ["player_id", "team_id"]
}

for table, cols in expected_columns.items():
    # A missing table is logged once and its column checks are skipped
    if not inspector.has_table(table):
        log("Table exists", table, "", False, "Missing")
        continue
    log("Table exists", table, "", True, "✓ Found")
    actual_cols = [col["name"] for col in inspector.get_columns(table)]
    for col in cols:
        # Passed is True only when the expected column name is present
        log("Column exists", table, col, col in actual_cols)

# --- B. PK Uniqueness (Assumed) ---
# Table name -> column assumed to act as its primary key
pk_checks = {
    "id_mappings": "player_id",
    "team_info": "team_id"
}

for table, col in pk_checks.items():
    # Tables that do not exist are skipped silently here
    if inspector.has_table(table):
        try:
            # Counts the distinct key values that occur more than once
            query = f"SELECT COUNT(*) AS dupes FROM (SELECT {col} FROM {table} GROUP BY {col} HAVING COUNT(*) > 1) AS sub"
            dupes = pd.read_sql(query, engine).iloc[0]["dupes"]
            log("PK uniqueness", table, col, dupes == 0, f"{dupes} duplicates" if dupes else "✓ Unique")
        except Exception as e:
            # A query failure (for example a missing column) is logged as a failed check
            log("PK uniqueness", table, col, False, f"Error: {e}")

# --- C. Foreign Key Relationships ---
# (source table, shared column, target table): every non-NULL value of the
# column in the source table is expected to exist in the target table.
fk_checks = [
    ("seasonal_stats", "player_id", "id_mappings"),
    ("seasonal_rosters", "player_id", "id_mappings"),
    ("weekly_stats", "player_id", "id_mappings"),
    ("seasonal_rosters", "team_id", "team_info"),
    ("weekly_stats", "team_id", "team_info"),
]

for src, col, tgt in fk_checks:
    try:
        # NOT EXISTS instead of NOT IN: with NOT IN, a single NULL in the
        # target column makes the predicate UNKNOWN for every source row, so
        # the count would be 0 and the check would pass falsely.
        query = f"""
        SELECT COUNT(*) AS missing
        FROM {src} AS s
        WHERE s.{col} IS NOT NULL
        AND NOT EXISTS (SELECT 1 FROM {tgt} AS t WHERE t.{col} = s.{col})
        """
        missing = pd.read_sql(query, engine).iloc[0]["missing"]
        log("FK integrity", src, col, missing == 0, f"{missing} unmatched" if missing else "✓ OK")
    except Exception as e:
        # A query failure (for example a missing column) is logged as a failed check
        log("FK integrity", src, col, False, f"Error: {e}")

# --- D. Null Checks ---
# Table name -> columns that must not contain NULL
required_fields = {
    "id_mappings": ["player_id"],
    "team_info": ["team_id", "team_name"],
    "seasonal_stats": ["player_id", "season"],
    "weekly_stats": ["player_id", "season"]
}

for table, cols in required_fields.items():
    for col in cols:
        try:
            # Count the rows where the required column is NULL
            nulls = pd.read_sql(f"SELECT COUNT(*) AS n FROM {table} WHERE {col} IS NULL", engine).iloc[0]["n"]
            log("NULL check", table, col, nulls == 0, f"{nulls} NULL(s)" if nulls else "✓ OK")
        except Exception as e:
            # A query failure (for example a missing column) is logged as a failed check
            log("NULL check", table, col, False, f"Error: {e}")

# --- Export + Display ---
# Turn the collected check records into a table, write it to CSV, and show it
df_result = pd.DataFrame(results)
csv_path = r"C:\Repos\NFL_Draft\data\database_verification_summary.csv"
df_result.to_csv(csv_path, index=False)
display(df_result)
print(f"\n✅ Saved verification summary to:\n{csv_path}")




Unnamed: 0,Check,Table,Column,Passed,Detail
0,Table exists,combine_data,,True,✓ Found
1,Table exists,draft_picks,,True,✓ Found
2,Table exists,draft_values,,True,✓ Found
3,Table exists,id_mappings,,True,✓ Found
4,Column exists,id_mappings,player_id,False,
5,Table exists,team_info,,True,✓ Found
6,Column exists,team_info,team_id,True,
7,Column exists,team_info,team_name,True,
8,Table exists,seasonal_stats,,True,✓ Found
9,Column exists,seasonal_stats,player_id,True,



✅ Saved verification summary to:
C:\Repos\NFL_Draft\data\database_verification_summary.csv
