In [24]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import pyodbc

# Set Export Directory
export_dir = r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer"
os.makedirs(export_dir, exist_ok=True)

# Define a dictionary to log changes
change_logs = {}

# Function to log changes
def log_change(dataset_name, description):
    if dataset_name not in change_logs:
        change_logs[dataset_name] = []
    change_logs[dataset_name].append(f"{datetime.now()} - {description}")

# Function to handle duplicate rows
def handle_duplicate_rows(df, dataset_name):
    problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
    if problematic_rows.any():
        count = problematic_rows.sum()
        log_change(dataset_name, f"Detected {count} rows with unhashable types. Dropping these rows.")
        df = df[~problematic_rows]

    duplicate_count = df.duplicated().sum()
    log_change(dataset_name, f"Detected {duplicate_count} duplicate rows.")
    df = df.drop_duplicates()
    log_change(dataset_name, "Removed duplicate rows.")
    return df

# Function to handle duplicate columns
def handle_duplicate_columns(df, dataset_name):
    df = df.loc[:, ~df.columns.duplicated()]
    log_change(dataset_name, "Removed duplicate columns.")
    return df

# Function to drop unwanted columns
def drop_unwanted_columns(df, unwanted_columns, dataset_name):
    df = df.drop(columns=[col for col in unwanted_columns if col in df.columns], errors="ignore")
    log_change(dataset_name, f"Dropped unwanted columns: {unwanted_columns}")
    return df

# Clean dataset function
def clean_dataset(df, dataset_name):
    # Identify and drop unwanted columns
    unwanted_columns = ["extra_time", "penalties", "home_penalties", "away_penalties", "height_ft", "height_in", "weight_lb"]
    unwanted_columns += [col for col in df.columns if "outlier" in col]
    df = drop_unwanted_columns(df, unwanted_columns, dataset_name)

    # Replace NaN in numeric columns with None (SQL NULL)
    numeric_cols = df.select_dtypes(include=["float64"]).columns
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)

    # Ensure all remaining columns are valid
    df = validate_and_cast_data(df)

    return df

# Function to validate and cast data for SQL upload
def validate_and_cast_data(df):
    for col in df.columns:
        if df[col].dtype == "float64":
            df[col] = df[col].apply(lambda x: round(x, 6) if pd.notnull(x) and np.isfinite(x) else None)
        elif df[col].dtype == "object":
            df[col] = df[col].apply(lambda x: str(x) if isinstance(x, (str, int, float)) else None)
    return df

# Function to upload datasets to SQL Server
def upload_to_sql(df, table_name, server, database):
    connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    cursor.execute(f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}")
    conn.commit()
    create_query = f"CREATE TABLE {table_name} ({', '.join([f'[{col}] NVARCHAR(MAX)' for col in df.columns])})"
    cursor.execute(create_query)
    conn.commit()

    for idx, row in df.iterrows():
        placeholders = ", ".join(["?" for _ in row])
        insert_query = f"INSERT INTO {table_name} VALUES ({placeholders})"
        try:
            cursor.execute(insert_query, *row)
        except Exception as e:
            print(f"Error inserting row {idx}: {row.to_dict()}")
            print(f"Error: {e}")
            continue
    conn.commit()
    conn.close()
    print(f"Data uploaded to SQL Server table: {table_name}")

# === Fetch All Datasets ===
dataset_paths = {
    "team_names": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_team_names.csv",
    "player_names": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_names.csv",
    "match_data": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_match_data.csv",
    "player_stats": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_stats.csv",
    "player_salaries": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_salaries.csv",
}

processed_datasets = {}
for dataset_name, path in dataset_paths.items():
    print(f"\nProcessing {dataset_name}...")
    df = pd.read_csv(path)
    df = handle_duplicate_rows(df, dataset_name)
    df = handle_duplicate_columns(df, dataset_name)
    df = clean_dataset(df, dataset_name)
    processed_datasets[dataset_name] = df

# === Save and Upload Datasets ===
for dataset_name, df in processed_datasets.items():
    table_name = f"AU_{dataset_name}"
    upload_to_sql(df, table_name, "RAMSEY_BOLTON\\SQLEXPRESS", "SoccerProjects")

# === Process and Upload Change Logs ===
print("\nProcessing and Uploading Change Logs...")

change_logs_list = []
for dataset_name, logs in change_logs.items():
    for log in logs:
        truncated_log = log[:4000] if len(log) > 4000 else log
        change_logs_list.append({"Dataset": dataset_name, "LogEntry": truncated_log})

change_logs_df = pd.DataFrame(change_logs_list)

if not change_logs_df.empty:
    change_logs_df = validate_and_cast_data(change_logs_df)
    change_logs_path = os.path.join(export_dir, "AU_change_logs.csv")
    change_logs_df.to_csv(change_logs_path, index=False)
    print(f"Change logs saved to {change_logs_path}")

    upload_to_sql(change_logs_df, "AU_ChangeLogs", "RAMSEY_BOLTON\\SQLEXPRESS", "SoccerProjects")
    print("Change logs uploaded to SQL Server.")
else:
    print("No changes were logged during this run.")




Processing team_names...

Processing player_names...

Processing match_data...

Processing player_stats...


  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)



Processing player_salaries...


  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)


Data uploaded to SQL Server table: AU_team_names
Data uploaded to SQL Server table: AU_player_names
Data uploaded to SQL Server table: AU_match_data
Data uploaded to SQL Server table: AU_player_stats
Data uploaded to SQL Server table: AU_player_salaries

Processing and Uploading Change Logs...
Change logs saved to C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\AU_change_logs.csv
Data uploaded to SQL Server table: AU_ChangeLogs
Change logs uploaded to SQL Server.
