In [26]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import pyodbc

# Set Export Directory
export_dir = r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer"
os.makedirs(export_dir, exist_ok=True)

# Define a dictionary to log changes
change_logs = {}

# Function to log changes
def log_change(dataset_name, description):
    if dataset_name not in change_logs:
        change_logs[dataset_name] = []
    change_logs[dataset_name].append(f"{datetime.now()} - {description}")

# Function to handle duplicate rows
def handle_duplicate_rows(df, dataset_name):
    problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
    if problematic_rows.any():
        count = problematic_rows.sum()
        log_change(dataset_name, f"Detected {count} rows with unhashable types. Dropping these rows.")
        df = df[~problematic_rows]

    duplicate_count = df.duplicated().sum()
    log_change(dataset_name, f"Detected {duplicate_count} duplicate rows.")
    df = df.drop_duplicates()
    log_change(dataset_name, "Removed duplicate rows.")
    return df

# Function to handle duplicate columns
def handle_duplicate_columns(df, dataset_name):
    df = df.loc[:, ~df.columns.duplicated()]
    log_change(dataset_name, "Removed duplicate columns.")
    return df

# Function to drop unwanted columns
def drop_unwanted_columns(df, unwanted_columns, dataset_name):
    df = df.drop(columns=[col for col in unwanted_columns if col in df.columns], errors="ignore")
    log_change(dataset_name, f"Dropped unwanted columns: {unwanted_columns}")
    return df

# Function to validate and cast data for SQL upload
def validate_and_cast_data(df):
    for col in df.columns:
        if df[col].dtype == "float64":
            df[col] = df[col].apply(lambda x: round(x, 6) if pd.notnull(x) and np.isfinite(x) else None)
        elif df[col].dtype == "object":
            df[col] = df[col].apply(lambda x: str(x) if isinstance(x, (str, int, float)) else None)
    return df

# Updated clean_dataset function to ensure specific columns are numeric
def clean_dataset(df, dataset_name):
    # Identify and drop unwanted columns
    unwanted_columns = ["extra_time", "penalties", "home_penalties", "away_penalties", "height_ft", "height_in", "weight_lb"]
    unwanted_columns += [col for col in df.columns if "outlier" in col]
    df = drop_unwanted_columns(df, unwanted_columns, dataset_name)

    # Replace NaN in numeric columns with None (SQL NULL)
    numeric_cols = df.select_dtypes(include=["float64"]).columns
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)

    # Ensure salary columns are numeric
    if "base_salary" in df.columns:
        df["base_salary"] = pd.to_numeric(df["base_salary"], errors="coerce")
    if "guaranteed_compensation" in df.columns:
        df["guaranteed_compensation"] = pd.to_numeric(df["guaranteed_compensation"], errors="coerce")

    # Ensure specific stats columns are numeric
    stats_columns = [
        "minutes_played", "shots", "shots_on_target", "goals", "xgoals",
        "xplace", "goals_minus_xgoals", "key_passes", "primary_assists",
        "xassists", "primary_assists_minus_xassists", "xgoals_plus_xassists",
        "points_added", "xpoints_added"
    ]
    for col in stats_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")  # Convert to numeric, set invalids to NaN

    # Ensure all remaining columns are valid
    df = validate_and_cast_data(df)

    return df

# Function to dynamically set SQL data types
def get_sql_data_type(dtype):
    if pd.api.types.is_integer_dtype(dtype):
        return "INT"
    elif pd.api.types.is_float_dtype(dtype):
        return "FLOAT"
    elif pd.api.types.is_object_dtype(dtype):
        return "NVARCHAR(MAX)"
    elif pd.api.types.is_bool_dtype(dtype):
        return "BIT"
    else:
        return "NVARCHAR(MAX)"

# Function to upload datasets to SQL Server
def upload_to_sql(df, table_name, server, database):
    connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # Drop table if it exists
    cursor.execute(f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}")
    conn.commit()

    # Dynamically create table with appropriate data types
    create_query = f"CREATE TABLE {table_name} ({', '.join([f'[{col}] {get_sql_data_type(df[col].dtype)}' for col in df.columns])})"
    cursor.execute(create_query)
    conn.commit()

    # Insert data into the table
    for idx, row in df.iterrows():
        placeholders = ", ".join(["?" for _ in row])
        insert_query = f"INSERT INTO {table_name} VALUES ({placeholders})"
        try:
            cursor.execute(insert_query, *row)
        except Exception as e:
            print(f"Error inserting row {idx}: {row.to_dict()}")
            print(f"Error: {e}")
            continue
    conn.commit()
    conn.close()
    print(f"Data uploaded to SQL Server table: {table_name}")

# === Fetch All Datasets ===
dataset_paths = {
    "team_names": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_team_names.csv",
    "player_names": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_names.csv",
    "match_data": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_match_data.csv",
    "player_stats": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_stats.csv",
    "player_salaries": r"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\AU_processed_player_salaries.csv",
}

processed_datasets = {}
for dataset_name, path in dataset_paths.items():
    print(f"\nProcessing {dataset_name}...")
    df = pd.read_csv(path)
    df = handle_duplicate_rows(df, dataset_name)
    df = handle_duplicate_columns(df, dataset_name)
    df = clean_dataset(df, dataset_name)
    processed_datasets[dataset_name] = df

# === Save and Upload Datasets ===
for dataset_name, df in processed_datasets.items():
    table_name = f"AU_{dataset_name}"
    upload_to_sql(df, table_name, "RAMSEY_BOLTON\\SQLEXPRESS", "SoccerProjects")

# === Process and Upload Change Logs ===
print("\nProcessing and Uploading Change Logs...")

change_logs_list = []
for dataset_name, logs in change_logs.items():
    for log in logs:
        truncated_log = log[:4000] if len(log) > 4000 else log
        change_logs_list.append({"Dataset": dataset_name, "LogEntry": truncated_log})

change_logs_df = pd.DataFrame(change_logs_list)

if not change_logs_df.empty:
    change_logs_df = validate_and_cast_data(change_logs_df)
    change_logs_path = os.path.join(export_dir, "AU_change_logs.csv")
    change_logs_df.to_csv(change_logs_path, index=False)
    print(f"Change logs saved to {change_logs_path}")

    upload_to_sql(change_logs_df, "AU_ChangeLogs", "RAMSEY_BOLTON\\SQLEXPRESS", "SoccerProjects")
    print("Change logs uploaded to SQL Server.")
else:
    print("No changes were logged during this run.")



Processing team_names...

Processing player_names...

Processing match_data...

Processing player_stats...

Processing player_salaries...


  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)
  problematic_rows = df.applymap(lambda x: isinstance(x, (dict, list))).any(axis=1)
  df[numeric_cols] = df[numeric_cols].applymap(lambda x: None if pd.isna(x) else x)


Data uploaded to SQL Server table: AU_team_names
Data uploaded to SQL Server table: AU_player_names
Data uploaded to SQL Server table: AU_match_data
Data uploaded to SQL Server table: AU_player_stats
Data uploaded to SQL Server table: AU_player_salaries

Processing and Uploading Change Logs...
Change logs saved to C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\AU_change_logs.csv
Data uploaded to SQL Server table: AU_ChangeLogs
Change logs uploaded to SQL Server.


In [27]:
import plotly.express as px

# Database connection
server = "RAMSEY_BOLTON\\SQLEXPRESS"
database = "SoccerProjects"
connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"

# SQL Query
query = """
WITH PlayerAges AS (
    SELECT 
        player_id,
        player_name
    FROM 
        SoccerProjects.dbo.AU_player_names
),
DistinctPlayerSalaries AS (
    SELECT 
        ps.player_id,
        CAST(ps.guaranteed_compensation AS DECIMAL(15, 2)) AS guaranteed_compensation,
        ROW_NUMBER() OVER (PARTITION BY ps.player_id ORDER BY ps.season_name DESC) AS row_num
    FROM 
        SoccerProjects.dbo.AU_player_salaries ps
    WHERE 
        ps.season_name = 2024
),
DistinctPlayerStats AS (
    SELECT 
        ps.player_id,
        ps.xpoints_added,
        ROW_NUMBER() OVER (PARTITION BY ps.player_id ORDER BY ps.season_name DESC) AS row_num
    FROM 
        SoccerProjects.dbo.AU_player_stats ps
    WHERE 
        ps.season_name = 2024
)
SELECT 
    pa.player_name,
    s.guaranteed_compensation,
    ps.xpoints_added
FROM 
    PlayerAges pa
LEFT JOIN 
    DistinctPlayerSalaries s
    ON pa.player_id = s.player_id AND s.row_num = 1
LEFT JOIN 
    DistinctPlayerStats ps
    ON pa.player_id = ps.player_id AND ps.row_num = 1
WHERE 
    s.guaranteed_compensation IS NOT NULL AND ps.xpoints_added IS NOT NULL
ORDER BY 
    ps.xpoints_added DESC;
"""

# Execute query
with pyodbc.connect(connection_string) as conn:
    df = pd.read_sql(query, conn)

# Visualize with Plotly
fig = px.scatter(
    df,
    x="guaranteed_compensation",
    y="xpoints_added",
    hover_name="player_name",
    title="Player Value Per Dollar",
    labels={
        "guaranteed_compensation": "Guaranteed Compensation ($)",
        "xpoints_added": "Expected Points Added"
    }
)

# Enhance layout
fig.update_layout(
    xaxis_title="Guaranteed Compensation ($)",
    yaxis_title="Expected Points Added",
    template="plotly_white",
    title_font_size=20,
    hoverlabel=dict(font_size=12)
)

# Show plot
fig.show()


  df = pd.read_sql(query, conn)
