In [42]:
# Import Libraries
from itscalledsoccer.client import AmericanSoccerAnalysis
import pandas as pd
import os

# Initialize ASA Client
asa_client = AmericanSoccerAnalysis()

# Set Export Directory
export_dir = r"C:\Users\denos\OneDrive\Projects\AU_Soccer"
os.makedirs(export_dir, exist_ok=True)  # Ensure the directory exists

# === Fetch Team Names ===
try:
    print("\n=== Fetching Team Names ===")
    teams = asa_client.get_teams(leagues="mls")
    team_names_df = pd.DataFrame(teams)
    team_names_csv_path = os.path.join(export_dir, "team_names.csv")
    team_names_df.to_csv(team_names_csv_path, index=False)
    print(f"Team Names saved to: {team_names_csv_path}")
except Exception as e:
    print("Error fetching team names:", e)

# === Fetch Player Names ===
try:
    print("\n=== Fetching Player Names ===")
    players = asa_client.get_players(leagues="mls")
    player_names_df = pd.DataFrame(players)
    player_names_csv_path = os.path.join(export_dir, "player_names.csv")
    player_names_df.to_csv(player_names_csv_path, index=False)
    print(f"Player Names saved to: {player_names_csv_path}")
except Exception as e:
    print("Error fetching player names:", e)

# === Match Information Dataset ===
try:
    print("\n=== Fetching Match Data ===")
    matches = asa_client.get_games()
    match_df = pd.DataFrame(matches)
    match_columns = ['game_id', 'date_time_utc', 'home_score', 'away_score', 
                     'home_team_id', 'away_team_id', 'season_name', 'attendance']
    match_df = match_df[[col for col in match_columns if col in match_df.columns]]

    # Enrich Match Data with Team Names
    if 'team_id' in team_names_df.columns:
        match_df = match_df.merge(team_names_df[['team_id', 'team_name']], 
                                  left_on='home_team_id', right_on='team_id', how='left').rename(columns={'team_name': 'home_team_name'})
        match_df = match_df.merge(team_names_df[['team_id', 'team_name']], 
                                  left_on='away_team_id', right_on='team_id', how='left').rename(columns={'team_name': 'away_team_name'})

    match_csv_path = os.path.join(export_dir, "enriched_match_information.csv")
    match_df.to_csv(match_csv_path, index=False)
    print(f"Enriched Match Information saved to: {match_csv_path}")
except Exception as e:
    print("Error fetching match data:", e)

# === Player Information Dataset (with Salaries) ===
try:
    print("\n=== Fetching Player Salaries ===")
    player_salaries = asa_client.get_player_salaries(leagues="mls")
    player_columns = ['player_id', 'team_id', 'season_name', 'position', 
                      'base_salary', 'guaranteed_compensation', 'mlspa_release']
    player_info_df = player_salaries[[col for col in player_columns if col in player_salaries.columns]]

    # Enrich Player Information with Player Names
    if 'player_id' in player_info_df.columns and 'player_id' in player_names_df.columns:
        player_info_df = player_info_df.merge(player_names_df[['player_id', 'player_name']], on='player_id', how='left')

    player_info_csv_path = os.path.join(export_dir, "enriched_player_information.csv")
    player_info_df.to_csv(player_info_csv_path, index=False)
    print(f"Enriched Player Information saved to: {player_info_csv_path}")
except Exception as e:
    print("Error fetching player salary data:", e)

# === Team Information Dataset (with Salaries) ===
try:
    print("\n=== Fetching Team Salaries ===")
    team_salaries = asa_client.get_team_salaries(leagues="mls", split_by_seasons=True)
    team_info_csv_path = os.path.join(export_dir, "team_information.csv")
    team_salaries.to_csv(team_info_csv_path, index=False)
    print(f"Team Information saved to: {team_info_csv_path}")
except Exception as e:
    print("Error fetching team salary data:", e)

# === Player Statistics Dataset ===
try:
    print("\n=== Fetching Player Statistics ===")
    player_stats = asa_client.get_player_xgoals(leagues="mls", split_by_seasons=True)
    player_stats_df = pd.DataFrame(player_stats)
    player_stats_csv_path = os.path.join(export_dir, "player_statistics.csv")
    player_stats_df.to_csv(player_stats_csv_path, index=False)
    print(f"Player Statistics saved to: {player_stats_csv_path}")
except Exception as e:
    print("Error fetching player statistics:", e)


=== Fetching Team Names ===
Team Names saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\team_names.csv

=== Fetching Player Names ===
Player Names saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\player_names.csv

=== Fetching Match Data ===
Enriched Match Information saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\enriched_match_information.csv

=== Fetching Player Salaries ===
Enriched Player Information saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\enriched_player_information.csv

=== Fetching Team Salaries ===
Team Information saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\team_information.csv

=== Fetching Player Statistics ===
Player Statistics saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\player_statistics.csv


In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore
from datetime import datetime

# Set Export Directory
export_dir = r"C:\Users\denos\OneDrive\Projects\AU_Soccer"
os.makedirs(export_dir, exist_ok=True)

# Define a dictionary to log changes for each dataset
change_logs = {}

# Function to log changes
def log_change(dataset_name, description):
    if dataset_name not in change_logs:
        change_logs[dataset_name] = []
    change_logs[dataset_name].append(f"{datetime.now()} - {description}")

# Function to handle duplicate rows
def handle_duplicate_rows(df, dataset_name):
    duplicate_count = df.duplicated().sum()
    log_change(dataset_name, f"Detected {duplicate_count} duplicate rows.")
    df = df.drop_duplicates()
    log_change(dataset_name, "Removed duplicate rows.")
    return df

# Function to handle duplicate columns
def handle_duplicate_columns(df, dataset_name):
    duplicate_columns = []
    for i, col1 in enumerate(df.columns):
        for j, col2 in enumerate(df.columns):
            if i < j and df[col1].equals(df[col2]):
                duplicate_columns.append((col1, col2))
    log_change(dataset_name, f"Detected {len(duplicate_columns)} duplicate columns: {duplicate_columns}")
    df = df.loc[:, ~df.columns.duplicated()]
    log_change(dataset_name, "Removed duplicate columns.")
    return df

# Function to handle outliers
def handle_outliers(df, dataset_name, numeric_cols, z_threshold=3):
    z_scores = df[numeric_cols].apply(zscore)
    for col in numeric_cols:
        outlier_col = f"{col}_outlier"
        df[outlier_col] = (z_scores[col].abs() > z_threshold)
        log_change(dataset_name, f"Detected outliers in column '{col}': {df[outlier_col].sum()} flagged as outliers.")
    return df

# Function to calculate player ages
def calculate_player_ages(player_names_df):
    player_names_df['birth_date'] = pd.to_datetime(player_names_df['birth_date'])
    player_names_df['age'] = (datetime.now() - player_names_df['birth_date']).dt.days // 365
    return player_names_df

# Dataset paths
datasets = {
    "player_statistics": os.path.join(export_dir, "player_statistics.csv"),
    "enriched_player_information": os.path.join(export_dir, "enriched_player_information.csv"),
    "enriched_match_information": os.path.join(export_dir, "enriched_match_information.csv"),
    "player_names": os.path.join(export_dir, "player_names.csv"),
}

processed_datasets = {}
for dataset_name, file_path in datasets.items():
    try:
        # Load dataset
        print(f"Processing {dataset_name}...")
        df = pd.read_csv(file_path)

        # Remove duplicate rows and columns
        df = handle_duplicate_rows(df, dataset_name)
        df = handle_duplicate_columns(df, dataset_name)

        # Apply outlier handling where specified
        if dataset_name == "player_statistics":
            numeric_cols = df.select_dtypes(include="number").columns.tolist()
            df = handle_outliers(df, dataset_name, numeric_cols)
        elif dataset_name == "enriched_player_information":
            numeric_cols = ["base_salary", "guaranteed_compensation"]
            df = handle_outliers(df, dataset_name, numeric_cols)
        elif dataset_name == "enriched_match_information":
            numeric_cols = ["home_score", "away_score"]
            df = handle_outliers(df, dataset_name, numeric_cols)

        # Calculate player ages for player_names dataset
        if dataset_name == "player_names":
            df = calculate_player_ages(df)

        # Save the processed dataset
        output_path = os.path.join(export_dir, f"processed_{dataset_name}.csv")
        df.to_csv(output_path, index=False)
        processed_datasets[dataset_name] = df
        log_change(dataset_name, f"Processed dataset saved to {output_path}.")
    except Exception as e:
        log_change(dataset_name, f"Error processing dataset: {e}")
        print(f"Error processing {dataset_name}: {e}")

# Save change logs
log_path = os.path.join(export_dir, "change_logs.txt")
with open(log_path, "w") as f:
    for dataset_name, logs in change_logs.items():
        f.write(f"=== Changes for {dataset_name} ===\n")
        f.write("\n".join(logs))
        f.write("\n\n")

print("\nProcessing completed. Logs saved to:", log_path)

In [45]:
# Set Export Directory
export_dir = r"C:\Users\denos\OneDrive\Projects\AU_Soccer"
os.makedirs(export_dir, exist_ok=True)

# Filenames for processed datasets
datasets = {
    "processed_enriched_match_information": os.path.join(export_dir, "processed_enriched_match_information.csv"),
    "processed_enriched_player_information": os.path.join(export_dir, "processed_enriched_player_information.csv"),
    "processed_player_statistics": os.path.join(export_dir, "processed_player_statistics.csv"),
}

# Create filtered datasets for 2024
filtered_datasets = {}
for dataset_name, file_path in datasets.items():
    try:
        print(f"Processing {dataset_name} for 2024 data...")
        df = pd.read_csv(file_path)
        
        # Ensure season_name is treated as numeric and filter for 2024
        if "season_name" in df.columns:
            df["season_name"] = pd.to_numeric(df["season_name"], errors="coerce")
            filtered_df = df[df["season_name"] == 2024]
            filtered_datasets[dataset_name] = filtered_df
            
            # Save the filtered dataset
            output_path = os.path.join(export_dir, f"{dataset_name}_2024.csv")
            filtered_df.to_csv(output_path, index=False)
            print(f"Filtered 2024 data saved to: {output_path}")
        else:
            print(f"'season_name' column not found in {dataset_name}. Skipping...")
    except Exception as e:
        print(f"Error processing {dataset_name}: {e}")

# Summary
print("\nFiltered datasets for 2024:")
for dataset_name, df in filtered_datasets.items():
    print(f"  - {dataset_name}: {len(df)} rows")


Processing processed_enriched_match_information for 2024 data...
Filtered 2024 data saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_enriched_match_information_2024.csv
Processing processed_enriched_player_information for 2024 data...
Filtered 2024 data saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_enriched_player_information_2024.csv
Processing processed_player_statistics for 2024 data...
Filtered 2024 data saved to: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_player_statistics_2024.csv

Filtered datasets for 2024:
  - processed_enriched_match_information: 190 rows
  - processed_enriched_player_information: 1786 rows
  - processed_player_statistics: 801 rows


In [56]:
import pandas as pd
import os

# Set file paths
file_paths = {
    "processed_enriched_match_information_2024.csv": "MatchInfo",
    "processed_enriched_player_information_2024.csv": "PlayerSalary",
    "processed_player_names.csv": "PlayerInfo"
}

# Set log directory
log_dir = r"C:\Users\denos\OneDrive\Projects\AU_Soccer\logs"
os.makedirs(log_dir, exist_ok=True)

# Function to validate and clean non-numeric columns
def validate_and_clean_non_numeric_columns(df, dataset_name):
    issue_log = []

    # Iterate over non-numeric columns
    for col in df.select_dtypes(include=[object]).columns:
        original_count = len(df)

        # Handle empty strings and NaN
        empty_strings = df[col] == ""
        if empty_strings.sum() > 0:
            issue_log.append(f"{col}: {empty_strings.sum()} empty string values found")
            df[col] = df[col].replace("", pd.NA)  # Replace empty strings with NaN

        # Handle NaN values
        nan_values = df[col].isna().sum()
        if nan_values > 0:
            issue_log.append(f"{col}: {nan_values} NaN values found")

        # Normalize specific columns (e.g., player names)
        if col == "player_name":  # Example: Normalize player names with special characters
            df[col] = df[col].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

        # Check for unusual formats in season_name (e.g., empty sets or lists)
        if col == "season_name":
            df[col] = df[col].apply(lambda x: str(x) if isinstance(x, str) else "Invalid")

    # Log detected issues
    if issue_log:
        with open(os.path.join(log_dir, f"{dataset_name}_non_numeric_issues.log"), "w") as log_file:
            log_file.write("\n".join(issue_log))
        print(f"Issues detected and logged for {dataset_name}")
    else:
        print(f"No issues detected in non-numeric columns for {dataset_name}")

    return df

# Function to clean specific datasets
def clean_and_save_non_numeric_dataset(file_path, dataset_name):
    # Load the dataset
    df = pd.read_csv(file_path)

    # Clean non-numeric columns and log issues
    df = validate_and_clean_non_numeric_columns(df, dataset_name)

    # Save the cleaned dataset
    cleaned_file_path = file_path.replace(".csv", "_cleaned_non_numeric.csv")
    df.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned non-numeric dataset saved: {cleaned_file_path}")

# Loop over datasets and process them
for file_name, dataset_name in file_paths.items():
    file_path = os.path.join(r"C:\Users\denos\OneDrive\Projects\AU_Soccer", file_name)
    try:
        clean_and_save_non_numeric_dataset(file_path, dataset_name)
    except Exception as e:
        print(f"Error processing {dataset_name}: {e}")

No issues detected in non-numeric columns for MatchInfo
Cleaned non-numeric dataset saved: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_enriched_match_information_2024_cleaned_non_numeric.csv
Issues detected and logged for PlayerSalary
Cleaned non-numeric dataset saved: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_enriched_player_information_2024_cleaned_non_numeric.csv
Issues detected and logged for PlayerInfo
Cleaned non-numeric dataset saved: C:\Users\denos\OneDrive\Projects\AU_Soccer\processed_player_names_cleaned_non_numeric.csv


In [58]:

import pandas as pd
import pyodbc
import os

# Set file paths
file_paths = {
    "processed_enriched_match_information_2024.csv": "MatchInfo",
    "processed_enriched_player_information_2024.csv": "PlayerSalary",
    "processed_player_names.csv": "PlayerInfo"
}

# Set log directory
log_dir = r"C:\Users\denos\OneDrive\Projects\AU_Soccer\logs"
os.makedirs(log_dir, exist_ok=True)

# Function to clean and validate non-numeric columns
def clean_and_validate_non_numeric_columns(df, dataset_name):
    issue_log = []
    # Remove empty columns
    df = df.dropna(axis=1, how='all')
    issue_log.append(f"Removed empty columns from {dataset_name}")

    # Handle empty strings and NaNs in non-numeric columns
    for col in df.select_dtypes(include=[object]).columns:
        empty_strings = df[col] == ""
        if empty_strings.sum() > 0:
            issue_log.append(f"{col}: {empty_strings.sum()} empty string values replaced.")
            df[col] = df[col].replace("", pd.NA)

        # Log NaNs in columns
        nan_values = df[col].isna().sum()
        if nan_values > 0:
            issue_log.append(f"{col}: {nan_values} NaN values found")

    # Log issues to a file
    if issue_log:
        with open(os.path.join(log_dir, f"{dataset_name}_non_numeric_issues.log"), "w") as log_file:
            log_file.write("\n".join(issue_log))
        print(f"Issues detected and logged for {dataset_name}")
    return df

# Function to upload the cleaned data to SQL Server
def upload_to_sql(df, table_name, server, database):
    # Create SQL connection string
    connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # Drop existing table if it exists
    cursor.execute(f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}")
    conn.commit()
    print(f"Dropped existing table: {table_name}")

    # Create table dynamically
    create_table_query = f"CREATE TABLE {table_name} ({', '.join([f'[{col}] NVARCHAR(MAX)' for col in df.columns])});"
    cursor.execute(create_table_query)
    conn.commit()

    # Insert data
    for _, row in df.iterrows():
        placeholders = ", ".join(["?" for _ in row])
        insert_query = f"INSERT INTO {table_name} VALUES ({placeholders})"
        cursor.execute(insert_query, *row)
    conn.commit()
    conn.close()
    print(f"Data uploaded to {table_name} in SQL Server.")

# Loop through each dataset and clean/upload
for file_name, dataset_name in file_paths.items():
    try:
        # Read the CSV file
        file_path = f"C:\\Users\\denos\\OneDrive\\Projects\\AU_Soccer\\{file_name}"
        df = pd.read_csv(file_path)

        # Clean non-numeric columns
        df = clean_and_validate_non_numeric_columns(df, dataset_name)

        # Upload cleaned data to SQL Server
        upload_to_sql(df, dataset_name, "RAMSEY_BOLTON\\SQLEXPRESS", "SoccerProjects")

    except Exception as e:
        print(f"Error processing {dataset_name}: {e}")

Issues detected and logged for MatchInfo
Dropped existing table: MatchInfo
Data uploaded to MatchInfo in SQL Server.
Issues detected and logged for PlayerSalary
Dropped existing table: PlayerSalary
Error processing PlayerSalary: ('42000', '[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]The incoming tabular data stream (TDS) remote procedure call (RPC) protocol stream is incorrect. Parameter 7 (""): The supplied value is not a valid instance of data type float. Check the source data for invalid values. An example of an invalid value is data of numeric type with scale greater than precision. (8023) (SQLExecDirectW)')
Issues detected and logged for PlayerInfo
Dropped existing table: PlayerInfo
Error processing PlayerInfo: ('42000', '[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]The incoming tabular data stream (TDS) remote procedure call (RPC) protocol stream is incorrect. Parameter 8 (""): The supplied value is not a valid instance of data type float. Check