In [1]:
import os
import pandas as pd

In [2]:
def load_and_concatenate_csv(base_dir):
    """
    Load every CSV file found under a directory tree into one DataFrame.

    Sub-directories of ``base_dir`` whose name contains 'archive' are skipped
    entirely. Only folders *below* ``base_dir`` are checked, so a base
    directory that itself lives under a path containing 'archive' is still
    loaded. Directories and files are visited in sorted order so the row order
    of the result is reproducible across machines and runs.

    Files that cannot be parsed are reported on stdout and skipped.

    Parameters:
    - base_dir (str): The base directory containing the CSV files.

    Returns:
    - pd.DataFrame: A concatenated DataFrame of all CSV files, with a fresh
      0..n-1 index.

    Raises:
    - ValueError: If no CSV file could be loaded from ``base_dir``.
    """
    dataframes = []
    for root, dirs, files in os.walk(base_dir):
        # Prune in place: os.walk (top-down) will not descend into removed
        # entries. Sorting makes the traversal order deterministic.
        dirs[:] = sorted(d for d in dirs if "archive" not in d)

        for file in sorted(files):
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(root, file)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Error reading {file_path}: {e}")
                continue

            dataframes.append(df)
            print(f"Loaded {file_path}")

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say what actually went wrong.
        raise ValueError(f"No CSV files could be loaded from {base_dir}")

    return pd.concat(dataframes, ignore_index=True)

In [5]:
# Root folders of the raw input CSVs (relative paths, resolved against the working directory).
BIO_BASE_DIR = "../data/tm/raw/bio"
HISTORICAL_BASE_DIR = "../data/tm/raw/historical_market_values"


In [None]:
# Combine the CSV files found under HISTORICAL_BASE_DIR into a single DataFrame.
tm_values = load_and_concatenate_csv(base_dir=HISTORICAL_BASE_DIR)

In [None]:
# Combine the CSV files found under BIO_BASE_DIR into a single DataFrame.
player_bio = load_and_concatenate_csv(base_dir=BIO_BASE_DIR)

In [22]:
# Keep only the columns needed downstream, then retain the first row
# encountered for each player_id.
player_bio = (
    player_bio
    .loc[:, ['player_id', 'player_name', 'position']]
    .drop_duplicates(subset='player_id')
)

In [23]:
# Discard the 'season' and 'league_code' columns.
tm_values_filtered = tm_values.drop(columns=['season', 'league_code'])

# Collapse rows that are identical across the remaining columns, then
# discard rows that have no 'date'.
tm_values_unique = (
    tm_values_filtered
    .drop_duplicates()
    .dropna(subset=['date'])
)

In [24]:
# Trim surrounding whitespace from the raw 'date' strings and parse them
# (format like "Jan 05, 2020") into datetime values.
tm_values_unique['date'] = pd.to_datetime(
    tm_values_unique['date'].str.strip(),
    format='%b %d, %Y',
)

In [25]:
# Bounds of the date window to keep (both ends inclusive).
start_date = '2017-05-01'
end_date = '2021-07-31'

# Keep only the rows whose 'date' falls inside [start_date, end_date].
in_window = tm_values_unique['date'].between(start_date, end_date)
filtered_tm_values = tm_values_unique[in_window]

In [26]:
# Attach the bio columns to each market-value row (inner join: rows without a
# matching player_id are dropped), then remove 'tm_id', which duplicates the
# join key 'player_id'.
merged_df = (
    filtered_tm_values
    .merge(player_bio, how='inner', left_on='tm_id', right_on='player_id')
    .drop(columns=['tm_id'])
)

In [27]:
# Destination of the processed table.
PROCESSED_VALUES_PATH = '../data/tm/processed/tm_values.csv'

# Create the output folder first: to_csv does not create missing parent
# directories, so the export would otherwise fail with OSError when the
# 'processed' folder does not exist yet.
os.makedirs(os.path.dirname(PROCESSED_VALUES_PATH), exist_ok=True)

# Write without the positional index, which carries no information here.
merged_df.to_csv(PROCESSED_VALUES_PATH, index=False)
