In [29]:
# Import libraries:
import pandas as pd
import os

Global variables and functions:

In [34]:
# Global variables:
# Root project folder; the EPL input folder and the export folder both sit directly under it.
dir = 'C:/Users/cmart/OneDrive - Bentley University/Research/Player Valuation Model'
eng_dir = f'{dir}/ENG-Premier League'
export_dir = f'{dir}/Merged Data'

# Season codes and league folder names used to build each workbook's file path.
season_ids = [
    '1718',
    '1819',
    '1920',
    '2021',
    '2122',
    '2223',
]
league_ids = [
    'ENG-Premier League',
    'ESP-La Liga',
    'FRA-Ligue 1',
    'GER-Bundesliga',
    'ITA-Serie A',
]

# Identifier columns that import_and_clean forward-fills.
index_col_names = ['league', 'season', 'team']

# Passed to pd.read_excel as its `header` argument.
SKIP_ROWS = [1]

# Readable names for the leading columns that arrive labelled 'Unnamed: N'.
new_cols_dict = {
    'Unnamed: 0': 'league',
    'Unnamed: 1': 'season',
    'Unnamed: 2': 'team',
    'Unnamed: 3': 'player',
    'Unnamed: 4': 'nationality',
    'Unnamed: 5': 'position',
    'Unnamed: 6': 'age',
    'Unnamed: 7': 'YOB',
}

In [35]:
# Function that writes the output data to an Excel file:
def make_xl(path, df, file_name):
    """Write ``df`` to ``<path>/<file_name>.xlsx``, creating ``path`` if needed.

    Parameters
    ----------
    path : str
        Directory that will contain the workbook. It is created (including
        parents) when it does not exist yet.
    df : pandas.DataFrame
        Data to export; its index is written as well (``index=True``).
    file_name : str
        Workbook name without the ``.xlsx`` extension.

    Returns
    -------
    Whatever ``df.to_excel`` returns.
    """
    # Writing into a directory that does not exist fails, so make sure it is there.
    # An empty path means "current directory" and needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)

    file_path = os.path.join(path, f'{file_name}.xlsx')

    # Author's note: remove index=True if getting a permission error.
    # NOTE(review): a permission error may simply mean the workbook is open in Excel - confirm.
    return df.to_excel(file_path, index=True)


# Function for importing and cleaning data at the league-season level:
def import_and_clean(path):
    """Read one league-season workbook and tidy its identifier columns.

    Parameters
    ----------
    path : str
        Location of the Excel workbook to read.

    Returns
    -------
    pandas.DataFrame
        The sheet with the columns renamed per ``new_cols_dict``, the row
        labelled 0 removed, and every column named in ``index_col_names``
        forward-filled. No index is set here; callers do that themselves.
    """

    # Create the dataframe:
    df = (pd
        .read_excel(path, header=SKIP_ROWS)
        .rename(columns=new_cols_dict)
        .drop([0])        # drops the row labelled 0 (presumably a leftover header row - confirm)
        )

    # Fill the gaps in the identifier columns with the last value seen above.
    # Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
    for name in index_col_names:
        df[name] = df[name].ffill()
    return df


# Function for concatenating a list of pandas dataframes:
def concatenate_dfs(df_list):
    """Stack every dataframe in ``df_list`` using ``pd.concat`` with its default settings."""
    return pd.concat(df_list)

Import test data files:

In [27]:
# Two test EPL workbooks (season codes 1920 and 2021), both inside eng_dir:
path1 = f"{eng_dir}/ENG-Premier League_1920_full_join.xlsx"
path2 = f"{eng_dir}/ENG-Premier League_2021_full_join.xlsx"

# Load and clean both workbooks in one pass:
df1, df2 = (import_and_clean(workbook) for workbook in (path1, path2))

Compare dataframe dimensions and column names:

In [25]:
# Compare the two dataframe dimensions (shape is a (rows, columns) pair):
df1_rows, df1_cols = df1.shape
df2_rows, df2_cols = df2.shape

print(f'df1 row count: {df1_rows}.\tdf1 column count: {df1_cols}\n')
print(f'df2 row count: {df2_rows}.\tdf2 column count: {df2_cols}\n')

# Keep the column labels of both frames for a side-by-side comparison:
df1_columns, df2_columns = df1.columns, df2.columns

# Optional pairwise listing of the column names (left disabled by the author):
# for x, y in zip(df1_columns, df2_columns):
#     print(f'DF1 Name: {x}\tDF2 Name: {y}\n')

df1 row count: 621.	df1 column count: 201

df2 row count: 653.	df2 column count: 201



Attempt to concatenate the two test dataframes:

In [28]:
# Set index for each dataframe before concatenating.
# Guarded so the cell can be re-run: once 'league' has become the index it is no
# longer a column, and an unguarded set_index('league') would raise KeyError.
if 'league' in df1.columns:
    df1 = df1.set_index('league')
if 'league' in df2.columns:
    df2 = df2.set_index('league')


# Create one merged dataframe (rows of df2 stacked under rows of df1):
merged_df = pd.concat([df1, df2], axis=0)

# Column count of the merged frame, shown as the cell output:
merged_df.shape[1]

201

Send output to Excel for review:

In [15]:
# Write the two-season test merge to 'test_epl_merge.xlsx' in the project root folder:
make_xl(path=dir, df=merged_df, file_name='test_epl_merge')

Iterate the concatenation process over many seasons in the EPL:

In [32]:
# Collect one cleaned, league-indexed dataframe per season:
epl_df_list = []

for season in season_ids:

    # Season-specific workbook inside the EPL folder:
    file_path = f"{eng_dir}/ENG-Premier League_{season}_full_join.xlsx"

    # Import, clean and index by league in a single chain:
    df = import_and_clean(file_path).set_index('league')

    # Add to the list of dataframes:
    epl_df_list.append(df)

# Stack every season into one EPL dataframe:
epl_merged_df = concatenate_dfs(epl_df_list)

In [33]:
# Report the dimensions of the merged EPL dataframe:
merged_rows, merged_cols = epl_merged_df.shape
print(f'Merged Row Count: {merged_rows}\n')
print(f'Merged Column Count: {merged_cols}')

Merged Row Count: 3880

Merged Column Count: 201


Iterate over every season in every league:

In [37]:
# Import and concatenate data:
# For every league: read each season's workbook, stack the seasons, print the
# merged dimensions, and export to Excel only after the user confirms them.
for league in league_ids:

    # Start a fresh list for every league:
    league_df_list = []

    print(f'\nWorking on the {league} dataframe merge...\n')

    for season in season_ids:
        # Dynamic file path, rooted at the project folder defined in the globals cell:
        file_path = f"{dir}/{league}/{league}_{season}_full_join.xlsx"

        # Import and clean the data:
        df = import_and_clean(file_path)
        df = df.set_index('league')

        # Add to list of dataframes:
        league_df_list.append(df)
        print(f'\nSuccessfully added the {season} dataframe for the {league}.\n')

    # Concatenate the dataframes from the list:
    league_merged_df = concatenate_dfs(league_df_list)
    print(f'\nThe {league} merge is complete, proceeed to dimensions check.\n')

    # Check row and column counts:
    print(f'\n{league} Merge Row Count: {league_merged_df.shape[0]}\n')
    print(f'\n{league} Merge Column Count: {league_merged_df.shape[1]}\n')

    # Send the merged dataframe to excel:
    # A blank or non-numeric answer is treated as "not OK" rather than raising
    # ValueError and aborting the remaining leagues.
    user_answer = input("Enter 1 if DF dimensions OK, 0 otherwise: ").strip()
    try:
        user_bool = int(user_answer)
    except ValueError:
        user_bool = 0

    if user_bool == 1:
        print(f'\nThe {league} merge was successful. Sending to Excel...\n')
        make_xl(export_dir, league_merged_df, file_name=f'{league}_full_merge')
    else:
        print(f'\nThe {league} merge was unsuccessful. Proceeding to next league...\n')

print(f'\nProcess complete. See {export_dir} for results.\n')


Working on the ENG-Premier League dataframe merge...


Successfully added the 1718 dataframe for the ENG-Premier League.


Successfully added the 1819 dataframe for the ENG-Premier League.


Successfully added the 1920 dataframe for the ENG-Premier League.


Successfully added the 2021 dataframe for the ENG-Premier League.


Successfully added the 2122 dataframe for the ENG-Premier League.


Successfully added the 2223 dataframe for the ENG-Premier League.


The ENG-Premier League merge is complete, proceeed to dimensions check.


ENG-Premier League Merge Row Count: 3880


ENG-Premier League Merge Column Count: 201


The ENG-Premier League merge was successful. Sending to Excel...


Working on the ESP-La Liga dataframe merge...


Successfully added the 1718 dataframe for the ESP-La Liga.


Successfully added the 1819 dataframe for the ESP-La Liga.


Successfully added the 1920 dataframe for the ESP-La Liga.


Successfully added the 2021 dataframe for the ESP-La Liga.


Successfully ad