In [1]:
# Let's load the two CSV files first to inspect their content.
import pandas as pd

# Read the two processed inputs: the outfielder table and the team ratings table
outfielder_data = pd.read_csv(filepath_or_buffer='../data/processed/outfielder_opi.csv')
team_ratings = pd.read_csv(filepath_or_buffer='../data/processed/team_ratings.csv')

In [2]:
# Keys are season labels as looked up from a player row's 'Season' value ('YYYY/YYYY').
# Values are the column names get_team_rating reads from the ratings table; they are
# the same label with the slash replaced by a hyphen ('YYYY-YYYY').
season_mapping = {
    label: label.replace('/', '-')
    for label in ('2017/2018', '2018/2019', '2019/2020', '2020/2021')
}

# Retrieve the appropriate team rating based on a row's season and team
def get_team_rating(row, ratings_df, season_mapping):
    """Look up the rating of a row's team for that row's season.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'Team' and 'Season'.
    ratings_df : pandas.DataFrame
        Must contain a 'team' column and is expected to hold one column per
        season, named by the values of ``season_mapping``.
    season_mapping : dict
        Maps a row's 'Season' value to the name of the matching season
        column in ``ratings_df``.

    Returns
    -------
    The value found in the season column for the first ``ratings_df`` row
    whose 'team' equals the row's team, or None when the season is not in
    ``season_mapping``, the mapped column is absent from ``ratings_df``, or
    no team matches.
    """
    team = row['Team']
    season_column = season_mapping.get(row['Season'])

    # An unmapped season, or a mapped column the ratings table does not have,
    # is treated as "no match" rather than raising KeyError mid-apply.
    if season_column is None or season_column not in ratings_df.columns:
        return None

    # Select the rating(s) for this team in the season's column
    team_rating = ratings_df.loc[ratings_df['team'] == team, season_column]
    if team_rating.empty:
        return None

    # If the team appears more than once, the first occurrence wins
    return team_rating.values[0]

In [3]:
# Manual mapping of team names that are spelled differently in the two inputs.
# Keys are the spellings that get replaced in the player data's 'Team' column;
# values are the spellings they are replaced with, which get_team_rating then
# compares against team_ratings['team'].
# Teams not listed here are left unchanged by Series.replace.
team_name_mapping = {
    'Manchester Utd': 'Man United',
    'Manchester City': 'Man City',
    'Mainz 05': 'Mainz',
    'Celta Vigo': 'Celta',
    'Athletic Club': 'Ath Bilbao',
    'Real Sociedad': 'Sociedad',
    'Leicester City': 'Leicester',
    'Stoke City': 'Stoke',
    'Saint-Étienne': 'St Etienne',
    'Atlético Madrid': 'Ath Madrid',
    'Köln': 'FC Koln',
    'Hertha BSC': 'Hertha',
    'Arminia': 'Bielefeld',
    'Swansea City': 'Swansea',
    'Hellas Verona': 'Verona',
    'Leeds United': 'Leeds',
    'Hamburger SV': 'Hamburg',
    'Düsseldorf': 'Fortuna Dusseldorf',
    'Rayo Vallecano': 'Vallecano',
    'Norwich City': 'Norwich',
    'Espanyol': 'Espanol',
    'Alavés': 'Alaves',
    'Paris S-G': 'Paris SG',
    'Leganés': 'Leganes',
    'Eint Frankfurt': 'Ein Frankfurt',
    'Newcastle Utd': 'Newcastle',
    'Nîmes': 'Nimes',
    'SPAL': 'Spal',
    "M'Gladbach": "M'gladbach",
    'La Coruña': 'La Coruna',
    'Málaga': 'Malaga',
    'Sheffield Utd': 'Sheffield United',
    'Hannover 96': 'Hannover',
    'Cádiz': 'Cadiz'
}

# Normalise the outfielder team names so they can be compared with the ratings table
outfielder_data['Team'] = outfielder_data['Team'].replace(to_replace=team_name_mapping)

# Look up each row's team rating for its season (left missing where nothing matches)
outfielder_data['Team_Rating'] = outfielder_data.apply(
    get_team_rating,
    axis=1,
    args=(team_ratings, season_mapping),
)


In [4]:
# Write the outfielder table (with its Team_Rating column) to disk, omitting the index
outfielder_data.to_csv(
    path_or_buf='../data/processed/outfielder_data_with_team_ratings.csv',
    index=False,
)

In [5]:
# Read the processed goalkeeper table
gk_data = pd.read_csv(filepath_or_buffer='../data/processed/goalkeeper_opi.csv')

In [6]:
# Goalkeepers get the same treatment as the outfielders:
# first normalise the team names, then look up each row's team rating.
gk_data['Team'] = gk_data['Team'].replace(to_replace=team_name_mapping)
gk_data['Team_Rating'] = gk_data.apply(
    get_team_rating,
    axis=1,
    args=(team_ratings, season_mapping),
)

# Write the goalkeeper table to disk, omitting the index
gk_data.to_csv(
    path_or_buf='../data/processed/goalkeeper_data_with_team_ratings.csv',
    index=False,
)

In [8]:
# Columns selected from both tables for the combined output
columns_to_include = [
    'Player', 'Team', 'Age', 'Nationality', 'Season', 
    'MV1', 'MV2','Position', 'Comp', 'MP', 'Starts', 
    'Min', '90s', 'Overall_Performance_Index', 'Team_Rating'
]

# Stack the outfielder rows on top of the goalkeeper rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# frame keeps its own row labels, so the result carries duplicate index values,
# which makes label-based lookups and index alignment on merged_data unreliable.
merged_data = pd.concat(
    [outfielder_data[columns_to_include], gk_data[columns_to_include]],
    ignore_index=True,
)

# Save the merged data (the index is not written, so the file contents are
# the same with or without the re-indexing above)
merged_data.to_csv('../data/processed/merged_data_with_team_ratings.csv', index=False)
