In [1]:
import pandas as pd
import re #use regex expressions to separate characters

df_euro = pd.read_csv('https://query.data.world/s/tp4rtb4ryxelnxhbftjtoqf7khpcw7?dws=00000')
df_euro_positions = pd.read_csv("https://query.data.world/s/v5qjxh7wpbff5b7dz7ouo3x4b5aiyo?dws=00000")
df_matches = pd.read_csv("https://query.data.world/s/fhxv7rlrmcmxle4ktj77f54c7kq426?dws=00000")

# column to search 
column_name = 'StatsName'  
search_strings = [
        "Goals", "Total Attempts", "Attempts on target", "Attempts off target", "Attempts Accuracy",
        "Saves", "Tackles won", "Clearances", "Recovered balls", "Passes completed", "Passes accuracy",
        "Distance covered (m)", "Top Speed (Km/h)", "Own-goals", "Goals conceded", "Attempts blocked",
        "Attempts on bar", "Attempts on post", "Passes attempted", "Dribbling", "Tackles", "Tackles lost",
        "Blocks", "Played Time", "Fouls suffered", "Fouls committed", "Assists", "Big Chances"]

# Escape each string in the search_strings list
escaped_search_strings = [re.escape(string) for string in search_strings]

# Create a regex pattern that matches any of the escaped search strings
search_pattern = '|'.join(escaped_search_strings)

# filtering the df to only include rows where the column contains any of the search strings
df_euro_stats = df_euro[df_euro[column_name].str.contains(search_pattern, na=False, regex=True)]
df_euro_stats.reset_index(drop=True, inplace=True)


#display(df_euro_stats)



In [2]:
df_positions = df_euro_positions[['ID', 'Role']]   # so now i have the 2nd dataset and want to reduce it to only ID and Role
df_positi = df_positions[['ID', 'Role']].rename(columns={'ID': 'PlayerID'}) #renaming ID to PlayerID to match the other dataset
# Reset index 
df_positi.reset_index(inplace=True)


df_europivot = df_euro_stats.pivot_table(index=['PlayerID', 'PlayerName', 'PlayerSurname', 'HomeTeamName', 'AwayTeamName','MatchID',"PlayedTime"],
                                         columns='StatsName',
                                         values='Value',
                                         aggfunc='sum')     #pivoting the 1st dataset to better undertsatnd the stats

#reset index
df_europivot.reset_index(inplace=True)

# Display the pivoted DataFrame
#display(df_positi)
#display(df_europivot)

In [3]:

# merging the datasets based on PlayerID
role_merge= pd.merge(df_europivot, df_positi[['PlayerID', 'Role']], on='PlayerID', how='left')

role_merge.rename(columns={'Role_y': 'Role'}, inplace=True) #merged_df got the Role as Role_y so i had to change it 


In [4]:
#merging datasets based on MatchID

matchid_merge = pd.merge(role_merge, df_matches[["MatchID" ,"RoundName"]], on = "MatchID", how= "left")
df_euro_all_stats= matchid_merge

In [5]:
# reorganize the columns
desired_columns_order = ['PlayerID', 'PlayerName', 'PlayerSurname', 'HomeTeamName', 'AwayTeamName','MatchID',"RoundName","PlayedTime", "Role", 'Goals', 'Assists', 'Saves',
                         'Total Attempts', 'Attempts on target', 'Attempts off target', 'Attempts Accuracy','Big Chances', 'Dribbling', 'Tackles',
                         'Tackles won', 'Fouls suffered', 'Fouls committed', 'Blocks', 'Clearances', 'Recovered balls', 'Passes completed',
                         'Passes accuracy', 'Distance covered (m)', 'Top Speed (Km/h)', 'Own-goals', 'Goals conceded']

# reorder the columns in euro_stats_df
df_euro_all_stats = df_euro_all_stats[desired_columns_order]


In [6]:
#functions

def clean_time(df): 
    """ Clean the time column of the DataFrame to a valid format.
    args: df: The input DataFrame containing the 'PlayedTime' column to be cleaned.
    return: df: A copy of the input DataFrame with the 'PlayedTime' column cleaned and formatted to 'min'.
    """
    df1 = df.copy()
    df1['PlayedTime'] = df1['PlayedTime'].apply(lambda x: f"{int(x) *130 // 8142 }min") 
    #since the PlayedTime was given in a weird way i decided to equal the max palyed time to the 130 min (the max time played in a game of the tournment)
    
    return df1
    
#df_euro_all_stats["Role"].value_counts() 


In [8]:
df2=clean_time(df_euro_all_stats)
display(df2)



Unnamed: 0,PlayerID,PlayerName,PlayerSurname,HomeTeamName,AwayTeamName,MatchID,RoundName,PlayedTime,Role,Goals,...,Fouls committed,Blocks,Clearances,Recovered balls,Passes completed,Passes accuracy,Distance covered (m),Top Speed (Km/h),Own-goals,Goals conceded
0,52148,Goran,Pandev,Austria,North Macedonia,2024442,final tournament,92min,forwards,1,...,1,0,0,0,18,69,10073,29.5,0,0
1,52148,Goran,Pandev,Austria,North Macedonia,2024442,final tournament,92min,forwards,1,...,1,0,0,0,18,69,10073,29.5,0,0
2,52148,Goran,Pandev,Austria,North Macedonia,2024442,final tournament,92min,forwards,1,...,1,0,0,0,18,69,10073,29.5,0,0
3,52148,Goran,Pandev,North Macedonia,Netherlands,2024445,final tournament,67min,forwards,0,...,0,0,0,0,12,67,7257,26.5,0,0
4,52148,Goran,Pandev,North Macedonia,Netherlands,2024445,final tournament,67min,forwards,0,...,0,0,0,0,12,67,7257,26.5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6144,250155055,Ché,Adams,England,Scotland,2024461,final tournament,81min,forwards,0,...,2,0,1,1,22,85,8441,31.3,0,0
6145,250155055,Ché,Adams,England,Scotland,2024461,final tournament,81min,forwards,0,...,2,0,1,1,22,85,8441,31.3,0,0
6146,250155055,Ché,Adams,Scotland,Czech Republic,2024452,final tournament,47min,forwards,0,...,1,0,0,0,13,87,4733,27,0,0
6147,250155055,Ché,Adams,Scotland,Czech Republic,2024452,final tournament,47min,forwards,0,...,1,0,0,0,13,87,4733,27,0,0
