Hier entsteht das Projekt bla bla bla

In [1]:
import pandas as pd
import numpy as np
import re

In [7]:
player_stats = pd.read_csv('2021-2022_Football_Player_Stats.csv', encoding='latin1', sep=';')
fifa_data = pd.read_csv('FIFA23_official_data.csv', encoding='latin1')

def extract_last_name_from_full(full_name):
    if pd.isna(full_name):
        return ""
    parts = str(full_name).strip().split()
    return parts[-1].lower() if parts else ""

def extract_names_from_fifa(fifa_name):
    if pd.isna(fifa_name):
        return "", ""
    
    name_str = str(fifa_name).strip()
    name_str = re.sub(r'[^a-zA-Z\s\.]', '', name_str)
    parts = name_str.split()
    
    if len(parts) >= 2:
        first_name_abbr = parts[0].replace('.', '').lower()
        last_name = parts[-1].lower()
        return first_name_abbr, last_name
    elif len(parts) == 1:
        return "", parts[0].lower()
    else:
        return "", ""

player_stats['last_name_temp'] = player_stats['Player'].apply(extract_last_name_from_full)
fifa_data['last_name_temp'] = fifa_data['Name'].apply(lambda x: extract_names_from_fifa(x)[1])

common_last_names = set(player_stats['last_name_temp']).intersection(set(fifa_data['last_name_temp']))

def find_best_matches():
    matches = []
    
    for last_name in common_last_names:
        stats_players = player_stats[player_stats['last_name_temp'] == last_name]
        fifa_players = fifa_data[fifa_data['last_name_temp'] == last_name]
        
        for _, stats_row in stats_players.iterrows():
            stats_full_name = stats_row['Player']
            stats_first_initial = stats_full_name[0].lower() if stats_full_name else ""
            
            for _, fifa_row in fifa_players.iterrows():
                fifa_full_name = fifa_row['Name']
                fifa_first_abbr = extract_names_from_fifa(fifa_full_name)[0]
                
                if (stats_first_initial == fifa_first_abbr or 
                    len(stats_players) == 1 or 
                    len(fifa_players) == 1):
                    
                    matches.append({
                        'stats_name': stats_full_name,
                        'fifa_name': fifa_full_name,
                        'last_name': last_name,
                        'stats_index': stats_row.name,
                        'fifa_index': fifa_row.name,
                        'confidence': 'high' if stats_first_initial == fifa_first_abbr else 'medium'
                    })
    
    return matches

best_matches = find_best_matches()

if len(best_matches) > 0:
    stats_indices = [match['stats_index'] for match in best_matches]
    fifa_indices = [match['fifa_index'] for match in best_matches]
    
    player_stats_filtered = player_stats.loc[stats_indices].drop('last_name_temp', axis=1)
    fifa_data_filtered = fifa_data.loc[fifa_indices].drop('last_name_temp', axis=1)

    for i in range(min(3, len(player_stats_filtered))):
        player_name = player_stats_filtered['Player'].iloc[i]
        goals = player_stats_filtered['Goals'].iloc[i] if 'Goals' in player_stats_filtered.columns else 'N/A'
        assists = player_stats_filtered['Assists'].iloc[i] if 'Assists' in player_stats_filtered.columns else 'N/A'
        minutes = player_stats_filtered['Min'].iloc[i] if 'Min' in player_stats_filtered.columns else 'N/A'
        
        fifa_match = fifa_data_filtered[fifa_data_filtered['Name'].str.contains(player_name.split()[-1], na=False)]
        fifa_overall = fifa_match['Overall'].iloc[0] if len(fifa_match) > 0 else 'N/A'
    
    player_stats_filtered.to_csv('player_stats_complete_filtered.csv', index=False)
    fifa_data_filtered.to_csv('fifa_data_complete_filtered.csv', index=False)
    
    mapping_df = pd.DataFrame(best_matches)
    
    combined_data = player_stats_filtered.copy()

In [8]:
player_stats_filtered = pd.read_csv('player_stats_complete_filtered.csv')
fifa_data_filtered = pd.read_csv('fifa_data_complete_filtered.csv')

fifa_duplicates = fifa_data_filtered[fifa_data_filtered.duplicated(subset=['Name'], keep=False)]

if len(fifa_duplicates) > 0:
    fifa_data_clean = fifa_data_filtered.sort_values('Overall', ascending=False).drop_duplicates(subset=['Name'], keep='first')
else:
    fifa_data_clean = fifa_data_filtered.copy()

stats_duplicates = player_stats_filtered[player_stats_filtered.duplicated(subset=['Player'], keep=False)]

if len(stats_duplicates) > 0:
    def merge_duplicate_players(duplicates_df):
        merged_players = []
        
        for player_name in duplicates_df['Player'].unique():
            player_entries = duplicates_df[duplicates_df['Player'] == player_name]
            
            if len(player_entries) == 1:
                merged_players.append(player_entries.iloc[0])
                continue
            
            merged_player = player_entries.iloc[0].copy()
            
            sum_cols = ['MP', 'Starts', 'Min', 'Goals', 'Assists', 'Shots', 'SoT', 'PKatt', 
                       'PasTotAtt', 'PasShoAtt', 'PasMedAtt', 'PasLonAtt', 'PasAss',
                       'Tkl', 'TklWon', 'Press', 'Blocks', 'Int', 'Clr', 'Touches',
                       'DriAtt', 'Carries', 'CrdY', 'CrdR', 'Fls', 'Off', 'AerWon', 'AerLost']
            
            for col in sum_cols:
                if col in player_entries.columns:
                    merged_player[col] = player_entries[col].sum()
            
            avg_cols = ['SoT%', 'G/Sh', 'G/SoT', 'PasTotCmp%', 'PasShoCmp%', 'PasMedCmp%', 
                       'PasLonCmp%', 'TklDri%', 'Press%', 'DriSucc%', 'Rec%', 'AerWon%']
            
            for col in avg_cols:
                if col in player_entries.columns:
                    merged_player[col] = player_entries[col].mean()
            
            weighted_cols = ['ShoDist', 'PasTotDist', 'PasTotPrgDist', 'CarTotDist', 'CarPrgDist']
            
            for col in weighted_cols:
                if col in player_entries.columns and 'Min' in player_entries.columns:
                    total_minutes = player_entries['Min'].sum()
                    if total_minutes > 0:
                        weighted_avg = (player_entries[col] * player_entries['Min']).sum() / total_minutes
                        merged_player[col] = weighted_avg
            
            if 'Squad' in player_entries.columns:
                unique_squads = player_entries['Squad'].unique()
                if len(unique_squads) > 1:
                    merged_player['Squad'] = ' / '.join(unique_squads)
            
            if 'Min' in merged_player and merged_player['Min'] > 0:
                merged_player['90s'] = merged_player['Min'] / 90
            
            merged_players.append(merged_player)
        
        return pd.DataFrame(merged_players)
    
    unique_players = player_stats_filtered.drop_duplicates(subset=['Player'], keep=False)
    player_stats_clean = pd.concat([
        unique_players,
        merge_duplicate_players(stats_duplicates)
    ], ignore_index=True)
    
else:
    player_stats_clean = player_stats_filtered.copy()

def get_last_name(name):
    return str(name).split()[-1].lower() if pd.notna(name) else ""

player_stats_clean['last_name_temp'] = player_stats_clean['Player'].apply(get_last_name)
fifa_data_clean['last_name_temp'] = fifa_data_clean['Name'].apply(get_last_name)

common_players_after_clean = set(player_stats_clean['last_name_temp']).intersection(
    set(fifa_data_clean['last_name_temp'])
)

player_stats_final = player_stats_clean[
    player_stats_clean['last_name_temp'].isin(common_players_after_clean)
].drop('last_name_temp', axis=1)

fifa_data_final = fifa_data_clean[
    fifa_data_clean['last_name_temp'].isin(common_players_after_clean)
].drop('last_name_temp', axis=1)

player_stats_final.to_csv('player_stats_cleaned_separate.csv', index=False)

fifa_data_final.to_csv('fifa_data_cleaned_separate.csv', index=False)

mapping_data = []

for stats_idx, stats_row in player_stats_final.iterrows():
    stats_name = stats_row['Player']
    stats_last_name = get_last_name(stats_name)
    
    matching_fifa = fifa_data_final[
        fifa_data_final['Name'].apply(get_last_name) == stats_last_name
    ]
    
    if len(matching_fifa) > 0:
        fifa_name = matching_fifa['Name'].iloc[0]
        fifa_overall = matching_fifa['Overall'].iloc[0]
        
        mapping_data.append({
            'Stats_Player': stats_name,
            'FIFA_Player': fifa_name,
            'Last_Name': stats_last_name,
            'FIFA_Overall': fifa_overall,
            'Stats_Squad': stats_row['Squad'] if 'Squad' in stats_row else 'N/A',
            'FIFA_Club': matching_fifa['Club'].iloc[0] if 'Club' in matching_fifa.columns else 'N/A'
        })

mapping_df = pd.DataFrame(mapping_data)
mapping_df.to_csv('player_mapping_separate.csv', index=False)

In [9]:
player_stats_cleaned_separate = pd.read_csv('player_stats_cleaned_separate.csv')
player_stats_cleaned_separate

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,613,Patrick Cutrone,ITA,FW,Empoli,Serie A,24.0,1998,28,15,...,1.28,1.28,0.45,0.06,0.00,0.0,3.14,0.71,2.76,20.40
1,1403,Aleksandr Kokorin,RUS,FWMF,Fiorentina,Serie A,31.0,1991,6,0,...,0.00,0.00,0.00,0.00,0.00,0.0,1.67,8.33,8.33,50.00
2,2229,David Raya,ESP,GK,Brentford,Premier League,26.0,1995,24,24,...,0.00,0.00,0.00,0.00,0.04,0.0,5.25,0.00,0.00,0.00
3,676,Jason Denayer,BEL,DF,Lyon,Ligue 1,26.0,1995,15,12,...,0.00,0.09,0.54,0.00,0.09,0.0,6.67,1.62,1.89,46.20
4,2855,Hannes Wolf,AUT,FWMF,M'Gladbach,Bundesliga,23.0,1999,7,2,...,0.00,1.18,2.94,0.00,0.00,0.0,8.82,0.00,2.06,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1775,1291,Curtis Jones,ENG,MF,Liverpool,Premier League,21.0,2001,30,20,...,0.22,0.21,0.95,0.00,0.00,0.0,8.00,1.48,2.10,41.20
1776,1292,Phil Jones,ENG,DF,Manchester Utd,Premier League,30.0,1992,8,4,...,0.00,0.00,2.22,0.00,0.00,0.0,6.67,1.12,0.00,100.00
1777,1554,Maxime Lopez,FRA,MF,Sassuolo,Serie A,24.0,1997,70,66,...,0.00,0.37,1.38,0.00,0.00,0.0,9.17,0.42,0.56,43.80
1778,1603,Giangiacomo Magnani,ITA,DF,Sampdoria / Hellas Verona,Serie A,26.0,1995,18,8,...,0.00,0.00,0.74,0.00,0.00,0.0,5.19,7.20,1.53,86.75


In [10]:
fifa_data_cleaned_separate = pd.read_csv('fifa_data_cleaned_separate.csv')
fifa_data_cleaned_separate

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Real Face,Position,Joined,Loaned From,Contract Valid Until,Height,Weight,Release Clause,Kit Number,Best Overall Rating
0,192985,K. De Bruyne,31,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,https://cdn.sofifa.net/flags/be.png,91,91,Manchester City,https://cdn.sofifa.net/teams/10/30.png,...,Yes,"<span class=""pos pos13"">RCM","Aug 30, 2015",,2025,181cm,70kg,â¬198.9M,17.0,
1,165153,K. Benzema,34,https://cdn.sofifa.net/players/165/153/23_60.png,France,https://cdn.sofifa.net/flags/fr.png,91,91,Real Madrid CF,https://cdn.sofifa.net/teams/243/30.png,...,Yes,"<span class=""pos pos21"">CF","Jul 9, 2009",,2023,185cm,81kg,â¬131.2M,9.0,
2,158023,L. Messi,35,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,https://cdn.sofifa.net/flags/ar.png,91,91,Paris Saint-Germain,https://cdn.sofifa.net/teams/73/30.png,...,Yes,"<span class=""pos pos23"">RW","Aug 10, 2021",,2023,169cm,67kg,â¬99.9M,30.0,
3,188545,R. Lewandowski,33,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,https://cdn.sofifa.net/flags/pl.png,91,91,FC Barcelona,https://cdn.sofifa.net/teams/241/30.png,...,Yes,"<span class=""pos pos25"">ST","Jul 18, 2022",,2025,185cm,81kg,â¬172.2M,9.0,
4,192119,T. Courtois,30,https://cdn.sofifa.net/players/192/119/23_60.png,Belgium,https://cdn.sofifa.net/flags/be.png,90,91,Real Madrid CF,https://cdn.sofifa.net/teams/243/30.png,...,Yes,"<span class=""pos pos0"">GK","Aug 9, 2018",,2026,199cm,96kg,â¬191.3M,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2658,261064,22Â Shin Jung,19,https://cdn.sofifa.net/players/261/064/22_60.png,Korea Republic,https://cdn.sofifa.net/flags/kr.png,48,63,Daegu FC,https://cdn.sofifa.net/teams/2056/30.png,...,No,"<span class=""pos pos29"">RES","Feb 1, 2021",,2025,175cm,71kg,â¬206K,40.0,
2659,259219,22Â N. Dias,22,https://cdn.sofifa.net/players/259/219/22_60.png,India,https://cdn.sofifa.net/flags/in.png,47,54,FC Goa,https://cdn.sofifa.net/teams/113298/30.png,...,No,"<span class=""pos pos29"">RES","Jul 1, 2018",,2022,180cm,61kg,â¬140K,40.0,
2660,255856,S. Mandi,20,https://cdn.sofifa.net/players/255/856/23_60.png,India,https://cdn.sofifa.net/flags/in.png,47,58,Jamshedpur FC,https://cdn.sofifa.net/teams/114168/30.png,...,No,"<span class=""pos pos28"">SUB","Jan 6, 2020",,2023,174cm,73kg,â¬155K,19.0,
2661,261818,22Â R. Gallagher,20,https://cdn.sofifa.net/players/261/818/22_60.png,Republic of Ireland,https://cdn.sofifa.net/flags/ie.png,47,61,Finn Harps,https://cdn.sofifa.net/teams/111131/30.png,...,No,"<span class=""pos pos29"">RES","Jan 1, 2018",,2022,180cm,74kg,â¬239K,29.0,


In [11]:
player_stats = pd.read_csv('player_stats_cleaned_separate.csv')
fifa_data = pd.read_csv('fifa_data_cleaned_separate.csv')

def get_last_name(name):
    return str(name).split()[-1].lower() if pd.notna(name) else ""

player_stats['last_name_temp'] = player_stats['Player'].apply(get_last_name)
fifa_data['last_name_temp'] = fifa_data['Name'].apply(get_last_name)

def find_strict_matches():
    matches = []
    
    for stats_idx, stats_row in player_stats.iterrows():
        stats_name = stats_row['Player']
        stats_last_name = stats_row['last_name_temp']
        stats_first_initial = stats_name[0].lower() if stats_name else ""
        
        possible_fifa_matches = fifa_data[fifa_data['last_name_temp'] == stats_last_name]
        
        for fifa_idx, fifa_row in possible_fifa_matches.iterrows():
            fifa_name = fifa_row['Name']
            fifa_first_initial = fifa_name[0].lower() if fifa_name else ""
            
            if (stats_first_initial == fifa_first_initial or 
                len(possible_fifa_matches) == 1):
                
                matches.append({
                    'stats_name': stats_name,
                    'fifa_name': fifa_name,
                    'last_name': stats_last_name,
                    'stats_index': stats_idx,
                    'fifa_index': fifa_idx,
                    'confidence': 'high' if stats_first_initial == fifa_first_initial else 'unique_lastname'
                })
                break  
    
    return matches

strict_matches = find_strict_matches()

if len(strict_matches) > 0:
    stats_indices = [match['stats_index'] for match in strict_matches]
    fifa_indices = [match['fifa_index'] for match in strict_matches]
    
    duplicate_stats = len(stats_indices) != len(set(stats_indices))
    duplicate_fifa = len(fifa_indices) != len(set(fifa_indices))
    
    if duplicate_stats or duplicate_fifa:
        seen_stats = set()
        seen_fifa = set()
        unique_matches = []
        
        for match in strict_matches:
            if (match['stats_index'] not in seen_stats and 
                match['fifa_index'] not in seen_fifa):
                
                seen_stats.add(match['stats_index'])
                seen_fifa.add(match['fifa_index'])
                unique_matches.append(match)
        
        strict_matches = unique_matches
    
    stats_indices = [match['stats_index'] for match in strict_matches]
    fifa_indices = [match['fifa_index'] for match in strict_matches]
    
    final_player_stats = player_stats.loc[stats_indices].drop('last_name_temp', axis=1)
    final_fifa_data = fifa_data.loc[fifa_indices].drop('last_name_temp', axis=1)
    
    if len(final_player_stats) != len(final_fifa_data):
        stats_players = set(final_player_stats['Player'])
        fifa_last_names = set(final_fifa_data['Name'].apply(get_last_name))
        
        problem_players = []
        for player in stats_players:
            if get_last_name(player) not in fifa_last_names:
                problem_players.append(player)
    
    validation_data = []
    for match in strict_matches:
        stats_player = final_player_stats[final_player_stats['Player'] == match['stats_name']].iloc[0]
        fifa_player = final_fifa_data[final_fifa_data['Name'] == match['fifa_name']].iloc[0]
        
        validation_data.append({
            'Stats_Player': match['stats_name'],
            'FIFA_Player': match['fifa_name'], 
            'Last_Name': match['last_name'],
            'Stats_Squad': stats_player['Squad'] if 'Squad' in stats_player else 'N/A',
            'FIFA_Club': fifa_player['Club'] if 'Club' in fifa_player else 'N/A',
            'FIFA_Overall': fifa_player['Overall'],
            'Confidence': match['confidence'],
            'VALIDIERT': '✅' 
        })
    
    validation_df = pd.DataFrame(validation_data)
    
    final_player_stats.to_csv('strict_player_stats.csv', index=False)

    final_fifa_data.to_csv('strict_fifa_data.csv', index=False)

    validation_df.to_csv('strict_validation.csv', index=False)

    stats_last_names = set(final_player_stats['Player'].apply(get_last_name))
    fifa_last_names = set(final_fifa_data['Name'].apply(get_last_name))
    
    missing_in_fifa = stats_last_names - fifa_last_names
    missing_in_stats = fifa_last_names - stats_last_names

player_stats = player_stats.drop('last_name_temp', axis=1)
fifa_data = fifa_data.drop('last_name_temp', axis=1)