Hier entsteht das Projekt bla bla bla

In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
def load_and_preprocess_data(
    stats_path='2021-2022_Football_Player_Stats.csv',
    fifa_path='FIFA23_official_data.csv',
    min_minutes=200,
    min_matches=7,
):
    """Load both CSV files and apply the basic column/row filtering.

    Args:
        stats_path: Path of the semicolon-separated player statistics CSV.
        fifa_path: Path of the comma-separated FIFA data CSV.
        min_minutes: Minimum value of the ``Min`` column a player must have
            to be kept.
        min_matches: Minimum value of the ``MP`` column a player must have
            to be kept.

    Returns:
        A ``(player_stats, fifa_data)`` tuple of DataFrames. ``player_stats``
        only contains rows meeting both thresholds; ``fifa_data`` has the
        presentation/contract columns listed below removed.
    """
    player_stats = pd.read_csv(stats_path, encoding='latin1', sep=';')
    fifa_data = pd.read_csv(fifa_path, encoding='latin1')

    columns_to_remove_fifa = [
        'Photo', 'Club Logo', 'Flag', 'Real Face', 'Best Overall Rating',
        'Joined', 'Loaned From', 'Contract Valid Until', 'Release Clause', 'Kit Number',
    ]
    # errors='ignore' skips columns that are not present in this export.
    fifa_data = fifa_data.drop(columns=columns_to_remove_fifa, errors='ignore')

    enough_playing_time = (player_stats['Min'] >= min_minutes) & (player_stats['MP'] >= min_matches)
    player_stats = player_stats[enough_playing_time]

    return player_stats, fifa_data


def extract_last_name_from_full(full_name):
    """Return the lower-cased final whitespace-separated token of a name.

    Missing values (as judged by ``pd.isna``) and names without any
    non-whitespace characters yield an empty string.
    """
    if pd.isna(full_name):
        return ""

    tokens = str(full_name).split()
    if not tokens:
        return ""
    return tokens[-1].lower()


def extract_names_from_fifa(fifa_name):
    """Split a FIFA-style name into ``(first_name_abbr, last_name)``.

    Every character that is not a letter, whitespace or a dot is removed
    first (digits, hyphens, apostrophes, ...). Letters are recognised with
    ``str.isalpha`` so accented characters survive; an ASCII-only filter
    would turn e.g. "Mbappé" into "mbapp" and break the comparison with
    last names taken from the statistics data, which are not filtered.

    Args:
        fifa_name: Name value from the FIFA data; may be missing.

    Returns:
        A tuple of two lower-cased strings. With two or more tokens the
        first token (dots removed) and the last token are returned; with a
        single token the first element is empty; for missing or empty input
        both elements are empty.
    """
    if pd.isna(fifa_name):
        return "", ""

    name_str = str(fifa_name).strip()
    name_str = ''.join(
        ch for ch in name_str if ch.isalpha() or ch.isspace() or ch == '.'
    )
    parts = name_str.split()

    if len(parts) >= 2:
        first_name_abbr = parts[0].replace('.', '').lower()
        last_name = parts[-1].lower()
        return first_name_abbr, last_name
    if len(parts) == 1:
        return "", parts[0].lower()
    return "", ""


def get_last_name(name):
    """Return the lower-cased final whitespace-separated token of ``name``.

    Missing values (as judged by ``pd.isna``) and names that contain no
    non-whitespace characters yield an empty string instead of raising
    ``IndexError``.
    """
    if pd.isna(name):
        return ""
    parts = str(name).split()
    return parts[-1].lower() if parts else ""


def find_best_matches(player_stats, fifa_data):
    """Pair statistics rows with FIFA rows that share a last name.

    A pair is reported when the first initials agree, or when the last name
    is unambiguous on at least one side (exactly one statistics row or
    exactly one FIFA row carries it).

    Args:
        player_stats: DataFrame with a ``Player`` column.
        fifa_data: DataFrame with a ``Name`` column.

    Returns:
        A tuple ``(matches, player_stats_temp, fifa_data_temp)``. ``matches``
        is a list of dicts with the keys ``stats_name``, ``fifa_name``,
        ``last_name``, ``stats_index``, ``fifa_index`` and ``confidence``
        (``'high'`` when the initials agree, otherwise ``'medium'``). The
        two frames are copies of the inputs with an added
        ``last_name_temp`` column.
    """
    player_stats_temp = player_stats.copy()
    fifa_data_temp = fifa_data.copy()

    player_stats_temp['last_name_temp'] = player_stats_temp['Player'].apply(
        extract_last_name_from_full
    )
    fifa_data_temp['last_name_temp'] = fifa_data_temp['Name'].apply(
        lambda x: extract_names_from_fifa(x)[1]
    )

    # Bucket the FIFA rows by last name once instead of re-filtering the
    # frame for every name. Rows without a usable last name are skipped so
    # that missing names on both sides are never paired with each other.
    fifa_by_last_name = {}
    for fifa_index, fifa_name, last_name in zip(
        fifa_data_temp.index, fifa_data_temp['Name'], fifa_data_temp['last_name_temp']
    ):
        if not last_name:
            continue
        # Only the first letter is compared: the first token may be an
        # abbreviation ("L.") or a spelled-out first name ("Cristiano").
        fifa_initial = extract_names_from_fifa(fifa_name)[0][:1]
        fifa_by_last_name.setdefault(last_name, []).append(
            (fifa_index, fifa_name, fifa_initial)
        )

    stats_by_last_name = {}
    for stats_index, stats_name, last_name in zip(
        player_stats_temp.index,
        player_stats_temp['Player'],
        player_stats_temp['last_name_temp'],
    ):
        if last_name in fifa_by_last_name:
            stats_by_last_name.setdefault(last_name, []).append((stats_index, stats_name))

    matches = []

    # Sorted iteration keeps the order of the result reproducible between
    # runs (iterating a set of strings is not).
    for last_name in sorted(stats_by_last_name):
        stats_players = stats_by_last_name[last_name]
        fifa_players = fifa_by_last_name[last_name]
        unambiguous = len(stats_players) == 1 or len(fifa_players) == 1

        for stats_index, stats_full_name in stats_players:
            stats_first_initial = str(stats_full_name).strip()[:1].lower()

            for fifa_index, fifa_full_name, fifa_initial in fifa_players:
                same_initial = bool(stats_first_initial) and stats_first_initial == fifa_initial

                if same_initial or unambiguous:
                    matches.append({
                        'stats_name': stats_full_name,
                        'fifa_name': fifa_full_name,
                        'last_name': last_name,
                        'stats_index': stats_index,
                        'fifa_index': fifa_index,
                        'confidence': 'high' if same_initial else 'medium',
                    })

    return matches, player_stats_temp, fifa_data_temp


def remove_duplicates(player_stats_filtered, fifa_data_filtered):
    """Remove duplicated players from both data sets.

    FIFA data: when any ``Name`` occurs more than once, only the row with the
    highest ``Overall`` is kept per name. In that case the result is ordered
    by ``Overall`` descending; otherwise the original row order is kept. The
    sort is stable, so rows tied on ``Overall`` keep their input order and
    the earlier one wins.

    Statistics data: every row whose ``Player``/``Born``/``Nation``
    combination occurs more than once is removed entirely (no occurrence is
    kept).

    Args:
        player_stats_filtered: DataFrame with ``Player``, ``Born`` and
            ``Nation`` columns.
        fifa_data_filtered: DataFrame with ``Name`` and ``Overall`` columns.

    Returns:
        A ``(player_stats_clean, fifa_data_clean)`` tuple of new DataFrames.
    """
    if fifa_data_filtered.duplicated(subset=['Name']).any():
        fifa_data_clean = (
            fifa_data_filtered
            .sort_values('Overall', ascending=False, kind='mergesort')
            .drop_duplicates(subset=['Name'], keep='first')
        )
    else:
        fifa_data_clean = fifa_data_filtered.copy()

    # Comparing the identifying columns directly avoids building a string
    # key row by row.
    repeated_player = player_stats_filtered.duplicated(
        subset=['Player', 'Born', 'Nation'], keep=False
    )
    player_stats_clean = player_stats_filtered[~repeated_player].copy()

    return player_stats_clean, fifa_data_clean


def create_final_datasets(player_stats_clean, fifa_data_clean):
    """Keep only the rows whose last name occurs in both data sets.

    Last names are derived with ``get_last_name`` from the ``Player`` column
    of the statistics data and the ``Name`` column of the FIFA data.

    Returns:
        A ``(player_stats_final, fifa_data_final)`` tuple of new DataFrames
        without the temporary ``last_name_temp`` helper column.
    """
    stats_with_key = player_stats_clean.assign(
        last_name_temp=player_stats_clean['Player'].apply(get_last_name)
    )
    fifa_with_key = fifa_data_clean.assign(
        last_name_temp=fifa_data_clean['Name'].apply(get_last_name)
    )

    # Last names present on both sides.
    shared_last_names = set(stats_with_key['last_name_temp']) & set(fifa_with_key['last_name_temp'])

    stats_mask = stats_with_key['last_name_temp'].isin(shared_last_names)
    fifa_mask = fifa_with_key['last_name_temp'].isin(shared_last_names)

    player_stats_final = stats_with_key[stats_mask].drop(columns=['last_name_temp'])
    fifa_data_final = fifa_with_key[fifa_mask].drop(columns=['last_name_temp'])

    return player_stats_final, fifa_data_final


def create_mapping(player_stats_final, fifa_data_final):
    """Build a table mapping each statistics player to a FIFA player.

    For every statistics row, the first FIFA row (in frame order) with the
    same last name, as computed by ``get_last_name``, is used. Statistics
    rows without such a FIFA row are left out.

    Args:
        player_stats_final: DataFrame with a ``Player`` column and
            optionally a ``Squad`` column.
        fifa_data_final: DataFrame with ``Name`` and ``Overall`` columns and
            optionally a ``Club`` column.

    Returns:
        DataFrame with the columns ``Stats_Player``, ``FIFA_Player``,
        ``Last_Name``, ``FIFA_Overall``, ``Stats_Squad`` and ``FIFA_Club``.
        Missing ``Squad``/``Club`` columns are reported as ``'N/A'``. The
        columns are present even when no row could be mapped.
    """
    columns = [
        'Stats_Player', 'FIFA_Player', 'Last_Name',
        'FIFA_Overall', 'Stats_Squad', 'FIFA_Club',
    ]

    fifa_names = fifa_data_final['Name']
    fifa_overalls = fifa_data_final['Overall']
    fifa_clubs = fifa_data_final['Club'] if 'Club' in fifa_data_final.columns else None

    # Position of the first FIFA row per last name, computed once instead of
    # re-deriving every FIFA last name for each statistics row.
    first_position = {}
    for position, fifa_name in enumerate(fifa_names):
        first_position.setdefault(get_last_name(fifa_name), position)

    if 'Squad' in player_stats_final.columns:
        stats_squads = list(player_stats_final['Squad'])
    else:
        stats_squads = ['N/A'] * len(player_stats_final)

    mapping_data = []
    for stats_name, stats_squad in zip(player_stats_final['Player'], stats_squads):
        stats_last_name = get_last_name(stats_name)
        position = first_position.get(stats_last_name)
        if position is None:
            continue

        mapping_data.append({
            'Stats_Player': stats_name,
            'FIFA_Player': fifa_names.iloc[position],
            'Last_Name': stats_last_name,
            'FIFA_Overall': fifa_overalls.iloc[position],
            'Stats_Squad': stats_squad,
            'FIFA_Club': fifa_clubs.iloc[position] if fifa_clubs is not None else 'N/A',
        })

    return pd.DataFrame(mapping_data, columns=columns)

def find_strict_matches(player_stats, fifa_data):
    """Find at most one FIFA row for every statistics row.

    Candidates are the FIFA rows with the same last name (``get_last_name``).
    They are tried in frame order, and the first one is accepted whose first
    character equals the first character of the statistics name. When there
    is exactly one candidate it is accepted even if the initials differ.

    Rows whose last name is empty (missing names) are never matched.

    Args:
        player_stats: DataFrame with a ``Player`` column.
        fifa_data: DataFrame with a ``Name`` column.

    Returns:
        A tuple ``(matches, player_stats_temp, fifa_data_temp)``. ``matches``
        is a list of dicts with the keys ``stats_name``, ``fifa_name``,
        ``last_name``, ``stats_index``, ``fifa_index`` and ``confidence``
        (``'high'`` when the initials agree, otherwise
        ``'unique_lastname'``). The two frames are copies of the inputs with
        an added ``last_name_temp`` column. The same FIFA row may appear in
        several matches.
    """
    player_stats_temp = player_stats.copy()
    fifa_data_temp = fifa_data.copy()

    player_stats_temp['last_name_temp'] = player_stats_temp['Player'].apply(get_last_name)
    fifa_data_temp['last_name_temp'] = fifa_data_temp['Name'].apply(get_last_name)

    # Group the FIFA candidates by last name once, instead of filtering the
    # whole frame for every statistics row.
    fifa_by_last_name = {}
    for fifa_idx, fifa_name, last_name in zip(
        fifa_data_temp.index, fifa_data_temp['Name'], fifa_data_temp['last_name_temp']
    ):
        if not last_name:
            continue
        fifa_first_initial = str(fifa_name).strip()[:1].lower()
        fifa_by_last_name.setdefault(last_name, []).append(
            (fifa_idx, fifa_name, fifa_first_initial)
        )

    matches = []

    for stats_idx, stats_name, stats_last_name in zip(
        player_stats_temp.index,
        player_stats_temp['Player'],
        player_stats_temp['last_name_temp'],
    ):
        candidates = fifa_by_last_name.get(stats_last_name)
        if not candidates:
            continue

        stats_first_initial = str(stats_name).strip()[:1].lower()
        single_candidate = len(candidates) == 1

        for fifa_idx, fifa_name, fifa_first_initial in candidates:
            same_initial = stats_first_initial == fifa_first_initial

            if same_initial or single_candidate:
                matches.append({
                    'stats_name': stats_name,
                    'fifa_name': fifa_name,
                    'last_name': stats_last_name,
                    'stats_index': stats_idx,
                    'fifa_index': fifa_idx,
                    'confidence': 'high' if same_initial else 'unique_lastname',
                })
                # Only the first acceptable candidate is used per player.
                break

    return matches, player_stats_temp, fifa_data_temp


def _row_for_label(frame, label):
    """Return the row stored under ``label`` (the first one if it repeats)."""
    row = frame.loc[label]
    return row.iloc[0] if isinstance(row, pd.DataFrame) else row


def process_strict_matches(matches, player_stats, fifa_data):
    """Turn a list of matches into aligned data sets plus a validation table.

    Matches are processed in order; a match is discarded when its statistics
    row or its FIFA row was already used by an earlier match, so every row
    takes part in at most one match.

    Args:
        matches: List of dicts with the keys ``stats_name``, ``fifa_name``,
            ``last_name``, ``stats_index``, ``fifa_index`` and
            ``confidence``.
        player_stats: DataFrame the ``stats_index`` labels refer to.
        fifa_data: DataFrame the ``fifa_index`` labels refer to; needs an
            ``Overall`` column.

    Returns:
        ``(final_player_stats, final_fifa_data, validation_df)``, or
        ``(None, None, None)`` when ``matches`` is empty. The first two
        frames contain the matched rows in match order, without the
        ``last_name_temp`` helper column. ``validation_df`` has one row per
        kept match.
    """
    if len(matches) == 0:
        return None, None, None

    seen_stats = set()
    seen_fifa = set()
    unique_matches = []

    for match in matches:
        if match['stats_index'] in seen_stats or match['fifa_index'] in seen_fifa:
            continue
        seen_stats.add(match['stats_index'])
        seen_fifa.add(match['fifa_index'])
        unique_matches.append(match)

    stats_indices = [match['stats_index'] for match in unique_matches]
    fifa_indices = [match['fifa_index'] for match in unique_matches]

    # errors='ignore': the helper column is dropped when present but is not
    # required to exist.
    final_player_stats = player_stats.loc[stats_indices].drop(
        columns='last_name_temp', errors='ignore'
    )
    final_fifa_data = fifa_data.loc[fifa_indices].drop(
        columns='last_name_temp', errors='ignore'
    )

    has_squad = 'Squad' in final_player_stats.columns
    has_club = 'Club' in final_fifa_data.columns

    validation_data = []
    for match in unique_matches:
        # Look the rows up by their matched index, not by name: two different
        # players can share the same name, and a lookup by name would report
        # the first one's squad/club/rating for both.
        stats_player = _row_for_label(final_player_stats, match['stats_index'])
        fifa_player = _row_for_label(final_fifa_data, match['fifa_index'])

        validation_data.append({
            'Stats_Player': match['stats_name'],
            'FIFA_Player': match['fifa_name'],
            'Last_Name': match['last_name'],
            'Stats_Squad': stats_player['Squad'] if has_squad else 'N/A',
            'FIFA_Club': fifa_player['Club'] if has_club else 'N/A',
            'FIFA_Overall': fifa_player['Overall'],
            'Confidence': match['confidence'],
            # Check mark (U+2713); written as an escape so it cannot be
            # damaged by a wrong source-file encoding again.
            'VALIDIERT': '\u2713',
        })

    validation_df = pd.DataFrame(validation_data)

    return final_player_stats, final_fifa_data, validation_df


def main():
    """Run the matching pipeline and write the strict results to CSV files.

    Steps: load both data sets, collect loose last-name matches, remove
    duplicated players, reduce both sets to shared last names, compute the
    strict one-to-one matches and write ``strict_player_stats.csv``,
    ``strict_fifa_data.csv`` and ``strict_validation.csv``. Nothing is
    written when no match is found.
    """
    player_stats, fifa_data = load_and_preprocess_data()

    best_matches, player_stats_temp, fifa_data_temp = find_best_matches(player_stats, fifa_data)
    if len(best_matches) == 0:
        return

    # A statistics row can be matched to several FIFA candidates. Select each
    # row only once: repeated copies of the same row would otherwise look
    # like a player listed twice and be removed entirely by
    # remove_duplicates. dict.fromkeys keeps the first-seen order.
    stats_indices = list(dict.fromkeys(match['stats_index'] for match in best_matches))
    # Repeated FIFA rows are harmless: remove_duplicates keeps one row per
    # FIFA name.
    fifa_indices = [match['fifa_index'] for match in best_matches]

    player_stats_filtered = player_stats_temp.loc[stats_indices].drop('last_name_temp', axis=1)
    fifa_data_filtered = fifa_data_temp.loc[fifa_indices].drop('last_name_temp', axis=1)

    player_stats_clean, fifa_data_clean = remove_duplicates(player_stats_filtered, fifa_data_filtered)

    player_stats_final, fifa_data_final = create_final_datasets(player_stats_clean, fifa_data_clean)

    strict_matches, player_stats_strict, fifa_data_strict = find_strict_matches(
        player_stats_final, fifa_data_final
    )

    final_player_stats, final_fifa_data, validation_df = process_strict_matches(
        strict_matches, player_stats_strict, fifa_data_strict
    )
    if final_player_stats is None:
        return

    final_player_stats.to_csv('strict_player_stats.csv', index=False)
    final_fifa_data.to_csv('strict_fifa_data.csv', index=False)
    validation_df.to_csv('strict_validation.csv', index=False)

# Run the pipeline only when this module is the entry point (``__name__`` is
# "__main__"), not when it is imported from another module.
if __name__ == "__main__":
    main()