In [1]:
import pandas as pd
from datetime import datetime
import re

In [2]:
def convert_date(date_str):
    """Normalise a raw date value to a ``DD/MM/YYYY`` string.

    Two input layouts are recognised:

    * ``'10-Feb-57'`` -- day, abbreviated month name, two-digit year.
    * ``'30 January 2022 (2022-01-30)'`` -- long form followed by an ISO date
      in parentheses; only the parenthesised ISO part is parsed.

    Args:
        date_str: The raw cell value. Normally a string; missing cells
            (``None`` / float NaN) are accepted and passed through.

    Returns:
        The date formatted as ``'%d/%m/%Y'`` when one of the layouts matches
        and parses; the input unchanged when it is not a string or matches
        neither layout; the literal ``"Invalid Date Format"`` when a layout
        matches but the value is not a real date (e.g. ``'31-Feb-57'``).
    """
    # Missing cells arrive as float NaN or None; re.match would raise a
    # TypeError on them (not caught below), so hand them back untouched.
    if not isinstance(date_str, str):
        return date_str

    try:
        # For format like '10-Feb-57'
        if re.match(r'\d{1,2}-[A-Za-z]{3}-\d{2}', date_str):
            date_obj = datetime.strptime(date_str, '%d-%b-%y')
            # '%y' can resolve a two-digit year into the future (e.g. 57 -> 2057).
            # Any year later than the current one is moved back a century.
            if date_obj.year > datetime.now().year:
                date_obj = date_obj.replace(year=date_obj.year - 100)
            return date_obj.strftime('%d/%m/%Y')

        # For format like '30 January 2022 (2022-01-30)'
        elif re.match(r'\d{1,2}\s[A-Za-z]+\s\d{4}\s\(\d{4}-\d{2}-\d{2}\)', date_str):
            # Parse only the ISO date found between the parentheses.
            date_obj = datetime.strptime(date_str.split('(')[1].strip(')'), '%Y-%m-%d')
            return date_obj.strftime('%d/%m/%Y')

        else:
            return date_str  # Return the original string if it doesn't match the expected formats
    except ValueError:
        # The layout matched but strptime rejected the value.
        return "Invalid Date Format"

In [3]:
# Read the raw match table from disk.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# where that file exists.
past_games_df = pd.read_csv(
    "C:/Users/guygi/OneDrive/Bureau/concaf_analytics/datasets/Africa Cup of Nations Matches.csv"
)
# Replace the file's own header labels with the names used in the cells below.
# The list length must equal the number of columns that were read.
past_games_df.columns = [
    'Date',
    'HomeTeam',
    'AwayTeam',
    'HomeTeamGoal',
    'AwayTeamGoal',
    'Stage',
    'SpecialWinConditions',
]

In [4]:
# Coerce both goal columns to integers. Values that cannot be parsed as
# numbers become NaN under errors='coerce' and are then counted as 0 goals.
for goal_col in ('HomeTeamGoal', 'AwayTeamGoal'):
    numeric_goals = pd.to_numeric(past_games_df[goal_col], errors='coerce')
    past_games_df[goal_col] = numeric_goals.fillna(0).astype(int)

In [5]:
# Rewrite every entry of the 'Date' column by passing it through convert_date.
date_strings = past_games_df['Date']
past_games_df['Date'] = date_strings.apply(convert_date)

In [6]:
# Trim leading/trailing whitespace in the four text columns listed here
# (the list is explicit; columns are not selected by dtype).
text_columns = ('HomeTeam', 'AwayTeam', 'Stage', 'SpecialWinConditions')
for text_col in text_columns:
    trimmed = past_games_df[text_col].str.strip()
    past_games_df[text_col] = trimmed

In [7]:
# Unify team names: several labels in the data are mapped onto one name each.
# Four different labels all become 'Congo'.
replacements = dict.fromkeys(
    ['Congo-Kinshasa', 'Congo-Léopoldville', 'DR Congo', 'Zaire'],
    'Congo',
)
# The remaining one-to-one renames.
replacements.update({
    'Morocco': 'Maroc',
    'Tunisia': 'Tunisie',
    'Upper Volta': 'Burkina Faso',
})
# The same mapping applies to both sides of the fixture.
for team_col in ('HomeTeam', 'AwayTeam'):
    past_games_df[team_col] = past_games_df[team_col].replace(replacements)
past_games_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeTeamGoal,AwayTeamGoal,Stage,SpecialWinConditions
0,10/02/1957,Sudan,Egypt,1,2,Semifinals,
1,10/02/1957,Ethiopia,South Africa,0,0,Semifinals,Ethiopia wins due to disqualification of othe...
2,16/02/1957,Egypt,Ethiopia,4,0,Final,
3,22/05/1959,Egypt,Ethiopia,4,0,Final Tournament,
4,25/05/1959,Sudan,Ethiopia,1,0,Final Tournament,


In [8]:
# Normalise stage labels (not team names): spelling variants are unified and
# every lettered group collapses to the single label 'Group'.
replacements = {
    'Semifinals': 'Semi-finals',
    'Quarterfinals': 'Quarter-finals',
    'Final Tournament': 'Final',
    'Third-place match': 'Third place',
    'Third place play-off': 'Third place',
}
# 'Group A' through 'Group F' all map to 'Group'.
replacements.update({f'Group {letter}': 'Group' for letter in 'ABCDEF'})
normalised_stage = past_games_df['Stage'].replace(replacements)
past_games_df['Stage'] = normalised_stage
past_games_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeTeamGoal,AwayTeamGoal,Stage,SpecialWinConditions
0,10/02/1957,Sudan,Egypt,1,2,Semi-finals,
1,10/02/1957,Ethiopia,South Africa,0,0,Semi-finals,Ethiopia wins due to disqualification of othe...
2,16/02/1957,Egypt,Ethiopia,4,0,Final,
3,22/05/1959,Egypt,Ethiopia,4,0,Final,
4,25/05/1959,Sudan,Ethiopia,1,0,Final,


## Save past games

In [9]:
# Write the cleaned table to disk without the index column. 'utf-8-sig'
# prefixes the file with a byte-order mark.
# NOTE(review): machine-specific absolute path, like the input file.
past_games_df.to_csv(
    "C:/Users/guygi/OneDrive/Bureau/concaf_analytics/datasets/clean/PastGames.csv",
    encoding='utf-8-sig',
    index=False,
)