# Data Warehouse Project
<p style="text-align: left"><i>Gallo Giovanni 252308</i></p>

# Data cleaning

### **Imports and Utility functions**

In [1]:
import os
import re
from typing import List

import numpy as np
import pandas as pd
import rich
import rich.console
import rich.table
from sqlalchemy import create_engine, text


def print_info(df, check: bool, title: str = None):
    """Print a per-column summary of ``df`` as a rich table.

    One row is emitted per column, holding the column name, the dtype name
    and the percentage of null values formatted with two decimals.

    Args:
        df: DataFrame to summarise.
        check: When true, ``title`` is used as the table title; otherwise
            the table is rendered without a title.
        title: Caption for the table; ignored unless ``check`` is true.
    """
    total_rows = df.shape[0]

    summary = rich.table.Table(title=title) if check else rich.table.Table()
    for header in ('Name', 'Type', 'Null Values %'):
        summary.add_column(header)

    for name in df.columns.tolist():
        null_pct = df[name].isna().sum() / total_rows * 100
        summary.add_row(name, df[name].dtype.name, f"{null_pct:.2f}")

    rich.console.Console().print(summary)


def count_unicode_escaped(df: pd.DataFrame, columns: List[str] = None) -> int:
    r"""Count the cells that contain at least one literal ``\uXXXX`` escape.

    A cell is counted once no matter how many escape sequences it holds.
    Every inspected value is converted with ``astype(str)`` before matching.

    Args:
        df: DataFrame to scan.
        columns: Columns to inspect. When ``None``, every column selected by
            ``select_dtypes(include='object')`` is scanned.

    Returns:
        The number of matching cells as a built-in ``int`` (the per-column
        sums are NumPy integers and are converted explicitly so the declared
        return type holds).
    """
    pattern = re.compile(r'\\u[0-9a-fA-F]{4}')

    if columns is None:
        columns = df.select_dtypes(include='object').columns.tolist()

    total = 0
    for col in columns:
        hits = df[col].astype(str).apply(lambda cell: pattern.search(cell) is not None)
        # .sum() yields a NumPy integer; normalise it to a plain int.
        total += int(hits.sum())

    return total


def decode_unicode_escaped(df: pd.DataFrame, columns: List[str] = None) -> pd.DataFrame:
    r"""Replace literal ``\uXXXX`` escape sequences with the characters they encode.

    Only ``str`` cells are rewritten, and only sequences made of ``\u``
    followed by exactly four hexadecimal digits are decoded. As a result:

    * missing values (``NaN``/``None``) and other non-string cells are left
      untouched instead of being turned into the strings ``'nan'``/``'None'``;
    * characters that are already non-ASCII (e.g. ``á``) are preserved,
      whereas a round trip through ``encode().decode('unicode_escape')``
      would reinterpret their UTF-8 bytes as Latin-1;
    * a stray backslash-u that is not a full escape cannot raise a
      ``UnicodeDecodeError``.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns: Columns to process. When ``None``, every column selected by
            ``select_dtypes(include='object')`` is processed.

    Returns:
        The same ``df`` object, to allow ``df = decode_unicode_escaped(df)``.
    """
    escape = re.compile(r'\\u([0-9a-fA-F]{4})')

    def _decode(cell):
        # Leave NaN/None and any non-text value exactly as it is.
        if not isinstance(cell, str):
            return cell
        return escape.sub(lambda m: chr(int(m.group(1), 16)), cell)

    if columns is None:
        columns = df.select_dtypes(include='object').columns.tolist()

    for col in columns:
        df[col] = df[col].map(_decode)

    return df

### **Reading Input Data**

Match_Event Table

In [2]:
# Load the match-event CSV; the literal text 'None' is parsed as a missing value.
matchEvent_df = pd.read_csv(
    'dataset/match_event.csv',
    na_values='None',
)
matchEvent_df.head()

Unnamed: 0,id,club_id,match_id,players_id,matchperiod,eventsec,eventname,action,modifier,x_begin,y_begin,x_end,y_end,is_success
0,26588,81,364,178,1H,2026.197574,Others on the ball,Touch,opportunity,97,58,95.0,68.0,
1,67599,57,339,1838,1H,387.095298,Others on the ball,Touch,opportunity,88,32,91.0,49.0,
2,106637,47,319,1572,2H,2364.7139,Others on the ball,Touch,opportunity,92,44,87.0,0.0,
3,142338,34,295,1486,2H,2092.651843,Others on the ball,Touch,opportunity,94,69,92.0,70.0,
4,209035,63,253,1261,2H,1451.561531,Others on the ball,Touch,opportunity,92,58,93.0,57.0,


Match Table

In [3]:
# Load the match CSV; the literal text 'None' is parsed as a missing value.
match_df = pd.read_csv(
    'dataset/match.csv',
    na_values='None',
)
match_df.head()

Unnamed: 0,id,dateutc,competition,season,venue,home_club,away_club,winner,goal_by_home_club,goal_by_away_club,referee_id
0,0,2018-05-20 18:45:00,Italian first division,2018,"""""",Lazio,Internazionale,Internazionale,2,3,356
1,1,2018-05-20 18:45:00,Italian first division,2018,MAPEI Stadium - Citt\u00e0 del Tricolore,Sassuolo,Roma,Roma,0,1,338
2,2,2018-05-20 16:00:00,Italian first division,2018,"""""",Cagliari,Atalanta,Cagliari,1,0,389
3,3,2018-05-20 16:00:00,Italian first division,2018,"""""",Chievo,Benevento,Chievo,1,0,107
4,4,2018-05-20 16:00:00,Italian first division,2018,"""""",Udinese,Bologna,Udinese,1,0,381


Player Table

In [4]:
# Load the player CSV; the literal text 'None' is parsed as a missing value.
player_df = pd.read_csv(
    'dataset/player.csv',
    na_values='None',
)
player_df.head(20)

Unnamed: 0,id,firstname,lastname,birthdate,country,position,foot,height
0,0,Harun,Tekin,1989-06-17,Turkey,Goalkeeper,right,187
1,1,Malang,Sarr,1999-01-23,France,Defender,left,182
2,2,Over,Mandanda,1998-10-26,France,Goalkeeper,"""""",176
3,3,Alfred John Momar,N'Diaye,1990-03-06,France,Midfielder,right,187
4,4,Ibrahima,Konat\u00e9,1999-05-25,France,Defender,right,192
5,5,Jasper,Cillessen,1989-04-22,Netherlands,Goalkeeper,right,185
6,6,Toby,Alderweireld,1989-03-02,Belgium,Defender,right,187
7,7,Jan,Vertonghen,1987-04-24,Belgium,Defender,left,189
8,8,Alexander,Djiku,1994-08-09,France,Defender,right,182
9,9,Christian,Dannemann Eriksen,1992-02-14,Denmark,Midfielder,right,180


Club Table

In [5]:
# Load the club CSV; the literal text 'None' is parsed as a missing value.
club_df = pd.read_csv(
    'dataset/club.csv',
    na_values='None',
)
club_df.head()

Unnamed: 0,id,name,officialname,country
0,0,Newcastle United,Newcastle United FC,England
1,1,Celta de Vigo,Real Club Celta de Vigo,Spain
2,2,Espanyol,Reial Club Deportiu Espanyol,Spain
3,3,Deportivo Alav\u00e9s,Deportivo Alav\u00e9s,Spain
4,4,Levante,Levante UD,Spain


Referee Table

In [6]:
# Load the referee CSV; the literal text 'None' is parsed as a missing value.
referee_df = pd.read_csv(
    'dataset/referee.csv',
    na_values='None',
)
referee_df.head()

Unnamed: 0,id,firstname,lastname,birthdate,country
0,0,Alexander,Guzmán Bonilla,"""""",Colombia
1,1,Simon,Bennett,"""""",England
2,2,Harm,Osmers,1985-01-28,Germany
3,3,Frederick,Assmuth,1977-12-20,Germany
4,4,Frédéric,Hebrard,"""""",France


## **Data Understanding**

#### **Data Dimension**

In [7]:
# Report rows, columns and total cell count for every table.
# The five tables share one loop instead of five copy-pasted stanzas, so the
# wording of the report cannot drift between tables.
for label, frame in (
    ('Match_event', matchEvent_df),
    ('Match', match_df),
    ('Player', player_df),
    ('Club', club_df),
    ('Referee', referee_df),
):
    rows, columns = frame.shape
    print(f'{label}: {rows} rows and {columns} columns, {frame.size} elements in total')

Match_event: 2845357 rows and 14 columns, 39834998 elements in total
Match: 1340 rows and 11 columns, 14740 elements in total
Player: 3603 rows and 8 columns, 28824 elements in total
Club: 142 rows and 4 columns, 568 elements in total
Referee: 626 rows and 5 columns, 3130 elements in total


#### **Attribute List**

In [8]:
# Column name, dtype and null percentage for each table, one rich table apiece.
for caption, frame in (
    ("Match_event", matchEvent_df),
    ("Match", match_df),
    ("Player", player_df),
    ("Club", club_df),
    ("Referee", referee_df),
):
    print_info(frame, True, caption)

## **Data Cleaning**

### Cleaning and Removal of Escaped Unicode Characters from CSV Files

This notebook demonstrates the process of cleaning and removing escaped Unicode characters (such as `\u00e9`, `\u2013`, etc.) from CSV files. These sequences often appear in Kaggle datasets due to encoding issues or improper file exports. <br>
The main goal is to detect and decode the escaped Unicode sequences present as raw string literals within the CSV fields, so that the correct characters (accents, special symbols, etc.) are restored, resulting in a clean and readable dataset.

In [9]:
# Count -> decode -> recount, applied identically to every table.
# Insertion order of the dict fixes the order of the printed report.
frames = {
    'club': club_df,
    'player': player_df,
    'referee': referee_df,
    'match': match_df,
    'matchEvent': matchEvent_df,
}

for label, frame in frames.items():
    print(f"\nAnalizzando: {label}")
    before = count_unicode_escaped(frame)
    print(f"Count caratteri unicode escape PRIMA: {before}")
    if before > 0:
        # Decode only when something was found, then recount to confirm.
        frame = decode_unicode_escaped(frame)
        frames[label] = frame
        after = count_unicode_escaped(frame)
    else:
        after = 0
    print(f"Count caratteri unicode escape DOPO: {after}")

# Rebind the notebook-level names to the (possibly decoded) frames.
club_df = frames['club']
player_df = frames['player']
referee_df = frames['referee']
match_df = frames['match']
matchEvent_df = frames['matchEvent']



Analizzando: club
Count caratteri unicode escape PRIMA: 27
Count caratteri unicode escape DOPO: 0

Analizzando: player
Count caratteri unicode escape PRIMA: 1391
Count caratteri unicode escape DOPO: 0

Analizzando: referee
Count caratteri unicode escape PRIMA: 0
Count caratteri unicode escape DOPO: 0

Analizzando: match
Count caratteri unicode escape PRIMA: 481
Count caratteri unicode escape DOPO: 0

Analizzando: matchEvent
Count caratteri unicode escape PRIMA: 0
Count caratteri unicode escape DOPO: 0


### Handling Invalid Height Values: Replacing Zeros with Nulls

This code identifies and replaces invalid height values in a dataset. Specifically, it targets cases where a height value of `0` is used to represent missing or unknown data — which is not realistic in human measurements. Keeping `0` would skew statistical analysis (e.g., average height), so we replace it with `NaN` to explicitly mark it as missing.

In [10]:
# A height of 0 is treated as a placeholder for "unknown" and turned into NaN.
heights_raw = player_df['height']

zero_count = (heights_raw == 0).sum()
print(f"Numero di valori '0' in 'height' prima della sostituzione: {zero_count}")

player_df['height'] = heights_raw.replace(0, np.nan)

# Counts every NaN now present in the column, not only the replaced zeros.
nan_count = player_df['height'].isna().sum()
print(f"Numero di valori 'NaN' in 'height' dopo della sostituzione: {nan_count}")

Numero di valori '0' in 'height' prima della sostituzione: 77
Numero di valori 'NaN' in 'height' dopo della sostituzione: 77


### Converting Empty-String Placeholders (`""`) to NaN in a DataFrame

In [11]:
# The placeholder is the two-character string made of two double quotes;
# every cell equal to it becomes NaN, in all five tables.
club_df, player_df, referee_df, match_df, matchEvent_df = (
    frame.replace('""', np.nan)
    for frame in (club_df, player_df, referee_df, match_df, matchEvent_df)
)

Rename `players_id` to `player_id` in `match_event` dataframe for DB consistency.

In [12]:
# Rename the column players_id to player_id; all other columns are unchanged.
matchEvent_df = matchEvent_df.rename({"players_id": "player_id"}, axis="columns")

### Foreign Key Consistency Check for Missing IDs

We check that every `match_id`, `player_id` and `club_id` referenced in the `match_event` data exists in the corresponding parent table (`match`, `player`, `club`), and we count the IDs that are missing. Rows of `match_event` that reference a missing ID are removed, to prevent foreign key violations during database loading and to ensure data integrity.

In [13]:
# For each foreign-key column of matchEvent_df, find the ids that have no
# matching 'id' in the parent table and drop the rows that reference them.
# One loop replaces three copy-pasted stanzas.
fk_checks = (
    ('match_id', match_df, 'match_df'),
    ('player_id', player_df, 'player_df'),
    ('club_id', club_df, 'club_df'),
)

for fk_col, parent_df, parent_name in fk_checks:
    missing_ids = set(matchEvent_df[fk_col]) - set(parent_df['id'])
    if not missing_ids:
        continue

    print(f"Attenzione, {len(missing_ids)} {fk_col} mancano in {parent_name}.")
    # Sets have no defined order; sorting makes the sample reproducible.
    print(f"Esempio di mancanti: {sorted(missing_ids)[:10]}")
    print("Eliminazione delle righe in matchEvent_df che non hanno l'id in corso...")
    matchEvent_df = matchEvent_df[~matchEvent_df[fk_col].isin(missing_ids)]
    print("Eliminazione effettuata!")

Attenzione, 486 match_id mancano in match_df.
Esempio di mancanti: [10, 12, 15, 22, 26, 36, 46, 51, 53, 55]
Eliminazione delle righe in matchEvent_df che non hanno l'id in corso...
Eliminazione effettuata!


#### **Attribute List** (after cleaning)

In [14]:
# Same per-column summary as before, now on the cleaned frames.
for caption, frame in (
    ("Match_event", matchEvent_df),
    ("Match", match_df),
    ("Player", player_df),
    ("Club", club_df),
    ("Referee", referee_df),
):
    print_info(frame, True, caption)

### Loading the Cleaned DataFrames into PostgreSQL Tables with pandas and SQLAlchemy

In [15]:
# Connection URL: taken from the RECONCILED_DB_URL environment variable when
# set, so real credentials need not live in the notebook. The fallback is the
# local development URL the notebook has always used.
DB_URL = os.environ.get(
    'RECONCILED_DB_URL',
    'postgresql://postgres:postgres@localhost:5432/reconciledDatabase',
)
engine = create_engine(DB_URL)

# Rows are written in batches rather than in one statement; this matters for
# the match-event table, which holds millions of rows.
CHUNK_ROWS = 100_000

# The original load order is kept: the tables matchevent refers to come first.
# NOTE(review): if_exists='append' adds rows on every run, so re-running this
# cell against an already loaded database inserts the data again.
for frame, frame_name, table in (
    (club_df, 'club_df', 'club'),
    (player_df, 'player_df', 'player'),
    (referee_df, 'referee_df', 'referee'),
    (match_df, 'match_df', 'match'),
    (matchEvent_df, 'matchEvent_df', 'matchevent'),
):
    frame.to_sql(table, engine, if_exists='append', index=False, chunksize=CHUNK_ROWS)
    print(f"✔️ {frame_name} loaded into {table}")

✔️ club_df loaded into club
✔️ player_df loaded into player
✔️ referee_df loaded into referee
✔️ match_df loaded into match
✔️ matchEvent_df loaded into matchevent


In [16]:
# Compare the row count of every database table with the length of the
# DataFrame that was loaded into it.
expected_counts = (
    ('club', club_df),
    ('player', player_df),
    ('referee', referee_df),
    ('match', match_df),
    ('matchEvent', matchEvent_df),
)

with engine.connect() as conn:
    for table, frame in expected_counts:
        # Table names come only from the fixed tuple above, never from input.
        actual = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
        expected = len(frame)
        # NOTE(review): '>=' also passes when the table holds more rows than
        # the frame (e.g. after the append-mode load was run twice); confirm
        # whether strict equality is wanted.
        if actual >= expected:
            print(f"✔️ Verified: {actual} rows in {table} (expected {expected})")
        else:
            print(f"⚠️ Warning: Only {actual} rows in {table} (expected {expected})")


✔️ Verified: 142 rows in club (expected 142)
✔️ Verified: 3603 rows in player (expected 3603)
✔️ Verified: 626 rows in referee (expected 626)
✔️ Verified: 1340 rows in match (expected1340)
✔️ Verified: 2090774 rows in matchEvent (expected 2090774)
