# Data Warehouse Project
<p style="text-align: left"><i>Gallo Giovanni 252308</i></p>

## Data quality

### **Imports and Utility functions**

In [11]:
import pandas as pd
import re


def is_valid_dateutc(x):
    """Return True when ``x`` is a timestamp written as ``YYYY-MM-DD HH:MM:SS``.

    Missing values (``None``, ``NaN``, ``NaT``) are reported as invalid:
    ``pd.to_datetime`` does not raise for them, it hands back ``None``/``NaT``,
    so the parsed result has to be inspected explicitly.

    Parameters
    ----------
    x : object
        Value to check, typically a single cell of a DataFrame column.

    Returns
    -------
    bool
        True only if the value parses with the expected format and is not null.
    """
    try:
        parsed = pd.to_datetime(x, format="%Y-%m-%d %H:%M:%S", errors='raise')
    except (ValueError, TypeError):
        return False
    if parsed is None:
        return False
    missing = pd.isna(parsed)
    # Scalar input gives a plain bool; list-like input gives an array of flags.
    return not (missing.any() if hasattr(missing, "any") else missing)
    
def is_valid_date(x):
    """Return True when ``x`` is a calendar date written as ``YYYY-MM-DD``.

    Missing values (``None``, ``NaN``, ``NaT``) are reported as invalid:
    ``pd.to_datetime`` does not raise for them, it hands back ``None``/``NaT``,
    so the parsed result has to be inspected explicitly.

    Parameters
    ----------
    x : object
        Value to check, typically a single cell of a DataFrame column.

    Returns
    -------
    bool
        True only if the value parses with the expected format and is not null.
    """
    try:
        parsed = pd.to_datetime(x, format="%Y-%m-%d", errors='raise')
    except (ValueError, TypeError):
        return False
    if parsed is None:
        return False
    missing = pd.isna(parsed)
    # Scalar input gives a plain bool; list-like input gives an array of flags.
    return not (missing.any() if hasattr(missing, "any") else missing)
    
# Characters that disqualify a text value: ASCII digits, the listed punctuation
# marks (including backslash, dot and double quote) and the euro/pound signs.
ESCAPE_PATTERN = re.compile(r'[0-9!@#\$%\^&\*\(\)_\+\=\{\}\[\]\\\|;:\",<>\.?/~`€£]')


def has_not_unicode_escape(val):
    """Tell whether the string form of ``val`` is free of ESCAPE_PATTERN characters.

    The value is converted with ``str`` first, so non-string inputs are checked
    on their textual representation.
    """
    text = str(val)
    hit = ESCAPE_PATTERN.search(text)
    return hit is None

##### Reading File

In [12]:
# Table name -> CSV file holding that table.
files = dict(
    club='dataset/club.csv',
    match='dataset/match.csv',
    player='dataset/player.csv',
    referee='dataset/referee.csv',
    match_event='dataset/match_event.csv',
)

# Load every table once; later cells look them up by table name.
dataframes = {table: pd.read_csv(files[table]) for table in files}

## Validation Rules

In [13]:
# Allowed values for the categorical columns checked by the validators.
VALID_POSITION_PLAYER = {
    "Goalkeeper",
    "Defender",
    "Midfielder",
    "Forward",
}
VALID_FOOT = {
    "right",
    "left",
}
VALID_MATCHPERIOD = {
    "1H",
    "2H",
}

VALID_EVENTNAME = {
    "Others on the ball", "Pass", "Duel", "Free Kick", "Shot",
    "Save attempt", "Foul", "Offside", "Goalkeeper leaving line", "Interruption",
}

VALID_ACTION = {
    "Touch", "Clearance", "Simple pass", "Ground attacking duel", "Free Kick",
    "Ground loose ball duel", "Shot", "Air duel", "Ground defending duel", "Hand pass",
    "Throw in", "High pass", "Launch", "Smart pass", "Cross",
    "Acceleration", "Reflexes", "Head pass", "Free kick cross", "Save attempt",
    "Free kick shot", "Corner", "Foul", "Goal kick", "Hand foul",
    "Goalkeeper leaving line", "Penalty", "Ball out of the field", "Violent Foul", "Protest",
    "Out of game foul", "Simulation", "Time lost foul", "Whistle", "Late card foul",
}

VALID_MODIFIER = {
    "opportunity", "missed ball", "won", "gc", "interception", "neutral",
    "counter_attack", "keyPass", "high", "gb", "through", "gr",
    "gbr", "glb", "Right", "Left", "gtr", "gt",
    "assist", "fairplay", "gl", "gtl", "lost", "blocked",
    "obr", "otr", "dangerous_ball_lost", "otl", "ot", "olb",
    "pr", "or", "pbr", "pt", "ol", "pl",
    "ptl", "ptr", "second_yellow_card", "Goal", "plb", "Feint",
    "head/body", "red_card", "own_goal", "yellow_card", "free_space_l", "take_on_r",
}

VALID_ISSUCCESS = {
    "t",
    "f",
}



# Validity rules: validators[table][column] is a predicate applied to every
# single value of that column; it returns True when the value is acceptable.


def _is_non_negative_int(x):
    """True for an ``int`` greater than or equal to zero."""
    return isinstance(x, int) and x >= 0


def _is_clean_text(x):
    """True for a string that is not blank and passes has_not_unicode_escape.

    Whitespace is stripped before the emptiness check for every text column,
    so a whitespace-only value is rejected everywhere, not only on some columns.
    """
    return isinstance(x, str) and len(x.strip()) > 0 and has_not_unicode_escape(x)


def _is_pitch_coordinate(x):
    """True for a number in the closed interval [0, 100]."""
    return isinstance(x, (int, float)) and 0 <= x <= 100


validators = {
    'club': {
        'id':           _is_non_negative_int,
        'name':         _is_clean_text,
        'officialname': _is_clean_text,
        'country':      _is_clean_text
    },
    'match': {
        'id':           _is_non_negative_int,
        'dateutc':      is_valid_dateutc,
        'competition':  _is_clean_text,
        'season':       lambda x: isinstance(x, int) and x in (2017, 2018),
        'venue':        _is_clean_text,
        'home_club':    _is_clean_text,
        'away_club':    _is_clean_text,
        'winner':       _is_clean_text,
        'goal_by_home_club':   _is_non_negative_int,
        'goal_by_away_club':   _is_non_negative_int,
        # NOTE(review): strict > 0 here, while the other id columns accept 0 - confirm this is intended.
        'referee_id':   lambda x: isinstance(x, int) and x > 0,
    },
    'player': {
        'id':           _is_non_negative_int,
        'firstname':    _is_clean_text,
        'lastname':     _is_clean_text,
        'birthdate':    is_valid_date,
        'country':      _is_clean_text,
        'position':     lambda x: x in VALID_POSITION_PLAYER,
        'foot':         lambda x: x in VALID_FOOT,
        'height':       lambda x: isinstance(x, int) and 150 <= x <= 250
    },
    'referee': {
        'id':           _is_non_negative_int,
        'firstname':    _is_clean_text,
        'lastname':     _is_clean_text,
        'birthdate':    is_valid_date,
        'country':      _is_clean_text
    },
    'match_event': {
        'id':           _is_non_negative_int,
        'match_id':     _is_non_negative_int,
        'players_id':   _is_non_negative_int,
        'club_id':      _is_non_negative_int,
        'matchperiod':  lambda x: x in VALID_MATCHPERIOD,
        # NOTE(review): an event at exactly second 0 is rejected by this rule - confirm this is intended.
        'eventsec':     lambda x: isinstance(x, (int, float)) and x > 0,
        'eventname':    lambda x: x in VALID_EVENTNAME,
        'action':       lambda x: x in VALID_ACTION,
        'modifier':     lambda x: x in VALID_MODIFIER,
        'x_begin':      _is_pitch_coordinate,
        'y_begin':      _is_pitch_coordinate,
        'x_end':        _is_pitch_coordinate,
        'y_end':        _is_pitch_coordinate,
        'is_success':   lambda x: x in VALID_ISSUCCESS
    }
}

# PRECISION_FUNCS: compare a value against ground truth or recompute it from the row.
# precision_funcs[table][column] is called as fn(value, row) and returns True
# when the stored value agrees with the rest of the row.


def _winner_matches_score(val, row):
    """Check that the recorded winner agrees with the goals scored.

    Parameters
    ----------
    val : object
        Value stored in the ``winner`` column.
    row : mapping
        The full match row; ``goal_by_home_club``, ``goal_by_away_club``,
        ``home_club`` and ``away_club`` are read from it.

    Returns
    -------
    bool
        True when the higher-scoring side is the recorded winner, or, for a
        tied score, when the recorded winner names neither club. False when
        the goal counts cannot be compared (for example missing values).
    """
    home_goals = row['goal_by_home_club']
    away_goals = row['goal_by_away_club']
    if home_goals > away_goals:
        return bool(val == row['home_club'])
    if away_goals > home_goals:
        return bool(val == row['away_club'])
    if home_goals == away_goals:
        # A draw has no winning club: any value naming one of the two sides is wrong.
        # NOTE(review): the exact draw encoding is not visible here - confirm against the data.
        return bool(val != row['home_club'] and val != row['away_club'])
    return False


precision_funcs = {
    'match': {
        'winner': _winner_matches_score
    }
}

# CONSISTENCY_FUNCS: rules across tables.
# consistency_funcs[table][column] receives the whole table and returns a
# boolean Series flagging the rows whose value exists in the parent table's id column.


def _references(column, parent_table):
    """Build a check that ``df[column]`` values appear in ``dataframes[parent_table]['id']``.

    ``dataframes`` is looked up only when the check runs, not when it is built.
    """
    def check(df):
        parent_ids = dataframes[parent_table]['id']
        return df[column].isin(parent_ids)
    return check


consistency_funcs = {
    'match': {
        'referee_id': _references('referee_id', 'referee'),
    },
    'match_event': {
        'club_id': _references('club_id', 'club'),
        'match_id': _references('match_id', 'match'),
        'players_id': _references('players_id', 'player'),
    },
}

### Compute Data Quality Index
The data quality indices computed for each column are:
- **Validity**: % of data whose values conform with the domain
- **Completeness**: % of not null data
- **Precision**: % of correct data
- **Consistency**: % of values that reference an existing row in the related table
- **Uniqueness**: % of distinct values (number of distinct values divided by the number of rows)

In [14]:
def _percentage(flags, digits=2):
    """Return the mean of ``flags`` as a rounded percentage, or NaN when ``flags`` is empty."""
    if len(flags) == 0:
        return float('nan')
    return round(flags.mean() * 100, digits)


def compute_data_quality(df, valid_funcs, prec_funcs, cons_funcs, large_table_rows=2000000):
    """Compute the data quality indices of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to analyse.
    valid_funcs : dict
        Column name -> predicate applied to each value (validity).
    prec_funcs : dict
        Column name -> ``fn(value, row)`` applied to each row (precision).
    cons_funcs : dict
        Column name -> ``fn(df)`` returning one boolean per row (consistency).
    large_table_rows : int, optional
        Tables with more rows than this get their uniqueness rounded to 4
        decimals instead of 2 (in this project that is the match_event table).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the keys ``column``,
        ``completeness_%``, ``uniqueness_%``, ``validity_%``, ``precision_%``
        and ``consistency_%``. An index without a rule for the column is
        ``None``; every computed index is NaN when ``df`` has no rows.
    """
    n = len(df)
    # Very large tables have tiny uniqueness ratios, so keep more decimals for them.
    uniqueness_digits = 4 if n > large_table_rows else 2
    results = []

    for col in df.columns:
        serie = df[col]

        # Completeness: share of non-null values.
        completeness = _percentage(serie.notna())

        # Uniqueness: distinct values over row count (guarded against an empty table).
        if n:
            uniqueness = round((serie.nunique() / n) * 100, uniqueness_digits)
        else:
            uniqueness = float('nan')

        # Validity: share of values accepted by the column predicate.
        validity_fn = valid_funcs.get(col)
        validity = _percentage(serie.map(validity_fn)) if validity_fn else None

        # Precision: share of rows whose value agrees with the rest of the row.
        precision_fn = prec_funcs.get(col)
        if not precision_fn:
            precision = None
        elif n == 0:
            # Nothing to evaluate row by row on an empty table.
            precision = float('nan')
        else:
            precision = _percentage(df.apply(lambda row: precision_fn(row[col], row), axis=1))

        # Consistency: share of rows satisfying the cross-table rule.
        consistency_fn = cons_funcs.get(col)
        consistency = _percentage(consistency_fn(df)) if consistency_fn else None

        results.append({
            'column': col,
            'completeness_%': completeness,
            'uniqueness_%': uniqueness,
            'validity_%': validity,
            'precision_%': precision,
            'consistency_%': consistency
        })

    return pd.DataFrame(results)

### Report Output

In [15]:
# Build one quality report per loaded table, keep it by table name and print it.
dq_reports = {}
for name, df in dataframes.items():
    # Tables without rules of a given kind fall back to an empty rule set.
    rule_sets = dict(
        valid_funcs=validators.get(name, {}),
        prec_funcs=precision_funcs.get(name, {}),
        cons_funcs=consistency_funcs.get(name, {}),
    )
    report = compute_data_quality(df, **rule_sets)
    dq_reports[name] = report
    print(f"\nData Quality per '{name}.csv' (in %):")
    print(report)


Data Quality per 'club.csv' (in %):
         column  completeness_%  uniqueness_%  validity_% precision_%  \
0            id           100.0        100.00      100.00        None   
1          name           100.0        100.00       91.55        None   
2  officialname           100.0        100.00       80.28        None   
3       country           100.0         31.69      100.00        None   

  consistency_%  
0          None  
1          None  
2          None  
3          None  

Data Quality per 'match.csv' (in %):
               column  completeness_%  uniqueness_%  validity_%  precision_%  \
0                  id           100.0        100.00      100.00          NaN   
1             dateutc           100.0         58.73      100.00          NaN   
2         competition           100.0          0.37      100.00          NaN   
3              season           100.0          0.15      100.00          NaN   
4               venue           100.0          7.09       78.81      