# 2b. ERROR DETECTION AND CORRECTION

This notebook handles:
1. **Missing Values Analysis** - Identify patterns of missing data
2. **Missing Values Imputation** - Fill missing values using domain knowledge and data patterns
3. **Outlier Detection** - Identify potential outliers in numerical columns
4. **Outlier Handling** - Decide on treatment strategies

## 2b.1 Imports and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 140)
%matplotlib inline

In [None]:
# Prefer the transformed dataset written by the previous step; when that file
# is missing (e.g. this notebook is run independently) fall back to the original.
try:
    MILANO = pd.read_csv("MILANO_transformed.csv", sep=";")
except FileNotFoundError:
    MILANO = pd.read_csv("Comune-di-Milano-Pubblici-esercizi(in)-2.csv", sep=";")
    print("Loaded original dataset")
else:
    print("Loaded transformed dataset")

print(f"Shape: {MILANO.shape}")
MILANO.head()

---
# 1. MISSING VALUES ANALYSIS

## 1.1 Overview of Missing Values

In [None]:
# Per-column tally of missing cells and the percentage of rows they represent
null_counts = MILANO.isna().sum()
null_pct = null_counts.div(len(MILANO)).mul(100).round(2)

missing_df = pd.DataFrame(
    {
        'Column': null_counts.index,
        'Missing Count': null_counts.to_numpy(),
        'Missing %': null_pct.to_numpy(),
    }
)
missing_df = missing_df.sort_values(by='Missing %', ascending=False)

# Show only the columns that actually have gaps
missing_df.loc[missing_df['Missing Count'] > 0]

In [None]:
# Horizontal bar chart of the missing percentage, restricted to columns with gaps
with_missing = missing_df[missing_df['Missing Count'] > 0]
missing_cols = with_missing['Column'].tolist()

if missing_cols:
    plt.figure(figsize=(10, 6))
    plt.barh(with_missing['Column'], with_missing['Missing %'])
    plt.xlabel('Missing %')
    plt.title('Missing Values by Column')
    plt.tight_layout()
    plt.show()

---
# 2. MISSING VALUES IMPUTATION

## 2.1 Handle 'Insegna' (Business Name)

The `Insegna` column has ~50% missing values. Since we cannot infer business names, we fill them with the placeholder `'unknown'`.

In [None]:
if 'Insegna' in MILANO.columns:
    # Count the gaps, replace them with the 'unknown' placeholder, then recount
    insegna_na_before = MILANO['Insegna'].isna().sum()
    print(f"Insegna - Missing before: {insegna_na_before}")

    MILANO['Insegna'] = MILANO['Insegna'].fillna('unknown')

    insegna_na_after = MILANO['Insegna'].isna().sum()
    print(f"Insegna - Missing after: {insegna_na_after}")

## 2.2 Handle 'Superficie somministrazione'

Fill missing surface values with the mean.

In [None]:
sup_col = 'Superficie somministrazione'

if sup_col in MILANO.columns:
    # Coerce to numbers first: anything unparseable becomes NaN and is
    # therefore imputed together with the genuinely missing cells.
    superficie = pd.to_numeric(MILANO[sup_col], errors='coerce')
    print(f"{sup_col} - Missing before: {superficie.isna().sum()}")

    # The mean is computed over the observed (non-NaN) values only
    superficie_mean = superficie.mean()
    MILANO[sup_col] = superficie.fillna(superficie_mean)

    print(f"{sup_col} - Missing after: {MILANO[sup_col].isna().sum()}")
    print(f"Mean used for imputation: {superficie_mean:.2f}")

## 2.3 Handle 'Forma commercio prev' using Macro-Category

We use the relationship between macro-categories and commerce forms to impute missing values, but only when confidence (the share of a macro-category's known rows that carry its most frequent commerce form) is >= 80%.

In [None]:
prev_col = 'Forma commercio prev'
macro_col = 'Tipo_macro' if 'Tipo_macro' in MILANO.columns else None
soglia = 0.80  # minimum share of the dominant value required before imputing

if prev_col in MILANO.columns and macro_col:
    # Normalize text. astype(str) turns missing cells into the literal 'nan',
    # which is mapped back to NaN afterwards. The macro column is upper-cased,
    # so the placeholder to look for there is 'NAN'; matching 'nan' would leave
    # missing macro-categories behind as a bogus 'NAN' category.
    MILANO[prev_col] = MILANO[prev_col].astype(str).str.strip().str.lower().replace('nan', np.nan)
    MILANO[macro_col] = MILANO[macro_col].astype(str).str.strip().str.upper().replace('NAN', np.nan)

    # Co-occurrence counts over the rows where both values are known
    known = MILANO[[macro_col, prev_col]].dropna(subset=[macro_col, prev_col])
    counts = (
        known.groupby([macro_col, prev_col])
             .size()
             .rename('n')
             .reset_index()
    )

    # conf = share of a macro-category's known rows that carry this prev value
    counts['tot'] = counts.groupby(macro_col)['n'].transform('sum')
    counts['conf'] = counts['n'] / counts['tot']

    # One row per macro-category: its dominant prev value and that value's share
    best = (
        counts.sort_values(['conf', 'n'], ascending=False)
              .drop_duplicates(subset=[macro_col])
              [[macro_col, prev_col, 'n', 'tot', 'conf']]
              .sort_values('conf', ascending=False)
    )

    print(f"Macro-categories with confidence >= {soglia}:")
    display(best[best['conf'] >= soglia])

    # Imputation: look up each row's dominant value and confidence by macro-category.
    # Kept as local Series so no temporary columns are ever added to MILANO.
    mode_map = dict(zip(best[macro_col], best[prev_col]))
    conf_map = dict(zip(best[macro_col], best['conf']))
    macro_mode = MILANO[macro_col].map(mode_map)
    macro_conf = MILANO[macro_col].map(conf_map)

    prev_na_before = MILANO[prev_col].isna().sum()

    # Rows with an unknown macro-category have NaN confidence and are never filled
    mask_fill = MILANO[prev_col].isna() & (macro_conf >= soglia)
    MILANO.loc[mask_fill, prev_col] = macro_mode.loc[mask_fill]

    prev_na_after = MILANO[prev_col].isna().sum()

    print(f"\n{prev_col} - Missing before: {prev_na_before}")
    print(f"{prev_col} - Imputed rows: {mask_fill.sum()}")
    print(f"{prev_col} - Missing after: {prev_na_after}")

## 2.4 Handle 'Forma commercio' using 'Forma commercio prev'

In [None]:
new_col = 'Forma commercio'
prev_col = 'Forma commercio prev'

if new_col in MILANO.columns and prev_col in MILANO.columns:
    # Same spelling for both columns: trimmed, lower-case, with the 'nan'
    # produced by astype(str) turned back into a real missing value.
    for text_col in (new_col, prev_col):
        MILANO[text_col] = (
            MILANO[text_col].astype(str).str.strip().str.lower().replace('nan', np.nan)
        )

    # Row-normalised contingency table: for each prev value, the share of each new value
    dist = pd.crosstab(MILANO[prev_col], MILANO[new_col], normalize='index')
    new_mode = dist.idxmax(axis=1)
    conf = dist.max(axis=1)

    print("Mapping prev -> new (with confidence):")
    display(pd.DataFrame({'mode': new_mode, 'confidence': conf}))

    # A row is imputed only if new is missing, prev is known, and the dominant
    # new value for that prev reaches the confidence threshold `soglia`.
    new_is_missing = MILANO[new_col].isna()
    prev_is_known = MILANO[prev_col].notna()
    prev_confidence = MILANO[prev_col].map(conf)
    mask_fill = new_is_missing & prev_is_known & (prev_confidence >= soglia)

    new_na_before = new_is_missing.sum()
    MILANO.loc[mask_fill, new_col] = MILANO.loc[mask_fill, prev_col].map(new_mode)
    new_na_after = MILANO[new_col].isna().sum()

    print(f"\n{new_col} - Missing before: {new_na_before}")
    print(f"{new_col} - Imputed rows: {mask_fill.sum()}")
    print(f"{new_col} - Missing after: {new_na_after}")

## 2.5 Handle 'Forma vendita'

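Since there's no strong deterministic relationship to impute from, we treat missing as a valid category: a new column `Forma vendita_filled` stores the explicit value `'non dichiarata'` wherever the original is missing.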

In [None]:
vend_col = 'Forma vendita'

if vend_col in MILANO.columns:
    filled_col = vend_col + '_filled'
    print(f"{vend_col} - Missing before: {MILANO[vend_col].isna().sum()}")

    # The original column is left untouched; the copy carries an explicit
    # 'non dichiarata' label wherever the original is missing.
    MILANO[filled_col] = MILANO[vend_col].fillna('non dichiarata')

    print(f"{vend_col}_filled distribution:")
    display(MILANO[filled_col].value_counts())

---
# 3. OUTLIER DETECTION

## 3.1 Identify Numerical Columns

In [None]:
# Columns stored as 64-bit integers or floats are the candidates for outlier analysis
NUM = list(MILANO.select_dtypes(include=['int64', 'float64']).columns)
print(f"Numerical columns: {NUM}")

## 3.2 Statistical Summary

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns
MILANO.loc[:, NUM].describe()

## 3.3 IQR Method for Outlier Detection

In [None]:
def detect_outliers_iqr(df, column):
    """Summarise the outliers of one column using the 1.5 * IQR rule.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``. Missing values are never counted as
    outliers because comparisons against NaN are False.

    Args:
        df: DataFrame holding the data.
        column: Name of the numerical column to analyse.

    Returns:
        dict with keys ``column``, ``Q1``, ``Q3``, ``IQR``, ``lower_bound``,
        ``upper_bound``, ``n_outliers`` and ``pct_outliers``. The percentage is
        relative to all rows of ``df`` (missing values included) and is 0 for
        an empty frame.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Counting on the boolean mask avoids materialising a filtered copy of df
    is_outlier = (values < lower_bound) | (values > upper_bound)
    n_outliers = int(is_outlier.sum())

    # Guard the denominator: an empty frame would otherwise raise ZeroDivisionError
    n_rows = len(df)
    pct_outliers = n_outliers / n_rows * 100 if n_rows > 0 else 0

    return {
        'column': column,
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'n_outliers': n_outliers,
        'pct_outliers': pct_outliers
    }

In [None]:
# One IQR summary per numerical column; columns with no observed values are skipped
outlier_summary = [
    detect_outliers_iqr(MILANO, col)
    for col in NUM
    if MILANO[col].notna().any()
]

outlier_df = pd.DataFrame(outlier_summary)
outlier_df

## 3.4 Visualize Distributions with Boxplots

In [None]:
# Side-by-side boxplot and histogram for Superficie somministrazione
sup_col = 'Superficie somministrazione'

if sup_col in MILANO.columns:
    fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: boxplot of the observed values
    ax_box.boxplot(MILANO[sup_col].dropna())
    ax_box.set_title(f'Boxplot of {sup_col}')
    ax_box.set_ylabel('Value')

    # Right panel: histogram with 50 bins
    MILANO[sup_col].hist(bins=50, ax=ax_hist)
    ax_hist.set_title(f'Distribution of {sup_col}')
    ax_hist.set_xlabel('Value')
    ax_hist.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# One boxplot per numerical column, laid out in a single row
if NUM:
    # squeeze=False always yields a 2-D array of axes, even for a single column
    fig, axes = plt.subplots(1, len(NUM), figsize=(5*len(NUM), 5), squeeze=False)

    for ax, col in zip(axes[0], NUM):
        data = MILANO[col].dropna()
        if data.empty:
            # Nothing to draw: the panel is left blank
            continue
        ax.boxplot(data)
        ax.set_title(col)

    plt.tight_layout()
    plt.show()

## 3.5 Z-Score Method for Outlier Detection

In [None]:
from scipy import stats

def detect_outliers_zscore(df, column, threshold=3):
    """Summarise the outliers of one column using absolute Z-scores.

    Missing values are dropped first; a remaining value is an outlier when the
    absolute value of its Z-score is strictly greater than ``threshold``.

    Args:
        df: DataFrame holding the data.
        column: Name of the numerical column to analyse.
        threshold: Absolute Z-score above which a value is flagged.

    Returns:
        dict with keys ``column``, ``threshold``, ``n_outliers`` and
        ``pct_outliers`` (relative to the non-missing values, 0 if there are none).
    """
    observed = df[column].dropna()
    abs_z = np.abs(stats.zscore(observed))
    flagged = observed[abs_z > threshold]

    n_observed = len(observed)
    n_flagged = len(flagged)
    if n_observed > 0:
        pct_flagged = n_flagged / n_observed * 100
    else:
        pct_flagged = 0

    return {
        'column': column,
        'threshold': threshold,
        'n_outliers': n_flagged,
        'pct_outliers': pct_flagged
    }

In [None]:
# One Z-score summary per numerical column; columns with no observed values are skipped
zscore_summary = [
    detect_outliers_zscore(MILANO, col)
    for col in NUM
    if MILANO[col].notna().any()
]

zscore_df = pd.DataFrame(zscore_summary)
zscore_df

## 3.6 Outlier Handling Strategies

For this dataset, we keep outliers as they may represent legitimate large establishments.

In [None]:
# Flag potential outliers without removing them
sup_col = 'Superficie somministrazione'

if sup_col in MILANO.columns:
    # Reuse the IQR helper from section 3.3 instead of repeating the fence
    # arithmetic, so the flag and the summary are computed by the same code.
    iqr_stats = detect_outliers_iqr(MILANO, sup_col)
    lower = iqr_stats['lower_bound']
    upper = iqr_stats['upper_bound']

    # Boolean marker column; no row is dropped
    MILANO['is_outlier_superficie'] = (
        (MILANO[sup_col] < lower) | (MILANO[sup_col] > upper)
    )

    flagged_values = MILANO.loc[MILANO['is_outlier_superficie'], sup_col]

    print(f"Outliers flagged: {MILANO['is_outlier_superficie'].sum()}")
    print(f"\nOutlier statistics for {sup_col}:")
    print(f"  Lower bound: {lower:.2f}")
    print(f"  Upper bound: {upper:.2f}")
    print(f"  Max value in outliers: {flagged_values.max():.2f}")

---
# 4. FINAL SUMMARY AND SAVE

In [None]:
# Recount the missing cells per column now that imputation has been applied
null_counts_after = MILANO.isna().sum()
null_pct_after = null_counts_after.div(len(MILANO)).mul(100).round(2)

final_missing = pd.DataFrame(
    {
        'Column': null_counts_after.index,
        'Missing Count': null_counts_after.to_numpy(),
        'Missing %': null_pct_after.to_numpy(),
    }
)
final_missing = final_missing.sort_values(by='Missing %', ascending=False)

print("=== FINAL MISSING VALUES STATUS ===")
display(final_missing.loc[final_missing['Missing Count'] > 0])

In [None]:
# Recap of the cleaning steps performed in this notebook, one line per step
changes_applied = [
    "1. Filled 'Insegna' missing values with 'unknown'",
    "2. Filled 'Superficie somministrazione' missing with mean",
    "3. Imputed 'Forma commercio prev' using macro-category (conf >= 80%)",
    "4. Imputed 'Forma commercio' using 'Forma commercio prev'",
    "5. Created 'Forma vendita_filled' with 'non dichiarata' for missing",
    "6. Flagged surface outliers without removing them",
]

print("=== CHANGES APPLIED ===")
print("\n".join(changes_applied))
print(f"\nFinal shape: {MILANO.shape}")

In [None]:
# Write the cleaned frame as a semicolon-separated CSV, without the index column
cleaned_csv = "MILANO_cleaned.csv"
MILANO.to_csv(cleaned_csv, sep=";", index=False)
print(f"Saved: {cleaned_csv}")

In [None]:
# Preview the first five rows of the cleaned dataset
MILANO.head(5)