In [1]:
import pandas as pd

In [2]:
def percentile_outliers(df, percentile_place_low=0, percentile_place_high=80, min_years=3):
    """Flag rows whose ``place`` falls outside a driver's own percentile band.

    For every driver with at least ``min_years`` distinct ``year`` values, the
    low and high percentiles of that driver's ``place`` values are computed.
    Rows strictly below the low cut-off or strictly above the high cut-off are
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``driverId``, ``year`` and ``place``.
    percentile_place_low : float, default 0
        Lower percentile in the range 0-100. With the default of 0 the cut-off
        is the driver's minimum, so no row can be strictly below it.
    percentile_place_high : float, default 80
        Upper percentile in the range 0-100.
    min_years : int, default 3
        Drivers with fewer distinct years than this are skipped.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with a fresh ``RangeIndex``. Columns are ``driverId``,
        ``year``, ``place`` followed by the remaining columns of ``df``. Per
        driver, rows below the low cut-off come before rows above the high one.

    Raises
    ------
    ValueError
        If the percentiles are not ordered as ``0 <= low <= high <= 100``.
    """
    if not 0 <= percentile_place_low <= percentile_place_high <= 100:
        raise ValueError(
            "percentiles must satisfy 0 <= percentile_place_low <= percentile_place_high <= 100"
        )

    leading_columns = ['driverId', 'year', 'place']
    output_columns = leading_columns + [col for col in df.columns if col not in leading_columns]

    # Collect the per-driver pieces and concatenate once at the end; growing a
    # frame with concat inside the loop copies the accumulated rows every time.
    flagged = []

    # sort=False keeps drivers in order of first appearance, like .unique().
    for _, driver_rows in df.groupby('driverId', sort=False):
        if driver_rows['year'].nunique() < min_years:
            continue

        low_cutoff = driver_rows['place'].quantile(percentile_place_low / 100)
        high_cutoff = driver_rows['place'].quantile(percentile_place_high / 100)

        below = driver_rows[driver_rows['place'] < low_cutoff]
        above = driver_rows[driver_rows['place'] > high_cutoff]

        # The filtered rows already carry the driver's id, so nothing needs to
        # be written back onto the slices.
        flagged.extend(part for part in (below, above) if not part.empty)

    if not flagged:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(flagged, ignore_index=True).reindex(columns=output_columns)


In [3]:
from scipy.stats import zscore

def zscore_outliers(df, weight_points=0.3, weight_place=0.7, zscore_threshold=2):
    """Flag rows whose weighted points/place metric is a z-score outlier.

    For every driver with more than two rows, a composite metric
    ``points * weight_points + place * weight_place`` is computed and
    standardised within that driver's rows using ``scipy.stats.zscore``.
    Rows whose absolute z-score exceeds ``zscore_threshold`` are returned.
    Drivers whose composite metric has zero standard deviation are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``driverId``, ``points`` and ``place``.
    weight_points : float, default 0.3
        Weight applied to ``points`` in the composite metric.
    weight_place : float, default 0.7
        Weight applied to ``place`` in the composite metric.
    zscore_threshold : float, default 2
        Rows with ``abs(z_score)`` strictly above this value are flagged.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with a fresh ``RangeIndex``. Columns are ``driverId``,
        ``year``, ``composite_metric``, ``z_score`` followed by the remaining
        columns of ``df``. ``z_score`` holds the per-driver z-score of each
        flagged row.
    """
    leading_columns = ['driverId', 'year', 'composite_metric', 'z_score']
    output_columns = leading_columns + [col for col in df.columns if col not in leading_columns]

    # Collect the per-driver pieces and concatenate once at the end; growing a
    # frame with concat inside the loop copies the accumulated rows every time.
    flagged = []

    # sort=False keeps drivers in order of first appearance, like .unique().
    for _, driver_rows in df.groupby('driverId', sort=False):
        if len(driver_rows) <= 2:
            continue

        scored = driver_rows.copy()
        scored['composite_metric'] = (
            scored['points'] * weight_points + scored['place'] * weight_place
        )

        # A constant metric has no spread to standardise against.
        if scored['composite_metric'].std() == 0:
            continue

        # Keep the z-score on the rows so the declared 'z_score' output column
        # is actually populated instead of being left as NaN.
        scored['z_score'] = zscore(scored['composite_metric'])

        driver_outliers = scored[scored['z_score'].abs() > zscore_threshold]
        if not driver_outliers.empty:
            flagged.append(driver_outliers)

    if not flagged:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(flagged, ignore_index=True).reindex(columns=output_columns)


In [4]:
def iqr_outliers(df, weight_points=0.3, weight_place=0.7, iqr_multiplier=2.5):
    """Flag rows whose weighted points/place metric lies outside the IQR fences.

    For every driver with more than one row, a composite metric
    ``points * weight_points + place * weight_place`` is computed. Rows whose
    metric is strictly below ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR`` (quartiles taken over that driver's rows)
    are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``driverId``, ``points`` and ``place``.
    weight_points : float, default 0.3
        Weight applied to ``points`` in the composite metric.
    weight_place : float, default 0.7
        Weight applied to ``place`` in the composite metric.
    iqr_multiplier : float, default 2.5
        Width of the fences in multiples of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with a fresh ``RangeIndex``. Columns are ``driverId``,
        ``year``, ``composite_metric`` followed by the remaining columns of
        ``df``.
    """
    leading_columns = ['driverId', 'year', 'composite_metric']
    output_columns = leading_columns + [col for col in df.columns if col not in leading_columns]

    # Collect the per-driver pieces and concatenate once at the end; growing a
    # frame with concat inside the loop copies the accumulated rows every time.
    flagged = []

    # sort=False keeps drivers in order of first appearance, like .unique().
    for _, driver_rows in df.groupby('driverId', sort=False):
        if len(driver_rows) <= 1:
            continue

        scored = driver_rows.copy()
        scored['composite_metric'] = (
            scored['points'] * weight_points + scored['place'] * weight_place
        )

        q1 = scored['composite_metric'].quantile(0.25)
        q3 = scored['composite_metric'].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - iqr_multiplier * iqr
        upper_fence = q3 + iqr_multiplier * iqr

        outside = (scored['composite_metric'] < lower_fence) | (scored['composite_metric'] > upper_fence)
        driver_outliers = scored[outside]
        if not driver_outliers.empty:
            flagged.append(driver_outliers)

    if not flagged:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(flagged, ignore_index=True).reindex(columns=output_columns)


In [5]:
def remove_outliers(main_df, outliers_df, key_columns=None):
    """Return ``main_df`` without the rows whose key values occur in ``outliers_df``.

    A row of ``main_df`` is removed when its combination of values in
    ``key_columns`` also appears in ``outliers_df``. The surviving rows keep
    their original index labels and order; neither input frame is modified.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame to clean.
    outliers_df : pandas.DataFrame
        Frame listing the key combinations to remove.
    key_columns : list of str or str, optional
        Column name(s) identifying a row. Defaults to ``['driverId', 'year']``.
        A single column name may be passed as a plain string.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows of ``main_df`` that were not
        matched.

    Raises
    ------
    KeyError
        If a key column is missing from either frame.
    """
    # If key_columns is not provided, use default key columns
    if key_columns is None:
        key_columns = ['driverId', 'year']
    elif isinstance(key_columns, str):
        # Iterating a bare string would validate its characters one by one.
        key_columns = [key_columns]
    else:
        key_columns = list(key_columns)

    # Check if key columns exist in both DataFrames
    for key in key_columns:
        if key not in main_df.columns:
            raise KeyError(f"Key column '{key}' not found in main_df")
        if key not in outliers_df.columns:
            raise KeyError(f"Key column '{key}' not found in outliers_df")

    # Nothing to remove; still hand back a new object like the general path.
    if outliers_df.empty:
        return main_df.copy()

    # De-duplicate the outlier keys so the left merge below yields exactly one
    # output row per main_df row, in main_df's order.
    outlier_keys = outliers_df[key_columns].drop_duplicates()

    # The index of a merge result is a fresh RangeIndex, not main_df's labels,
    # so it cannot be used to drop rows. Build a positional keep-mask from the
    # merge indicator instead.
    membership = main_df[key_columns].merge(
        outlier_keys, on=key_columns, how='left', indicator=True
    )
    keep_mask = (membership['_merge'] == 'left_only').to_numpy()

    return main_df.loc[keep_mask]