In [1]:
import pandas as pd
import pyarrow.feather as feather
import matplotlib.pyplot as plt
import numpy as np

### Processing functions

The path-data processing functions, and more generally the whole loading and conversion into DataFrames, should probably live in a `.py` file.  
Open question: or maybe not?

---


# Played Path Data Filtering (Outlier removal)

**Finished Paths**
- Filter for path length  
    So far, filtered using IQR method with full_path_length - distance to detect outliers  
    Alternatively, could do filtering for each distance using the IQR method (like I do for speed)
- Filtering for path speed
    Group articles by distance. For each distance, filter out slow games using the IQR method on that group.  
    Also filter out start-target pairs that have been played more than once by the same player (IP address).  

**Unfinished Paths**
- Filter for path length  
    So far, filter using the IQR method for full_path_length  
    and lower bound = shortest path distance  
    Need to remove timeouts!

In [2]:
def filter_finished_paths(finished_paths_df, full_path_col='full_path_length', distance_col='distance', multiplier=1.5,
                          max_plays_per_pair=5, random_state=42):
    """
    Deduplicate, downsample and outlier-filter finished paths.

    The function applies three steps in order:

    1. Keep one randomly chosen row per (``hashedIpAddress``, ``identifier``) group, so a
       player contributes at most one path per ``identifier`` (the original notes describe
       ``identifier`` as the start-target pair key -- confirm against the loader).
    2. Shuffle the remaining rows and keep at most ``max_plays_per_pair`` rows per ``identifier``.
    3. For each unique value of ``distance_col``, drop rows whose ``full_path_col`` lies outside
       ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` computed on that distance group.

    A one-line summary is printed. Note that the reported number of removed rows covers all three
    steps, not only the IQR step.

    Parameters:
    - finished_paths_df (pd.DataFrame): The input DataFrame containing path data. Must provide the
      columns 'hashedIpAddress', 'identifier', `full_path_col` and `distance_col`.
    - full_path_col (str): The column name for the full path length. Default is 'full_path_length'.
    - distance_col (str): The column name for the distance. Default is 'distance'.
    - multiplier (float): The multiplier for the IQR to determine the bounds. Default is 1.5.
    - max_plays_per_pair (int): Maximum number of rows kept per 'identifier'. Default is 5.
    - random_state (int): Seed used for both random sampling steps. Default is 42.

    Returns:
    - filtered_finished_paths (pd.DataFrame): The filtered rows with a fresh 0..n-1 index.
      Rows are ordered group by group, in order of first appearance of each distance value.
      An empty frame with the input's columns is returned when nothing survives.
    """
    total_rows = finished_paths_df.shape[0]
    filtered_dfs = []  # One filtered frame per distance group

    if total_rows > 0:
        # One path per player and identifier.
        # NOTE: groupby's default dropna=True means rows whose hashedIpAddress or identifier
        # is null do not belong to any group and are therefore not carried forward.
        deduplicated = finished_paths_df.groupby(['hashedIpAddress', 'identifier']).sample(
            n=1, random_state=random_state
        )

        # Shuffle with an explicit seed instead of np.random.seed(...), so the caller's
        # global NumPy random state is left untouched.
        shuffled = deduplicated.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # After shuffling, head() keeps a random subset of at most max_plays_per_pair rows per identifier.
        finished_paths = shuffled.groupby('identifier').head(max_plays_per_pair).reset_index(drop=True)

        # IQR filtering, one distance group at a time.
        # A NaN distance never compares equal to itself, so such rows end up in no group.
        for distance in finished_paths[distance_col].unique():
            df_subset = finished_paths[finished_paths[distance_col] == distance]

            Q1 = df_subset[full_path_col].quantile(0.25)
            Q3 = df_subset[full_path_col].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - multiplier * IQR
            upper_bound = Q3 + multiplier * IQR

            within_bounds = (df_subset[full_path_col] >= lower_bound) & (df_subset[full_path_col] <= upper_bound)
            filtered_dfs.append(df_subset[within_bounds])

    if filtered_dfs:
        filtered_finished_paths = pd.concat(filtered_dfs, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; return an empty frame with the same columns instead.
        filtered_finished_paths = finished_paths_df.iloc[0:0].reset_index(drop=True)

    # Summary relative to the original input (guarding against division by zero)
    removed_count = total_rows - filtered_finished_paths.shape[0]
    removed_percentage = (removed_count / total_rows) * 100 if total_rows else 0.0

    print(f"A total of {removed_count} paths were removed from the finished paths, using IQR filtering on {full_path_col}, "
          f"which represents {removed_percentage:.3f}% of the original finished data.")

    return filtered_finished_paths


In [3]:
def filter_unfinished_paths(unfinished_paths_df, full_path_col='full_path_length', simplified_path_col='simplified_path_length', distance_col='distance', multiplier=1.5):
    """
    Filter unfinished paths by dropping timeouts and per-distance outliers.

    Rows whose 'failure_reason' equals 'timeout' are removed first. The remaining rows are
    processed one distance group at a time; a row is kept when both conditions hold:

    - its `full_path_col` value is at most ``Q3 + multiplier * IQR`` of its distance group, and
    - its `simplified_path_col` value is at least its `distance_col` value.

    A one-line summary is printed. The reported number of removed rows includes the timeouts.

    Parameters:
    - unfinished_paths_df (pd.DataFrame): The input DataFrame containing unfinished path data. Must
      provide the columns 'failure_reason', `full_path_col`, `simplified_path_col` and `distance_col`.
    - full_path_col (str): The column name for the full path length. Default is 'full_path_length'.
    - simplified_path_col (str): The column name for the simplified path length. Default is 'simplified_path_length'.
    - distance_col (str): The column name for the distance. Default is 'distance'.
    - multiplier (float): The multiplier for the IQR to determine the upper bound. Default is 1.5.

    Returns:
    - filtered_unfinished_paths (pd.DataFrame): The filtered rows with a fresh 0..n-1 index.
      Rows are ordered group by group, in order of first appearance of each distance value.
      An empty frame with the input's columns is returned when nothing survives
      (for example when the input is empty or contains only timeouts).
    """
    total_rows = unfinished_paths_df.shape[0]

    # Drop paths that ended by timeout rather than by an active decision of the player
    unfinished_paths = unfinished_paths_df[unfinished_paths_df['failure_reason'] != 'timeout']

    filtered_dfs = []  # One filtered frame per distance group

    # A NaN distance never compares equal to itself, so such rows match no group and are dropped.
    for distance_value in unfinished_paths[distance_col].unique():
        df_subset = unfinished_paths[unfinished_paths[distance_col] == distance_value]

        # Only an upper IQR bound is used for the full path length
        Q1 = df_subset[full_path_col].quantile(0.25)
        Q3 = df_subset[full_path_col].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + multiplier * IQR

        not_too_long = df_subset[full_path_col] <= upper_bound
        # The lower bound is applied to the simplified length: it must reach at least the distance
        long_enough = df_subset[simplified_path_col] >= df_subset[distance_col]

        filtered_dfs.append(df_subset[not_too_long & long_enough])

    if filtered_dfs:
        filtered_unfinished_paths = pd.concat(filtered_dfs, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; return an empty frame with the same columns instead.
        filtered_unfinished_paths = unfinished_paths.iloc[0:0].reset_index(drop=True)

    # Summary relative to the original input (guarding against division by zero)
    removed_count = total_rows - filtered_unfinished_paths.shape[0]
    removed_percentage = (removed_count / total_rows) * 100 if total_rows else 0.0

    print(f"A total of {removed_count} paths were removed from the unfinished paths, "
          f"which represents {removed_percentage:.3f}% of the original unfinished data.")

    return filtered_unfinished_paths

In [5]:
# Load the stored paths table
paths = feather.read_feather('Data/dataframes/paths.feather')

# Split on the 'finished' flag: rows equal to True vs. rows equal to False
finished_paths = paths.loc[paths['finished'] == True]
unfinished_paths = paths.loc[paths['finished'] == False]

# Each filter prints its own removal summary
filtered_finished_paths = filter_finished_paths(finished_paths)
filtered_unfinished_paths = filter_unfinished_paths(unfinished_paths)

# Report how many rows are left in each subset
print(
    "A total of {} finished paths and {} unfinished paths remain after filtering.".format(
        len(filtered_finished_paths),
        len(filtered_unfinished_paths),
    )
)

A total of 10786 paths were removed from the finished paths, using IQR filtering on full_path_length, which represents 21.018% of the original finished data.
A total of 18254 paths were removed from the unfinished paths, which represents 73.383% of the original unfinished data.
A total of 40532 finished paths and 6621 unfinished paths remain after filtering.


In [12]:
# Recombine both filtered subsets into a single table with a fresh 0..n-1 index
filtered_paths = pd.concat([filtered_finished_paths, filtered_unfinished_paths], ignore_index=True)

# Persist the combined table for downstream notebooks
filtered_paths.to_feather('Data/dataframes/filtered_paths.feather')

# Sanity check: rows per 'finished' value. It is kept as the cell's last expression
# so the notebook actually renders it (a bare expression mid-cell is discarded).
filtered_paths.value_counts('finished')