## A1 Utils

Test whether numeric columns contain non-numeric symbols.

In [None]:
# Function 1: collect the characters that keep a value from being purely numeric
def find_non_digits(s):
    """Return every character of ``s`` that is neither a digit nor a period.

    ``s`` is converted with ``str()`` first, so non-string values are
    accepted. Characters are returned in their original order; an empty list
    means the text consists solely of digits and "." characters.
    """
    offending = []
    for ch in str(s):
        if ch == "." or ch.isdigit():
            continue
        offending.append(ch)
    return offending


# Function 2: score how many entries of the selected columns look numeric
def accuracy_score(dataset_path, selected_columns):
    """Compute and log the A1 accuracy score for ``selected_columns``.

    For each selected column the score is the share of non-missing entries
    made up only of digits and "." (as judged by ``find_non_digits``). The
    overall score is the plain mean of the per-column scores.

    Args:
        dataset_path: Passed to ``read_data`` and ``get_dataset_name``.
        selected_columns: Names of the columns to check.

    Returns:
        The mean per-column score, or ``None`` when no selected column has
        any non-missing entry.

    Raises:
        KeyError: For the first requested column absent from the dataset.
    """
    adf = read_data(dataset_path)

    # Reject the request as soon as one column is not present in the data.
    for requested in selected_columns:
        if requested in adf.columns:
            continue
        raise KeyError(requested)

    # From here on, process (and log) the columns in dataset order.
    selected_columns = [name for name in adf.columns if name in selected_columns]

    per_column_scores = []
    for name in selected_columns:
        # dropna() removes missing entries before counting rows
        values = adf[name].dropna()
        row_count = len(values)

        # A column with nothing left contributes no score (and avoids 0/0).
        if row_count == 0:
            continue

        # A row is flagged when it holds at least one non-numeric character.
        flagged_rows = values.apply(
            lambda value: len(find_non_digits(value)) > 0
        ).sum()
        per_column_scores.append((row_count - flagged_rows) / row_count)

    if per_column_scores:
        overall_accuracy_score = sum(per_column_scores) / len(per_column_scores)
    else:
        overall_accuracy_score = None

    # log the results
    log_score(
        test_name="Accuracy (A1)",
        dataset_name=get_dataset_name(dataset_path),
        selected_columns=selected_columns,
        threshold=None,
        score=overall_accuracy_score,
    )

    return overall_accuracy_score

## A2 Utils

In [None]:
def _outside_iqr_fences(values, q1, q3, threshold):
    """Return a boolean mask of ``values`` lying outside the IQR fences."""
    iqr = q3 - q1
    return (values < q1 - threshold * iqr) | (values > q3 + threshold * iqr)


def find_outliers_iqr(
    dataset_path,
    selected_columns,
    groupby_column=None,
    threshold=1.5,
    minimum_score=0.85,
):
    """Compute and log the A2 score based on IQR outlier detection.

    A value is an outlier when it lies below ``Q1 - threshold * IQR`` or above
    ``Q3 + threshold * IQR``. Each column (or each group of each column when
    ``groupby_column`` is given) gets a score equal to its share of
    non-outlier values. The final score is the fraction of those scores that
    are strictly greater than ``minimum_score``.

    Args:
        dataset_path: Passed to ``read_data`` and ``get_dataset_name``.
        selected_columns: Columns to analyse. Their values are cleaned and
            converted to numbers first.
        groupby_column: Optional column whose values define the groups within
            which the quartiles are computed. Falsy means "whole column".
        threshold: Multiplier applied to the IQR to position the fences.
        minimum_score: A column/group counts as passing when its non-outlier
            share is strictly above this value.

    Returns:
        A tuple ``(outliers_dict, final_score)``. ``outliers_dict`` maps each
        column name to its non-outlier share: a single number without
        grouping, a Series indexed by group with grouping. ``final_score`` is
        the fraction of passing columns/groups, or 0 when there are none.
    """
    df = read_data(dataset_path)

    outliers_dict = {}

    # Strip every character except digits, "." and "-" so that formatted
    # values can be parsed; whatever still fails to parse becomes NaN.
    # NaN never compares as an outlier but still counts in the denominator.
    for column in selected_columns:
        df[column] = df[column].astype(str).str.replace(r"[^\d.-]", "", regex=True)
        df[column] = pd.to_numeric(df[column], errors="coerce")

    if groupby_column:
        # Series.groupby needs the key values themselves, not column names.
        if isinstance(groupby_column, (list, tuple)):
            group_keys = [df[key] for key in groupby_column]
        else:
            group_keys = df[groupby_column]

        for column in selected_columns:
            grouped = df.groupby(groupby_column)[column]
            # transform() broadcasts each group's quartile back onto its rows,
            # so the mask stays aligned with the DataFrame index.
            q1 = grouped.transform(lambda s: s.quantile(0.25))
            q3 = grouped.transform(lambda s: s.quantile(0.75))
            outliers = _outside_iqr_fences(df[column], q1, q3, threshold)
            # One score per group: share of rows that are not outliers.
            outliers_dict[column] = 1 - outliers.groupby(group_keys).mean()
    else:
        # No grouping: quartiles are taken over the whole column.
        for column in selected_columns:
            q1 = df[column].quantile(0.25)
            q3 = df[column].quantile(0.75)
            outliers = _outside_iqr_fences(df[column], q1, q3, threshold)
            outliers_dict[column] = 1 - outliers.mean()

    # Compute the final score. With grouping each dict value is a Series of
    # per-group scores; using it directly in an `if` raises "truth value of a
    # Series is ambiguous", so every group is counted individually instead.
    total_groups = 0
    groups_above = 0
    for score in outliers_dict.values():
        if isinstance(score, pd.Series):
            total_groups += len(score)
            groups_above += int((score > minimum_score).sum())
        else:
            total_groups += 1
            groups_above += int(score > minimum_score)
    final_score = groups_above / total_groups if total_groups > 0 else 0

    # log the results
    log_score(
        test_name="Accuracy (A2)",
        dataset_name=get_dataset_name(dataset_path),
        selected_columns=selected_columns,
        threshold=threshold,
        score=final_score,
    )

    return outliers_dict, final_score

## A3 Utils

In [None]:
# Function 1: find duplicated rows and score the dataset on their absence
def find_duplicates_and_percentage(dataset_path):
    """Print the duplicated rows of a dataset and compute/log its A3 score.

    The score is ``1 - duplicated_rows / total_rows``, i.e. the share of rows
    that have no identical twin; 1.0 means the dataset is free of duplicates.

    Args:
        dataset_path: Passed to ``read_data`` and ``get_dataset_name``.

    Returns:
        The duplication score as a float between 0 and 1. An empty dataset
        scores 1.0 because it contains no duplicated rows.
    """
    df = read_data(dataset_path)

    # keep=False marks every member of a duplicate set, not just the repeats.
    duplicate_rows = df[df.duplicated(keep=False)]

    total_rows = len(df)
    total_duplicate_rows = len(duplicate_rows)

    # Guard the division: an empty dataset would raise ZeroDivisionError.
    if total_rows:
        duplication_score = 1 - (total_duplicate_rows / total_rows)
    else:
        duplication_score = 1.0

    # Print duplicate rows
    print("Duplicate Rows:")
    print(duplicate_rows)

    # log the results
    log_score(
        test_name="Accuracy (A3)",
        dataset_name=get_dataset_name(dataset_path),
        selected_columns=None,
        threshold=None,
        score=duplication_score,
    )

    # Print the score as a percentage
    print(f"\nDuplication Score: {duplication_score*100}%")

    # Return the score so callers can use it, like the A1/A2 functions do.
    return duplication_score