In [8]:
def load_data(file_path: str) -> pd.DataFrame:
    try:
        df = pd.read_excel(file_path)
        print("DataFrame successfully created:")
        return df
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Perhaps the wrong filepath?")
    except Exception as e:
        print(f"An error occurred: {e}")

In [9]:
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
kyverna_df = load_data("/Users/lilblick/Downloads/kyverna_study_data.xlsx")
# load_data reports failures by printing a message and returning None, so stop
# here with a clear error instead of failing on `.head()` with an AttributeError.
if kyverna_df is None:
    raise RuntimeError("kyverna_study_data.xlsx could not be loaded; see the message printed above.")
kyverna_df.head()

DataFrame successfully created:


Unnamed: 0,UniqueSubjectIdentifier,Site.ID,Cohort,Parameter,Analysis.Visit,Analysis.Value,Analysis.RelativeDay,Date,Dosing.Date,CTCAE.Grade
0,840002-001,840002,ABC101-001,Abs. Lymphocytes (x10^9/L),Screening,1.19,-50,2023-08-07,2023-09-26,Gr2 or lower or normal
1,840002-001,840002,ABC101-001,Abs. Neutrophils (x10^9/L),Screening,5.62,-50,2023-08-07,2023-09-26,Gr2 or lower or normal
2,840002-001,840002,ABC101-001,WBC (x10^9/L),Screening,7.21,-50,2023-08-07,2023-09-26,Gr2 or lower or normal
3,840002-001,840002,ABC101-001,Abs. Lymphocytes (x10^9/L),Aph,0.67,-36,2023-08-21,2023-09-26,Gr2 or lower or normal
4,840002-001,840002,ABC101-001,Abs. Neutrophils (x10^9/L),Aph,4.92,-36,2023-08-21,2023-09-26,Gr2 or lower or normal


In [None]:
def clean_dataframe(
    df: pd.DataFrame,
    fill_missing: str = "constant",
    constant_fill: str | int | float = "unknown",
    fuzzy_clean: bool = False,
    fuzzy_column: str | None = None,
    valid_values: list[str] | None = None
) -> pd.DataFrame:
    """
    Cleans a DataFrame for analysis:
    - Strips whitespace and normalizes casing
    - Handles nulls with configurable strategy
    - Fixes spacing and unwanted characters
    - Standardizes numeric/date types
    - Drops duplicates
    - (Optional) Fuzzy matches values in a specific column

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe
    fill_missing : str
        Strategy for filling missing values
    constant_fill : str/int/float
        Value used if fill_missing="constant"
    fuzzy_clean : bool
        Whether to apply fuzzy string matching cleanup
    fuzzy_column : str
        Column to apply fuzzy matching on (required if fuzzy_clean=True)
    valid_values : list[str]
        Allowed values for fuzzy cleaning

    Returns
    -------
    pd.DataFrame
    """

    df = df.copy()
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].str.lower()
        df[col] = df[col].str.replace(r"\s+", " ", regex=True)
        df[col] = df[col].str.replace(r"[^\x20-\x7E]", "", regex=True)

    # Handle missing values
    for col in df.columns:
        if df[col].isnull().any():
            if fill_missing == "mode":
                df[col].fillna(df[col].mode(dropna=True)[0] if not df[col].mode().empty else constant_fill, inplace=True)
            elif fill_missing == "mean" and pd.api.types.is_numeric_dtype(df[col]):
                df[col].fillna(df[col].mean(), inplace=True)
            elif fill_missing == "median" and pd.api.types.is_numeric_dtype(df[col]):
                df[col].fillna(df[col].median(), inplace=True)
            elif fill_missing == "constant":
                df[col].fillna(constant_fill, inplace=True)
            elif fill_missing == "drop":
                df = df.dropna(subset=[col])

    # Optional fuzzy cleaning
    if fuzzy_clean and fuzzy_column and valid_values:
        df[f"{fuzzy_column}_cleaned"] = df[fuzzy_column].apply(
            lambda x: process.extractOne(x, valid_values)[0] if isinstance(x, str) else x
        )

    # Standardize data types
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = pd.to_numeric(df[col], errors="ignore")
        if df[col].dtype == "object":
            df[col] = pd.to_datetime(df[col], errors="ignore")

    df.reset_index(drop=True, inplace=True)
    return df


def main(file_path: str = "/Users/lilblick/Downloads/kyverna_study_data.xlsx"):
    """
    Load the study workbook, clean it, and print the first rows.

    Parameters
    ----------
    file_path : str
        Excel file to load. Defaults to the path this notebook was written
        against, which is an absolute, machine-specific location.
    """
    df = load_data(file_path)
    if df is None:
        # load_data has already printed the reason for the failure.
        return

    # The previous call asked for fuzzy matching on a "fruit" column against
    # fruit names; the columns displayed for this dataset contain no such
    # column, so the lookup raised KeyError. No fuzzy cleaning is requested
    # until a real column and its list of valid values are known.
    cleaned = clean_dataframe(df, fill_missing="median")
    print(cleaned.head())
