# Environment Setup

In [None]:
%pip install missingno matplotlib

In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Notebook-wide display settings: show up to 100 rows/columns and render
# floats with three decimals instead of scientific notation.
for option_name, option_value in {
    "display.max_columns": 100,
    "display.max_rows": 100,
    "display.float_format": "{:.3f}".format,
}.items():
    pd.set_option(option_name, option_value)

# Plot theme: white background without the top and right axes spines.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="white", rc=custom_params, palette="Set2")

# Populated later by the category-encoding cell.
encodings = {}


In [None]:
import data_cleaning_lib as lib


# Part III: Data Cleaning
---

## D1: Cleaning Findings

In [None]:
# Load the raw medical dataset. `df` is kept as the untreated baseline that the
# later comparison plots read from; cleaning steps are applied to `treated_df`.
filename = "./src/Medical Data/medical_raw_data.csv"
df = pd.read_csv(filename)
treated_df = df.copy()


### Duplicate Treatment

In [None]:
# A record is identified by these three columns; keep the first occurrence of
# each identifier combination and verify that none repeat afterwards.
key_cols = ["Customer_id", "Interaction", "UID"]
treated_df = treated_df.drop_duplicates(subset=key_cols)
assert not treated_df.duplicated(subset=key_cols).any()


### Missing Value Treatment

In [None]:
# Count missing values per column and keep only the affected columns,
# ordered from most to fewest missing values.
null_df = treated_df.isna().sum()
has_missing = null_df.gt(0)
null_counts_df = null_df.loc[has_missing].sort_values(ascending=False)

# Names of the columns that need imputation (reused by later cells).
missing_cols = null_counts_df.index

null_counts_df


#### Children Imputation Comps

In [None]:
# Compare candidate imputations on the first `n` "Children" observations.
n = 100
col = "Children"
imp_df = df[col].head(n)

# Every candidate fills the gaps of the sample with a statistic of the full
# column (or a constant). Order-dependent fills (bfill/ffill) and interpolation
# (nearest/quadratic/linear) are left out of this comparison.
imputations = {f"Untreated {col}": imp_df}
for label, fill_value in [
    ("MODE", df[col].mode()[0]),
    ("MEAN", df[col].mean()),
    ("MEDIAN", df[col].median()),
    ("CONSTANT (0)", 0),
]:
    imputations[f"{label} {col} Imputation"] = imp_df.fillna(fill_value)

fig, axes = plt.subplots(len(imputations), 1, figsize=(30, 40))

for ax, title in zip(axes, imputations):
    # imputed series in dotted red, original observations drawn on top
    imputations[title].plot(
        title=title, color="red", marker="o", linestyle="dotted", ax=ax
    )
    imp_df.plot(marker="o", ax=ax)


#### Age Imputation Comps

In [None]:
# Compare mean and median imputation on the first `n` "Age" observations.
n = 1000
col = "Age"
imp_df = df[col].head(n)

imputations = {f"Untreated {col}": imp_df}
for label, fill_value in [("MEAN", df[col].mean()), ("MEDIAN", df[col].median())]:
    imputations[f"{label} {col} Imputation"] = imp_df.fillna(fill_value)

fig, axes = plt.subplots(len(imputations), 1, figsize=(30, 40))

for ax, title in zip(axes, imputations):
    # imputed series in dotted red, original observations drawn on top
    imputations[title].plot(
        title=title, color="red", marker=".", linestyle="dotted", ax=ax
    )
    imp_df.plot(marker=".", ax=ax)


#### Initial_days Imputation Comps

In [None]:
# Compare candidate imputations for "Initial_days" on index labels 4800-5199.
col = "Initial_days"
rows = range(4800, 5200)
imp_df = df[col].loc[rows]

imputations = {f"Untreated {col}": imp_df}
for label, fill_value in [
    ("MODE", df[col].mode()[0]),
    ("MEAN", df[col].mean()),
    ("MEDIAN", df[col].median()),
]:
    imputations[f"{label} {col} Imputation"] = imp_df.fillna(fill_value)

fig, axes = plt.subplots(len(imputations), 1, figsize=(30, 70))

for ax, title in zip(axes, imputations):
    # imputed series in dotted red, original observations drawn on top
    imputations[title].plot(
        title=title, color="red", marker="o", linestyle="dotted", ax=ax
    )
    imp_df.plot(marker="o", ax=ax)


#### Imputations

In [None]:
# Impute missing values on the working frame.
#
# Both the values being filled and the fill statistics are taken from
# `treated_df` (the de-duplicated data) rather than from the raw `df`, so that
# rows removed by the duplicate treatment cannot influence the statistics and
# the assignment does not depend on implicit index alignment between frames.

# uniform and multimodal distributions (mean)
for col in ["Age", "Initial_days"]:
    treated_df[col] = treated_df[col].fillna(treated_df[col].mean())

# skewed distributions (median)
for col in ["Income"]:
    treated_df[col] = treated_df[col].fillna(treated_df[col].median())

# binomial distributions (mode)
for col in ["Anxiety", "Overweight"]:
    treated_df[col] = treated_df[col].fillna(treated_df[col].mode()[0])

# constant values: a missing answer is treated as "No" / zero children
treated_df["Soft_drink"] = treated_df["Soft_drink"].fillna("No")
treated_df["Children"] = treated_df["Children"].fillna(0)


### Verify Missing Value Imputation

In [None]:
# Fail loudly, listing the offending columns, if any value is still missing.
nan_df = treated_df.isna().sum()

if nan_df.sum() == 0:
    print("No remaining missing values")
else:
    raise AssertionError(f"NaN values still exist in the dataset\n{nan_df[nan_df > 0]}")


### Compare Treated and Untreated Data Summaries

In [None]:
# One panel per imputed column: untreated values in blue, treated values in
# dotted red, for index labels 4900-5099. A summary table comparing the
# untreated and treated descriptions is rendered for every column as well.
fig, axes = plt.subplots(len(missing_cols), 1, figsize=(30, 15))
rows = range(4900, 5100)

for ax, col in zip(axes, missing_cols):
    try:
        df[col].loc[rows].plot(title=f"Untreated {col}", color="blue", ax=ax)
        sns.lineplot(
            data=treated_df[col].loc[rows],
            color="red",
            marker="o",
            linestyle="dotted",
            ax=ax,
        )
    except TypeError:
        # display non-numeric data in a bar chart
        treated_df[col].loc[rows].value_counts().plot(
            title=f"Treated {col}",
            kind="bar",
            color="red",
            linestyle="dotted",
            ax=ax,
        )
        df[col].loc[rows].value_counts().plot(color="blue", kind="bar", ax=ax)

    # compare Treated and Untreated data descriptions; a bare expression inside
    # a loop is never rendered by the notebook, so display it explicitly
    display(
        pd.DataFrame(
            {
                f"Untreated {col}": df[col].describe(),
                f"Treated {col}": treated_df[col].describe(),
            }
        )
    )

fig.tight_layout(pad=0.3)


In [None]:
# NOTE(review): presumably draws one boxplot per numeric column of the imputed
# data to surface outliers — confirm against data_cleaning_lib.
lib.create_boxplots(treated_df)


### Outlier Treatment

In [None]:
# numerical columns with notable outliers; outliers are allowed on columns not
# listed here
z_score_cols = [
    "Children",
    "VitD_levels",
    "Full_meals_eaten",
    "VitD_supp",
    "TotalCharge",
    "Additional_charges",
]

# frequency of each distinct number of children after imputation
treated_df["Children"].value_counts()


In [None]:
# NOTE(review): same call as the earlier boxplot cell and `treated_df` has not
# been modified in between — presumably repeated for reference; confirm.
lib.create_boxplots(treated_df)


#### VitD_levels Outlier Exploration

In [None]:
# Only the last expression of a cell is rendered, so the intermediate tables
# are passed to display() explicitly.

# highest VitD_levels among patients whose Gender is "Prefer not to answer"
display(
    treated_df.loc[
        treated_df["Gender"] == "Prefer not to answer",
        ["VitD_levels", "Gender", "Age"],
    ]
    .round()
    .sort_values(ascending=False, by=["VitD_levels", "Age"])
    .head(10)
)

# determine what the average VitD_levels are by Gender and Age
display(
    treated_df[["VitD_levels", "Gender", "Age"]]
    .round()
    .groupby(["Gender", "Age"])
    .agg({"VitD_levels": "mean"})
    .sort_values("VitD_levels")
)

# determine what correlation exists between Gender and VitD_levels
vitd_df = df.copy()
vitd_df["Gender"] = df["Gender"].map(
    {"Prefer not to answer": 0, "Male": 1, "Female": 2}
)
# Restrict to numeric columns: on pandas >= 2.0 DataFrame.corr() no longer
# drops string columns silently and raises instead.
vitd_df.select_dtypes(include=["number", "bool"]).corr()["VitD_levels"].sort_values(
    ascending=False
)


#### VitD_levels Outlier Treatment

In [None]:
# no treatment - retaining existing values


#### Children Outlier Exploration

In [None]:
# NOTE(review): presumably plots a histogram and a boxplot of the untreated
# "Children" column — confirm against data_cleaning_lib.
lib.create_hist_and_boxplots(df, "Children")


#### Children Outlier Treatment

In [None]:
col = "Children"

# Absolute z-scores of the untreated column (missing values are ignored when
# computing mean/std). Wrapping in a Series pins the result to df's index so it
# can be used as an aligned boolean mask regardless of what zscore returns.
children_z_df = pd.Series(
    abs(stats.zscore(df[col], nan_policy="omit")), index=df.index
)

# values flagged as outliers (|z| > 3) and how often each occurs
display(pd.DataFrame(df.loc[children_z_df > 3, col].value_counts()))

treated_df[col] = treated_df[col].astype("int")

# store maximum number of children that is a nonoutlier; cast to int so the
# integer column is not assigned a float
max_num_children = int(df.loc[children_z_df < 3, col].max())

# set the Children outliers to the max nonoutlier
treated_df[col] = treated_df[col].clip(upper=max_num_children)

assert treated_df[col].max() == max_num_children

# before / after comparison
lib.create_hist_and_boxplots(df, col)
lib.create_hist_and_boxplots(treated_df, col)


#### Full_meals_eaten Outlier Exploration

In [None]:
# NOTE(review): presumably plots a histogram and a boxplot of the untreated
# "Full_meals_eaten" column — confirm against data_cleaning_lib.
lib.create_hist_and_boxplots(df, "Full_meals_eaten")


#### Full_meals_eaten Outlier Treatment

In [None]:
col = "Full_meals_eaten"

# Absolute z-scores of the untreated column, pinned to df's index so they can
# be used as an aligned boolean mask.
meals_z_df = pd.Series(
    abs(stats.zscore(df[col], nan_policy="omit")), index=df.index
)
is_meal_outlier = meals_z_df > 3

# values flagged as outliers (|z| > 3) and how often each occurs
display(pd.DataFrame(df.loc[is_meal_outlier, col].value_counts()))

treated_df[col] = treated_df[col].astype("int")

# Largest number of meals that is NOT an outlier. This must be derived from the
# meals z-scores; using the Children z-scores here made the final assertion
# vacuous because the mask did not exclude the meal outliers.
max_num_meals = int(df.loc[meals_z_df <= 3, col].max())
# cast so that the integer column is not assigned a float
med_num_meals = int(df[col].median())

# set the meals eaten outliers to the median value; the mask is re-aligned to
# treated_df in case the duplicate treatment removed rows
treated_df.loc[
    is_meal_outlier.reindex(treated_df.index, fill_value=False), col
] = med_num_meals

assert treated_df[col].max() <= max_num_meals


In [None]:
# Before / after comparison: untreated frame first, then the treated frame.
lib.create_hist_and_boxplots(df, "Full_meals_eaten")
lib.create_hist_and_boxplots(treated_df, "Full_meals_eaten")


### Re-Expression Categories Treatment

In [None]:
# convert categorical string values to category data type

# convert "Soft_drink" nan's to 'No' (mask taken from the working frame itself)
treated_df["Soft_drink"] = treated_df["Soft_drink"].fillna("No")

# convert "Overweight" and "Anxiety" 0/1 flags to "Yes"/"No" labels; missing
# values are treated as "No" and existing labels are passed through unchanged
yes_no_map = {np.nan: "No", 0: "No", "No": "No", 1: "Yes", "Yes": "Yes"}
for col in ["Overweight", "Anxiety"]:
    treated_df[col] = treated_df[col].map(yes_no_map)

# validate a value exists for each row (the previous check compared the length
# of an index with itself and could never fail)
for col in ["Soft_drink", "Overweight", "Anxiety"]:
    assert treated_df[col].notna().all(), f"'{col}' still contains missing values"

# categorical columns and potential categorical columns
ctg_cols = [
    "Anxiety",
    "Area",
    "Timezone",
    "Employment",
    "Education",
    "Marital",
    "Gender",
    "ReAdmis",
    "Soft_drink",
    "Initial_admin",
    "HighBlood",
    "Stroke",
    "Complication_risk",
    "Overweight",
    "Arthritis",
    "Diabetes",
    "Hyperlipidemia",
    "BackPain",
    "Allergic_rhinitis",
    "Reflux_esophagitis",
    "Asthma",
    "Services",
]

for col in ctg_cols:
    print(f"--- COLUMN: '{col}' ---")

    # convert column to a category
    treated_df[col] = treated_df[col].astype("category")

    # confirm the values successfully converted
    assert treated_df[col].dtype == "category"

    # report categorical counts
    print(treated_df[col].value_counts())

# from here on ctg_cols holds every category-typed column of the frame
ctg_cols = treated_df.select_dtypes("category").columns


In [None]:
# NOTE(review): presumably draws one count plot per categorical column —
# confirm against data_cleaning_lib.
lib.plot_categorical_counts(treated_df, ctg_cols)


In [None]:
# Encode the category columns once. The guard checks whether any
# category-typed column is left; the previous `Index.any()` call evaluated the
# truthiness of the column *labels* instead of the dtype mask.
if not treated_df.select_dtypes("category").columns.empty:
    category_df = treated_df.copy()
    # NOTE(review): the helper is expected to return the fitted encoders, the
    # encoded frame and the encoded column names — confirm in data_cleaning_lib.
    encodings, encoded_df, ctg_cols = lib.create_category_encodings(category_df)
    treated_df = encoded_df
else:
    print("categories are already encoded")

# columns to plot after encoding
encoded_plot_cols = [
    "Area",
    "Timezone",
    "Education",
    "Employment",
    "Marital",
    "Gender",
    "ReAdmis",
    "Soft_drink",
    "Initial_admin",
    "HighBlood",
    "Stroke",
    "Complication_risk",
    "Overweight",
    "Arthritis",
    "Diabetes",
    "Hyperlipidemia",
    "BackPain",
    "Anxiety",
    "Allergic_rhinitis",
    "Reflux_esophagitis",
    "Asthma",
    "Services",
]
lib.plot_categorical_counts(treated_df, encoded_plot_cols)


### Other Data Cleaning Treatment

#### Trim Whitespace and Drop 'Unnamed: 0' Column

In [None]:
# trim leading/trailing whitespace from every string (object-dtype) column
str_cols = treated_df.select_dtypes("object")
for str_col_name in str_cols.columns:
    treated_df[str_col_name] = str_cols[str_col_name].str.strip()


In [None]:
# "Unnamed: 0" is removed only when it exists and is an exact copy of
# "CaseOrder"; otherwise the frame is left untouched.
has_unnamed_col = "Unnamed: 0" in treated_df.columns
if has_unnamed_col and treated_df["Unnamed: 0"].equals(treated_df["CaseOrder"]):
    print("Dropping 'Unnamed: 0' column")
    treated_df = treated_df.drop(columns=["Unnamed: 0"])


#### Round Age and Children Values

In [None]:
# Round to the nearest whole year before casting: astype("int") alone truncates
# toward zero, which would turn a mean-imputed 53.9 into 53 instead of 54.
treated_df["Age"] = treated_df["Age"].round().astype("int")
# width-independent check (the default integer width differs across platforms)
assert pd.api.types.is_integer_dtype(treated_df["Age"])


In [None]:
# Round before casting so that a fractional value is never silently truncated
# by astype("int").
treated_df["Children"] = treated_df["Children"].round().astype("int")
# width-independent check (the default integer width differs across platforms)
assert pd.api.types.is_integer_dtype(treated_df["Children"])


## D2: Justification of Mitigation Methods

## D3: Summary of the Outcomes

In [None]:
# Histogram of every numeric column of the cleaned data; the trailing
# semicolon keeps the array-of-axes repr out of the cell output.
treated_df.hist(figsize=(25, 25));


## D4: Mitigation Code 

In [None]:
# Persist the cleaned dataset to the working directory, without the index.
treated_df.to_csv("clean_data.csv", index=False)
# NOTE(review): presumably writes the encoders collected in `encodings` to CSV
# so encoded values can be decoded later — confirm in data_cleaning_lib.
lib.write_encodings_csv(encodings)


In [None]:
# example decode an encoded value by feature name and encoded value
# NOTE(review): assumes `encodings` maps a column name to a fitted encoder that
# exposes inverse_transform (e.g. sklearn's LabelEncoder) — confirm in
# data_cleaning_lib.
encodings["Complication_risk"].inverse_transform([2])


## D5: Clean Data

## D6: Limitations

## D7: Impact of the Limitations

## E1. Principal Components

In [None]:
# Independent copies of the cleaned data: `pca_df` feeds the correlation
# heatmap, `treated_pca_df` is reduced to the PCA input in the cells below.
pca_df = treated_df.copy()
treated_pca_df = treated_df.copy()

# summary of the non-numeric columns; displayed explicitly because only the
# last expression of a cell is rendered
display(pca_df.describe(exclude="number"))
pca_df.shape


### Correlation Heatmap

In [None]:
cmap = sns.diverging_palette(h_neg=10, h_pos=240, as_cmap=True)
fig, ax = plt.subplots(figsize=(25, 10))

# Correlate numeric columns only: on pandas >= 2.0 DataFrame.corr() raises on
# string columns instead of dropping them silently.
corr_df = pca_df.select_dtypes(include=["number", "bool"]).corr()

# hide the upper triangle (including the diagonal), which mirrors the lower one
mask = np.triu(np.ones_like(corr_df, dtype=bool))

sns.heatmap(
    corr_df,
    center=0,
    cmap=cmap,
    annot=True,
    fmt=".1f",
    vmin=-1.0,
    vmax=1.0,
    mask=mask,
    linewidths=0.5,
    ax=ax,
);


In [None]:
# String (object-dtype) columns cannot take part in PCA: remember them, then
# remove them from the PCA frame. Labels that are already absent are ignored.
str_cols = treated_df.select_dtypes(include="object")
treated_pca_df = treated_pca_df.drop(columns=str_cols.columns, errors="ignore")

treated_pca_df.shape


In [None]:
# The 15 columns with the largest variance (computed on unscaled values).
treated_pca_df.var().sort_values(ascending=False).head(15)


In [None]:
# Columns selected as input features for the principal component analysis.
patient_features = [
    "Additional_charges",
    "Full_meals_eaten",
    "Complication_risk",
    "Children",
    "Education",
    "Doc_visits",
    "Zip",
    "Lng",
    "Lat",
    "Population",
    "Timezone",
    "Services",
    "ReAdmis",
    "Initial_admin",
    "Initial_days",
    "TotalCharge",
    "Age",
    "VitD_levels",
    "HighBlood",
    "Item1",
    "Item2",
    "Item3",
    "Item4",
    "Item5",
    "Item6",
    "Item7",
    "Item8",
]

# variance of each selected feature on its original (unscaled) values
treated_pca_df[patient_features].var()


In [None]:
# create a dataframe using a subset of numerical features
treated_pca_feats_df = treated_pca_df[patient_features].copy()


In [None]:
# VarianceThreshold is already imported in the notebook's import cell.
vt_df = treated_pca_feats_df.copy()

# Feature selector that removes all low-variance features based on a variance
# threshold. Variances are computed on the unscaled columns.
sel = VarianceThreshold(threshold=0.75)

# only the fitted selector is needed here, not the transformed array
sel.fit(vt_df)

# boolean array of shape [# input features], where an element is True if its
# corresponding feature is selected for retention.
mask = sel.get_support()

# retained columns
retain_cols = vt_df.columns[mask]

# dropped columns
drop_cols = vt_df.columns[~mask]

# show both tables; only the last expression of a cell is rendered by itself
display(pd.DataFrame({"retained": retain_cols}))
pd.DataFrame({"dropped fields": drop_cols})


#### An optimal number of components

In [None]:
# Standardize the features, then fit a PCA that keeps every component.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("reducer", PCA())])
pipe.fit(treated_pca_feats_df)

# fitted PCA step and the quantities derived from it
pca = pipe.named_steps["reducer"]
num_components = pca.components_.shape[0]
var = pca.explained_variance_ratio_

# scree plot: explained variance ratio per (0-based) component index
fig, axs = plt.subplots(1, 1, figsize=(15, 5))
axs.set_title("scree plot graph")
sns.lineplot(data=var, ax=axs)

axs.set_xlabel("Principal component")
axs.set_ylabel("Explained variance ratio")
axs.set_xticks(np.arange(0, num_components, 1))

plt.show()


In [None]:
# running total of the explained variance ratio across the components
print(np.cumsum(pca.explained_variance_ratio_))


#### Loadings Table

In [None]:
# create dataframe from principal components
# correlation coefficients between the features and the principal components
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i}" for i in range(1, num_components + 1)],
    index=treated_pca_feats_df.columns,
)
loadings


## E2. Criteria Used

In [None]:
# Standardize the features and build their covariance matrix
# (X^T X / n_samples on the standardized data).
test_pca_normalized = StandardScaler().fit_transform(treated_pca_feats_df)

cov_matrix = np.dot(
    test_pca_normalized.T, test_pca_normalized / treated_pca_feats_df.shape[0]
)

# eigenvalue of each principal axis v, obtained as v^T · C · v
eigenvalues = [
    np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector))
    for eigenvector in pca.components_
]

fig, ax = plt.subplots(figsize=(15, 5))

ax.plot(eigenvalues, linewidth=2)
ax.set_xticks(np.arange(0, treated_pca_feats_df.shape[1], 1))
# Kaiser cut-off
ax.axhline(y=1, color="gray", linestyle="--")
ax.set_xlabel("Principal component index")
# the curve shows eigenvalues, not explained-variance ratios
ax.set_ylabel("Eigenvalue")

cs = pca.explained_variance_ratio_.cumsum()
evs = np.array(eigenvalues)
num_pcs_to_use = int((evs >= 1).sum())

# The Kaiser rule: drop all components with eigenvalues < 1.0
print(f"Based on the Kaiser Rule, we'll keep the first {num_pcs_to_use} PCs")


## E3. Benefits

# Part IV. Supporting Documents
---

## F. Video

## G. Web Sources

## H. Sources/References