In [None]:
!pip install missingno

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from IPython.core.interactiveshell import InteractiveShell
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA    

# pandas display limits: show up to 100 columns / 100 rows before truncating
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# render floats with 3 fixed decimals instead of scientific notation
pd.set_option("display.float_format", "{:.3f}".format)

# seaborn theme: white background, no top/right spines, "Set2" palette
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
}
sns.set_theme(style="white", palette="Set2", rc=custom_params)

# module-level dict; starts empty
encodings = dict()

In [2]:
from data_cleaning_lib import (
    create_boxplots,
    create_hist_and_boxplots,
    plot_categorical_counts,
    create_category_encodings,
    write_encodings_csv,
    create_na_mask,
)

# Part 0: Data Exploration

In [None]:
# path to the raw medical CSV, relative to the notebook's working directory
filename = "./src/Medical Data/medical_raw_data.csv"
# load the raw records into the working dataframe
df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# A bare expression only renders when it is the LAST line of a cell, so the
# dtypes are displayed explicitly here.
display(df.dtypes)
print(df.shape, end="\n\n")
print(df.columns, end="\n\n")
print(df.index, end="\n\n")
# df.info() prints its summary itself (non-null counts, dtypes, memory usage)
df.info()

In [None]:
# preview the first five records
df.head(n=5)

In [None]:
# summary statistics for the dataframe's columns
df.describe()

### Missing Values

In [None]:
# percentage of missing values per column, largest first
missing_counts_by_col = df.isna().sum()
missing_pct_by_col = missing_counts_by_col / len(df) * 100
missing_pct_by_col.sort_values(ascending=False)

# Part I: Research Question
---

## A. Question or Decision ✅

**Describe one question or decision that you will address using the data set you chose. The summarized question or decision must be relevant to a realistic organizational need or situation.**

To understand where the Acme Hospital Chain should focus its readmission-reduction efforts, the analyst team is tasked with answering the following question: what relationship(s) exist between patient readmission and the patient's geographical area type (rural, urban, or suburban)?

## B. Required Variables ✅
**Describe the variables in the data set and indicate the specific type of data being described. Use examples from the data set that support your claims.**

In [None]:
# Build a small data dictionary: one row per feature, showing a sample value
# next to its Python and pandas data types.
# See "D206 Data Cleaning_ Medical Data Considerations and Dictionary" for feature descriptions.

# first record, flipped so that every feature becomes a row (sample lives in column 0)
sample_by_feature = df.head(1).transpose()

# Python type of each sample value
sample_by_feature["Python Data Type"] = sample_by_feature[0].apply(type)

# pandas dtype of each column (aligned on the feature name)
sample_by_feature["Pandas Data Type"] = df.dtypes

# move the feature names out of the index and give the columns readable names
df_tp = sample_by_feature.reset_index().rename(
    columns={'index': 'Feature', 0: "Sample Value"}
)

# show the table in a reader-friendly column order (df_tp itself is not reordered)
display_order = ["Feature", "Pandas Data Type", "Python Data Type", "Sample Value"]
df_tp.reindex(columns=display_order)

# Part II: Data-Cleaning Plan
---

Explain the plan for cleaning the data by doing the following:
 1. Examine the entire dataset (regardless of your research question).
 2. Detection and Treatment of Duplicates, Missing Values and Outliers (check for outliers for all numeric variables).
 3. Re-express Categorical Variables, if possible (this is optional, but suggested)
 4. Perform PCA (with numerical variables only)

## C1: Plan to Find Anomalies
Propose a plan that includes the relevant techniques and specific steps needed to identify anomalies in the data set.

### Understanding the dataset

In [None]:
# A bare `df.shape` in the middle of a cell is evaluated but never shown,
# so display it explicitly before printing the info summary.
display(df.shape)
df.info()

In [None]:
# column labels followed by summary statistics, then a preview of the first rows
display(df.columns, df.describe())
df.head()

In [None]:
# number of missing values per column, largest first, shown as a one-column table
df.isnull().sum().sort_values(ascending=False).to_frame()

### Detect Duplicates

In [None]:
# rows that are exact duplicates of an earlier row (all columns compared)
df.loc[df.duplicated()]

In [None]:
# rows that repeat an earlier row's combination of the three identifier columns
identifier_cols = ['Customer_id', 'Interaction', 'UID']
df.loc[df.duplicated(subset=identifier_cols)]

In [None]:
# True when 'Unnamed: 0' holds exactly the same value as 'CaseOrder' on every row
df['Unnamed: 0'].eq(df['CaseOrder']).all()

### Detect Missing Values

In [None]:
# names of the columns that contain at least one missing value
has_missing = df.isnull().any()
missing_cols = df.columns[has_missing]

# sub-frame restricted to those columns (reused by the nullity plots below)
missing_df = df.loc[:, missing_cols]

# missing count and missing percentage (relative to all rows) per column
missing_counts = missing_df.isnull().sum()
pct_missing_df = pd.DataFrame({
    'missing cnt': missing_counts,
    'missing %': missing_counts / len(df.index) * 100,
})

# show the columns with the most missing values first
pct_missing_df.sort_values(by='missing cnt', ascending=False)

#### Missing Matrix

In [None]:
# nullity matrix for the columns that have missing data;
# assigning to `_` keeps the returned object out of the cell output
_ = msno.matrix(
    missing_df,
    figsize=(35, 10),
)

#### Missing Bar Chart

In [None]:
# nullity bar chart over every column of the full dataframe
_ = msno.bar(
    df,
    color="gray",
    figsize=(25, 20),
)

#### Missing Heatmap

In [None]:
# Heatmap of nullity correlation between columns, computed on the first 250 records.
#  -1: when one variable is present, the other is very likely to be missing.
#   0: the missingness of the two variables is unrelated.
#   1: when one variable is present, the other is very likely to be present too.
# `.iloc[:250]` selects exactly 250 rows; label-based `.loc[:250]` is inclusive
# of the end label and would select 251 rows on a default integer index.
_ = msno.heatmap(missing_df.iloc[:250], figsize=(25,10))

#### Missing Dendrogram

In [None]:
# dendrogram: hierarchical grouping of the columns that have missing data
_ = msno.dendrogram(
    missing_df,
    figsize=(25, 5),
    orientation='bottom',
)

### Detect Outliers

#### Z-Scores
find columns with addressable outliers

In [None]:
def get_zscore_cols(df, threshold=3):
    """Return the names of numeric columns that contain at least one outlier.

    A value counts as an outlier when the absolute value of its z-score
    exceeds ``threshold``. Only ``int64`` and ``float64`` columns are examined.

    The z-scores are computed with pandas (column mean and population standard
    deviation, ``ddof=0``, missing values skipped), which matches
    ``scipy.stats.zscore(..., nan_policy='omit')`` while guaranteeing a
    DataFrame result with column labels regardless of the installed scipy
    version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; non-numeric columns are ignored.
    threshold : float, default 3
        Absolute z-score above which a value is treated as an outlier.

    Returns
    -------
    pandas.Index
        Labels of the columns with one or more outliers, in frame order.
    """
    numeric_df = df.select_dtypes(include=['int64', 'float64'])

    # NaN z-scores (missing values, zero-variance columns) compare False below,
    # so they are never counted as outliers
    z_scores = (numeric_df - numeric_df.mean()) / numeric_df.std(ddof=0)
    is_outlier = z_scores.abs() > threshold

    # number of outliers per column; keep only columns with at least one
    col_outlier_counts = is_outlier.sum()
    outlier_counts = col_outlier_counts[col_outlier_counts > 0]

    # outlier column names
    return outlier_counts.index

# numerical columns with notable outliers that we're choosing to ignore
ignored_outlier_cols = ['Lat', 'Lng', 'Population', 'Age', 'Income', 'TotalCharge', 'Additional_charges', 'Doc_visits']

# Outlier columns that still need to be addressed. Filtering with a
# comprehension (instead of a set difference) keeps the dataframe's column
# order, so tables and plots come out in the same order on every kernel restart.
z_score_cols = [col for col in get_zscore_cols(df) if col not in ignored_outlier_cols]

# columns with addressable outliers
z_score_cols

#### store z-score values for addressable columns

In [None]:
# working copy holding each addressable column next to its z-score
zscore_df = df[z_score_cols].copy()

# per-column tables of the rows whose |z-score| exceeds 3
# (previously this selection was computed inside the loop and discarded)
zscore_outliers = {}

for col in z_score_cols:
    zscore_col = f"{col}_zscore"
    zscore_df[zscore_col] = stats.zscore(df[col], nan_policy='omit')

    # store values where their absolute z-score exceeds 3
    is_outlier = zscore_df[zscore_col].abs() > 3
    zscore_outliers[col] = zscore_df.loc[is_outlier, [col, zscore_col]]

# display zscore values
zscore_df.loc[:, zscore_df.columns.str.contains("zscore")]

#### Boxplots

In [None]:
# Boxplots restricted to the columns flagged with known, relevant outliers.
# (Alternative kept for reference: plot every column except the row counters via
#  create_boxplots(df.loc[:, ~df.columns.isin(['Unnamed: 0', 'CaseOrder'])]))
create_boxplots(df.loc[:, z_score_cols])

#### Histograms

In [None]:
# NOTE(review): the original call used `create_histograms`, which is neither
# imported from data_cleaning_lib nor defined in this notebook, so it raises
# NameError on a fresh kernel. `create_hist_and_boxplots` IS imported; this
# assumes it accepts a dataframe like `create_boxplots` does — confirm against
# data_cleaning_lib.
create_hist_and_boxplots(df.loc[:, ~df.columns.isin(['Unnamed: 0', 'CaseOrder'])])

In [None]:
# pandas' built-in histogram grid (6 rows x 5 columns of subplots);
# assigning to `_` keeps the returned axes out of the cell output
_ = df.hist(
    figsize=(25, 15),
    layout=(6, 5),
)

### Re-Expression Categories
convert categorical string values to category data type

In [None]:
# convert "Soft_drink" nan's to 'No'
df.loc[df["Soft_drink"].isna(), "Soft_drink"] = "No"

# convert "Overweight" float64's to "Yes"/"No" strings (missing values become "No")
df["Overweight"] = df["Overweight"].map({np.nan: "No", 0: "No", 1: "Yes"})

# validate a soft drink value exists for each record
# (the previous check compared index lengths, which is always true)
assert df["Soft_drink"].notna().all(), "Soft_drink still contains missing values"

# categorical columns and potential categorical columns
ctg_cols = [
    "Allergic_rhinitis",
    "Area",
    "Arthritis",
    "Asthma",
    "BackPain",
    "Complication_risk",
    "Diabetes",
    "Education",
    "Employment",
    "Gender",
    "HighBlood",
    "Hyperlipidemia",
    "Initial_admin",
    "Marital",
    "Overweight",
    "ReAdmis",
    "Reflux_esophagitis",
    "Services",
    "Soft_drink",
    "Stroke",
    "Timezone",
]

# convert every listed column in a single pass; calling astype once per column
# inside the loop copied the whole dataframe on each iteration
df = df.astype({col: "category" for col in ctg_cols})

for col in ctg_cols:
    print(f"--- CONVERTING COLUMN: '{col}' to category ---")

    # confirm the values successfully converted
    assert df[col].dtype == "category"

    # report categorical counts
    print(df[col].value_counts(), end="\n\n")

### Cleaning text data



In [None]:
# strip leading/trailing whitespace from every object (string) column
str_cols = df.select_dtypes('object')
df[str_cols.columns] = str_cols.apply(lambda s: s.str.strip())

# validate all state abbreviations are 2 characters
assert (df['State'].str.len() == 2).all()

# validate zip codes are 3, 4 or 5 digits
zip_lengths = df['Zip'].astype('str').str.len()
assert zip_lengths.between(3, 5).all()

# cross field validation: validate 3 digit zip codes are Puerto Rican addresses
assert (df.loc[zip_lengths == 3, 'State'] == 'PR').all()

# Identifier columns: show the distribution of value lengths, then validate.
# The counts are displayed explicitly; as bare mid-cell expressions they were
# computed and silently discarded.

# validate all Customer_id values are 6 or 7 characters
display(df["Customer_id"].str.len().value_counts())
assert df['Customer_id'].str.len().between(6,7).all()

# validate all Interaction values are 36 characters
display(df["Interaction"].str.len().value_counts())
assert (df['Interaction'].str.len() == 36).all()

# validate all UID values are 32 characters
display(df["UID"].str.len().value_counts())
assert (df['UID'].str.len() == 32).all()

### Other Data Cleaning

In [None]:
# Verify "Unnamed: 0" is a row-for-row duplicate of "CaseOrder"; if this passes,
# "Unnamed: 0" is redundant and can be dropped.
# (The previous check used .ne(), which asserted the columns differ on every row.)
assert df["Unnamed: 0"].eq(df["CaseOrder"]).all(), '"Unnamed: 0" and "CaseOrder" differ'

In [None]:
# cast to int if this fails
# is_integer_dtype accepts any integer width; comparing against the string
# 'int' resolves to the platform's default integer size and can differ by OS
assert pd.api.types.is_integer_dtype(df['Age']), f"Age has dtype {df['Age'].dtype}, expected an integer dtype"

In [None]:
# cast to int if this fails
# is_integer_dtype accepts any integer width; comparing against the string
# 'int' resolves to the platform's default integer size and can differ by OS
assert pd.api.types.is_integer_dtype(df['Children']), f"Children has dtype {df['Children'].dtype}, expected an integer dtype"

## C2: Justification of Approach

## C3: Justification of Tools