# HealthCare Dataset Explorer

## Imports

In [1]:
import pandas as pd
from IPython.display import display, Markdown
from decimal import Decimal
from datetime import datetime
from bson.decimal128 import Decimal128

## Functions

In [2]:
def dmd(text):
    """Render ``text`` as Markdown in the notebook output area."""
    rendered = Markdown(text)
    display(rendered)


def process_mask(df, mask, column, replace=False):
    """Handle the rows of ``df`` flagged by ``mask`` for ``column``, in place.

    The flagged rows are either dropped or have ``column`` overwritten, and a
    Markdown report of what was done is displayed through ``dmd``.

    Args:
        df: DataFrame to clean; it is modified in place.
        mask: Boolean mask aligned with ``df``; True marks an invalid row.
        column: Name of the column being validated.
        replace: ``False`` (the default) drops the flagged rows. Any other
            value, including ``None``, ``0`` or ``""``, is written into
            ``column`` on the flagged rows instead.

    Returns:
        None.
    """
    flagged = df[mask]
    if len(flagged) == 0:
        dmd(f"Nothing wrong with {column}")
        return

    # Capture the textual dump before mutating df so the report shows the
    # offending values as they were found.
    rows_to_log = flagged.to_string()

    # Identity test on purpose: `0 != False` is False in Python, so an equality
    # test would silently treat replace=0 as "drop the rows".
    if replace is not False:
        # One .loc assignment writes into df itself; df[mask][column] = ...
        # would only modify a temporary copy.
        df.loc[mask, column] = replace
        txt = f"Replace {column} in column having incorrect value by {replace}  \n{rows_to_log}  "
    else:
        df.drop(flagged.index, inplace=True)
        txt = f"Exclude lines for {column} not correct  \n{rows_to_log}  "
    # Report in both branches (replacement and exclusion).
    dmd(txt)
def convert_to_int(val, dft= None):
    """Convert ``val`` to ``int``, returning ``dft`` when that is not possible.

    Args:
        val: Value to convert (typically a string read from the CSV).
        dft: Fallback returned when the conversion fails. Defaults to None.

    Returns:
        ``int(val)`` on success, otherwise ``dft``. A message is printed for
        every failed conversion.
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinite floats: int(float("inf")) raises it
        # rather than ValueError, and would otherwise abort the whole .apply().
        print(f"Erreur de conversion:entière {val}")
        return dft

def convert_to_float(val, dft=None):
    """Convert ``val`` to ``float``, returning ``dft`` when that is not possible.

    Args:
        val: Value to convert.
        dft: Fallback returned when the conversion fails. Defaults to None.

    Returns:
        ``float(val)`` on success, otherwise ``dft``. The failure and its
        cause are printed.
    """
    try:
        # Plain Python float; the original author notes this is a
        # MongoDB-compatible conversion.
        number = float(val)
    except (ValueError, TypeError) as e:
        print(f"Erreur de conversion décimale: {val} ({e})")
        number = dft
    return number
    
def convert_to_date(val, format = '%Y-%m-%d' , dft= None):
    """Parse ``val`` into a ``datetime`` using ``format``.

    Args:
        val: Text to parse.
        format: ``strptime`` pattern; defaults to ISO ``YYYY-MM-DD``.
        dft: Fallback returned when parsing fails. Defaults to None.

    Returns:
        The parsed ``datetime`` on success, otherwise ``dft``. A message is
        printed for every value that cannot be parsed.
    """
    try:
        parsed = datetime.strptime(val, format)
    except (ValueError, TypeError):
        # ValueError: text does not match the pattern; TypeError: not a string.
        print(f"Erreur de conversion date: {val}")
        return dft
    else:
        return parsed

## Extract data

In [3]:
# Read every column as text; numeric and date columns are converted
# explicitly in the "Transform data" section.
df_hc = pd.read_csv(
    "../data/healthcare_dataset.csv",
    dtype=str,
)

In [7]:
# Write the rows at positions 1 to 4 (all columns) to a small CSV under ../tests.
# The DataFrame index is written as the first, unnamed column.
df_hc.iloc[1:5].to_csv("../tests/dataset1.csv")

In [8]:
# Show the same four rows as a {column: {index: value}} dictionary.
print(df_hc.iloc[1:5].to_dict())

{'Name': {1: 'LesLie TErRy', 2: 'DaNnY sMitH', 3: 'andrEw waTtS', 4: 'adrIENNE bEll'}, 'Age': {1: '62', 2: '76', 3: '28', 4: '43'}, 'Gender': {1: 'Male', 2: 'Female', 3: 'Female', 4: 'Female'}, 'Blood Type': {1: 'A+', 2: 'A-', 3: 'O+', 4: 'AB+'}, 'Medical Condition': {1: 'Obesity', 2: 'Obesity', 3: 'Diabetes', 4: 'Cancer'}, 'Date of Admission': {1: '2019-08-20', 2: '2022-09-22', 3: '2020-11-18', 4: '2022-09-19'}, 'Doctor': {1: 'Samantha Davies', 2: 'Tiffany Mitchell', 3: 'Kevin Wells', 4: 'Kathleen Hanna'}, 'Hospital': {1: 'Kim Inc', 2: 'Cook PLC', 3: 'Hernandez Rogers and Vang,', 4: 'White-White'}, 'Insurance Provider': {1: 'Medicare', 2: 'Aetna', 3: 'Medicare', 4: 'Aetna'}, 'Billing Amount': {1: '33643.327286577885', 2: '27955.096078842456', 3: '37909.78240987528', 4: '14238.317813937623'}, 'Room Number': {1: '265', 2: '205', 3: '450', 4: '458'}, 'Admission Type': {1: 'Emergency', 2: 'Emergency', 3: 'Elective', 4: 'Urgent'}, 'Discharge Date': {1: '2019-08-26', 2: '2022-10-07', 3: '20

In [None]:
# Summary statistics for every column, whatever its dtype.
df_hc.describe(include="all")

In [None]:
# One column name per Markdown line (two trailing spaces force a line break).
dmd("  \n".join(df_hc.columns.tolist()))

## Transform data
### Name

In [None]:
# Rows without a patient name are excluded (no replacement value is given).
col = 'Name'
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

### Age

In [None]:
# Parse ages with convert_to_int (its default fallback is None), then flag
# ages below 0 or above 120 and ask process_mask to overwrite them with None.
col = 'Age'
df_hc[col] = df_hc[col].apply(convert_to_int)
mask = df_hc[col].lt(0) | df_hc[col].gt(120)
process_mask(df=df_hc, mask=mask, column=col, replace=None)

### Gender

In [None]:
# Any value other than "Male" / "Female" is flagged, with "Other" requested
# as the replacement.
col = 'Gender'
mask = ~df_hc[col].isin(["Male", "Female"])
process_mask(df=df_hc, mask=mask, column=col, replace="Other")

### Blood Type

In [None]:
# Values outside the eight ABO/Rh groups listed below are flagged, with the
# "NA" placeholder requested as the replacement.
col = 'Blood Type'
possible_types = [
    "A+", "A-",
    "AB+", "AB-",
    "B+", "B-",
    "O+", "O-",
]
mask = ~df_hc[col].isin(possible_types)
process_mask(df=df_hc, mask=mask, column=col, replace="NA")

### Date of Admission

In [None]:
# Parse admission dates (default pattern %Y-%m-%d); rows whose date is
# missing after parsing are excluded.
col = 'Date of Admission'
df_hc[col] = df_hc[col].apply(convert_to_date)
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

### Doctor

In [None]:
# Missing doctor names are flagged, with the "NA" placeholder requested.
col = "Doctor"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace="NA")

### Hospital

In [None]:
# Missing hospital names are flagged, with the "NA" placeholder requested.
col = "Hospital"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace="NA")

### Room Number

In [None]:
# Parse room numbers with convert_to_int; values missing after parsing are
# flagged, with 0 requested as the replacement.
col = "Room Number"
df_hc[col] = df_hc[col].apply(convert_to_int)
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace=0)

### Admission Type

In [None]:
# Rows without an admission type are excluded.
col = "Admission Type"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

### Discharge Date

In [None]:
# Parse discharge dates (default pattern %Y-%m-%d); rows whose date is missing
# after parsing are kept, with None requested as the value.
col = 'Discharge Date'
df_hc[col] = df_hc[col].apply(convert_to_date)
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace=None)

### Insurance Provider

In [None]:
# Missing insurance providers are flagged, with the "NA" placeholder requested.
col = "Insurance Provider"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace="NA")

### Billing Amount

In [None]:
# Parse billing amounts as floats rounded to two decimals; rows whose amount
# is missing after parsing are excluded.
col = "Billing Amount"
parsed_amounts = df_hc[col].apply(convert_to_float)
df_hc[col] = parsed_amounts.round(2)
del parsed_amounts  # keep the notebook namespace as it was
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

### Medical Condition

In [None]:
# Rows without a medical condition are excluded.
col = "Medical Condition"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

### Medication

In [None]:
# Missing medications are flagged, with the "NA" placeholder requested.
col = "Medication"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col, replace="NA")

### Test Results

In [None]:
# Rows without test results are excluded.
col = "Test Results"
mask = df_hc[col].isna()
process_mask(df=df_hc, mask=mask, column=col)

## Deduplication

### Keep only the last row of each group of duplicates

In [19]:
# Two rows are duplicates when they agree on all of these columns.
unic_subset = [
    'Name',
    'Gender',
    'Date of Admission',
    'Hospital',
    'Doctor',
    'Medical Condition',
]
# keep='last' marks every occurrence except the last one as a duplicate.
mask = df_hc.duplicated(subset=unic_subset, keep='last')
# Display the rows that the deduplication will discard.
df_hc.loc[mask].sort_values(by='Name')

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
17083,AARon smITh,79,Male,A-,Cancer,2019-11-21,Gina Jacobs,Weber-Warren,Medicare,21879.766347982884,155,Emergency,2019-12-20,Ibuprofen,Inconclusive
3040,AAron ArCHER,47,Female,B-,Cancer,2021-01-10,Cynthia Villanueva,"Montes Case and Mendez,",Medicare,10602.077185418817,108,Urgent,2021-01-17,Paracetamol,Inconclusive
40305,ABIGAiL wateRS,34,Female,O+,Asthma,2023-11-13,Sandra Boyer,"Rogers Martin Prince, and",UnitedHealthcare,3121.1961382341997,421,Elective,2023-11-22,Aspirin,Inconclusive
2018,ABIgAIL tucKeR,66,Male,B+,Cancer,2020-08-10,Christopher Avery,Perry and Sons,Blue Cross,1773.4557157416407,480,Elective,2020-08-26,Paracetamol,Normal
42407,ABIgaIL YOung,41,Female,O+,Hypertension,2022-12-15,Edward Kramer,Moore-Mcdaniel,UnitedHealthcare,1983.5682967188222,192,Elective,2023-01-13,Ibuprofen,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22540,zacHARy WiLLIams,70,Female,O+,Arthritis,2024-01-16,Carol Shepard,Chen-Munoz,Cigna,36948.486063085526,118,Emergency,2024-02-15,Aspirin,Abnormal
27622,zacHary ThompSoN,31,Female,O+,Asthma,2019-08-22,Elizabeth Taylor,"Carter Stephens Johnson, and",Cigna,32574.144763009994,229,Emergency,2019-09-10,Aspirin,Abnormal
43838,zacHary fLOrEs,65,Male,AB-,Cancer,2020-02-06,Zachary Hogan,Santos-Fletcher,Blue Cross,20296.118846418543,163,Elective,2020-02-19,Penicillin,Inconclusive
15565,zachAry Brown,69,Female,A+,Obesity,2023-10-13,Rebecca Martin,PLC Garcia,UnitedHealthcare,26421.875105241168,394,Emergency,2023-11-10,Aspirin,Inconclusive


In [None]:
# Keep the rows not marked as duplicates, ordered by patient name.
df = df_hc.loc[~mask].sort_values(by=['Name'])
df

# MongoDB Tests

## Function

In [None]:
import pymongo

# Local MongoDB instance; documents are written to the "test" database.
mongo_test = pymongo.MongoClient("mongodb://localhost:27017/")
db_test = mongo_test["test"]

# Layout of one MongoDB document: sub-document name -> source columns.
# NOTE(review): this name shadows the builtin `map`; inject_row reads it, so
# renaming has to be done in both places at once.
map = dict(
    patient=['Name', 'Age', 'Gender', "Blood Type"],
    admission=['Date of Admission', 'Doctor', 'Hospital', 'Room Number', 'Admission Type', 'Discharge Date'],
    billing=['Insurance Provider', 'Billing Amount'],
    care=['Medical Condition', 'Medication', 'Test Results'],
)
def inject_row(row_dict,db=None):
    """Insert one flat row into the ``healthcare`` collection as a nested document.

    The row's fields are grouped into sub-documents according to the
    module-level ``map`` (sub-document name -> list of field names).

    Args:
        row_dict: Mapping of column name to value for a single row.
        db: Database to write to. When None, the module-level ``db_test``
            is used.

    Raises:
        KeyError: If ``row_dict`` lacks a field listed in ``map``.
    """
    # Compare with None explicitly: pymongo Database objects do not support
    # truth-value testing, so `db or db_test` would raise.
    target_db = db_test if db is None else db
    doc = {
        subdoc: {field: row_dict[field] for field in fields}
        for subdoc, fields in map.items()
    }
    target_db.healthcare.insert_one(doc)


In [None]:
# Insert the deduplicated frame `df`, one MongoDB document per row. Iterating
# `df_hc` here would also insert the rows that the deduplication step
# filtered out.
for i, row_dict in df.iterrows():
    inject_row(row_dict.to_dict())