# Course: Data Engineering
# **Practical Exercise: Implementing ETL Using Python for a Healthcare Application**
# Prepared by: Georges Assaf



<a href="https://colab.research.google.com/github/gassaf2/DataEngineering/blob/main/week3/Practical Exercise ImplementingETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1) Extract Data

In [21]:
import pandas as pd

# Extract data from the CSV file
patients_df = pd.read_csv('./sample_data/patients.csv')
print("Extracted Patient Data:")
print(patients_df)

Extracted Patient Data:
    patient_id             name  age  gender
0         P001      James Smith   45    Male
1         P002     Mary Johnson   32  Female
2         P003  Robert Williams   56    Male
3         P004   Patricia Brown   29  Female
4         P005       John Jones   67    Male
..         ...              ...  ...     ...
195       P196     Emily Brooks   41  Female
196       P197      Jack Fisher   29    Male
197       P198       Judith Lee   50  Female
198       P199       Sean Kelly   38    Male
199       P200  Rebecca Sanders   57  Female

[200 rows x 4 columns]


In [22]:
# Simulated API response for diagnostic data
diagnostic_data=[
{"diagnostic_id":"D001","patient_id":"P001","test":"Blood Test","result":"Normal"},

{"diagnostic_id":"D002","patient_id":"P002","test":"X−Ray","result":"Fracture"},
{"diagnostic_id":"D003","patient_id":"P003","test":"MRI","result":"Normal"}
]
print("Extracted Diagnostic Data:")
print(diagnostic_data)

Extracted Diagnostic Data:
[{'diagnostic_id': 'D001', 'patient_id': 'P001', 'test': 'Blood Test', 'result': 'Normal'}, {'diagnostic_id': 'D002', 'patient_id': 'P002', 'test': 'X−Ray', 'result': 'Fracture'}, {'diagnostic_id': 'D003', 'patient_id': 'P003', 'test': 'MRI', 'result': 'Normal'}]


In [25]:
# Convert sales_data to a DataFrame
diagnostic_df = pd.DataFrame(diagnostic_data)
diagnostic_df

Unnamed: 0,diagnostic_id,patient_id,test,result
0,D001,P001,Blood Test,Normal
1,D002,P002,X−Ray,Fracture
2,D003,P003,MRI,Normal


# 2) Transform the data

Clean patient data: Let’s assume you need to filter out patients who are younger than 40
years old for a specific study.

In [28]:
#extracting only the patient with age > 40 years
transformed_patients_df = patients_df[patients_df["age"] <= 40]
transformed_patients_df

Unnamed: 0,patient_id,name,age,gender
1,P002,Mary Johnson,32,Female
3,P004,Patricia Brown,29,Female
5,P006,Linda Garcia,40,Female
6,P007,Michael Miller,23,Male
8,P009,William Rodriguez,38,Male
...,...,...,...,...
187,P188,Anna Bryant,30,Female
190,P191,Patrick Russell,37,Male
192,P193,Dennis Hamilton,33,Male
196,P197,Jack Fisher,29,Male


<br>Enrich diagnostic data with patient information: Join the diagnostics data with
patient details (name, age, gender) to provide context for the test results

In [29]:
# Merge dataframes on 'patient_id'
transformed_diagnostic_df= pd.merge(diagnostic_df, patients_df[['patient_id','name','age','gender']], on="patient_id", how="left")
transformed_diagnostic_df

Unnamed: 0,diagnostic_id,patient_id,test,result,name,age,gender
0,D001,P001,Blood Test,Normal,James Smith,45,Male
1,D002,P002,X−Ray,Fracture,Mary Johnson,32,Female
2,D003,P003,MRI,Normal,Robert Williams,56,Male


# 3)Load Data into MongoDB

Creating the connection to MongoDB

In [30]:
from pymongo import MongoClient
from datetime import datetime

#connection_string="mongodb+srv://gassaf2:dbUserPassword@products.g02gx.mongodb.net/?retryWrites=true&w=majority&appName=products"
connection_string="mongodb+srv://gassaf2:dbUserPassword@cluster0.xjx2q.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# Connect to the MongoDB Atlas cluster
client = MongoClient(connection_string)

# Access a specific database
db = client['health_db']

<br><br>Inserting the patients data into MongoDB  

In [31]:
# Access a the collection patients within the database
patients = db['patients']

#Load the data in patients

# Convert DataFrame to dictionary format
patients_dict = patients_df.to_dict(orient="records")

# Insert the patients data into MongoDB
patients.insert_many(patients_dict)

InsertManyResult([ObjectId('67a08e4f1c9e3529c1342725'), ObjectId('67a08e4f1c9e3529c1342726'), ObjectId('67a08e4f1c9e3529c1342727'), ObjectId('67a08e4f1c9e3529c1342728'), ObjectId('67a08e4f1c9e3529c1342729'), ObjectId('67a08e4f1c9e3529c134272a'), ObjectId('67a08e4f1c9e3529c134272b'), ObjectId('67a08e4f1c9e3529c134272c'), ObjectId('67a08e4f1c9e3529c134272d'), ObjectId('67a08e4f1c9e3529c134272e'), ObjectId('67a08e4f1c9e3529c134272f'), ObjectId('67a08e4f1c9e3529c1342730'), ObjectId('67a08e4f1c9e3529c1342731'), ObjectId('67a08e4f1c9e3529c1342732'), ObjectId('67a08e4f1c9e3529c1342733'), ObjectId('67a08e4f1c9e3529c1342734'), ObjectId('67a08e4f1c9e3529c1342735'), ObjectId('67a08e4f1c9e3529c1342736'), ObjectId('67a08e4f1c9e3529c1342737'), ObjectId('67a08e4f1c9e3529c1342738'), ObjectId('67a08e4f1c9e3529c1342739'), ObjectId('67a08e4f1c9e3529c134273a'), ObjectId('67a08e4f1c9e3529c134273b'), ObjectId('67a08e4f1c9e3529c134273c'), ObjectId('67a08e4f1c9e3529c134273d'), ObjectId('67a08e4f1c9e3529c13427

<br><br>Inserting the diagnostic data into MongoDB

In [32]:
# Access a the collection patients within the database
diagnostic = db['diagnostic']

#Load the data in patients

# Convert DataFrame to dictionary format
diagnostic_dict = diagnostic_df.to_dict(orient="records")

# Insert the patients data into MongoDB
diagnostic.insert_many(diagnostic_dict)

InsertManyResult([ObjectId('67a08e981c9e3529c13427ed'), ObjectId('67a08e981c9e3529c13427ee'), ObjectId('67a08e981c9e3529c13427ef')], acknowledged=True)

# 4) Automating the ETL Process

In [42]:
def extract_patients():
    return pd.read_csv('./sample_data/patients.csv')

In [43]:
def extract_diagnostic():
    return pd.DataFrame(diagnostic_data)  

In [44]:
def transform_patients(patients_df):
    patients_df = patients_df[patients_df["age"] <= 40]
    return patients_df    

In [45]:
def transform_diagnostic(transformed_patients_df,diagnotic_df):
    return pd.merge(diagnostic_df, patients_df[['patient_id','name','age','gender']], on="patient_id", how="left")

In [46]:
def load_data(patients_df,diagnostic_df):
    db.patients.insert_many(patients_df.to_dict(orient='records'))
    db.diagnostic.insert_many(diagnostic_df.to_dict(orient='records'))                            

In [48]:
#Run the ETL process
patients_df=extract_patients()
diagnostic_df=extract_diagnostic()
transformed_patients_df=transform_patients(patients_df)
transformed_diagnostic_df=transform_diagnostic(patients_df,diagnostic_df)
load_data(transformed_patients_df,transformed_diagnostic_df)
print("ETL Process Completed")

ETL Process Completed
