# Example of merging DataFrames: range joins via SQLite, key joins via pandas `merge`

In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Sample data: one synthetic patient table plus the lookup tables used to diagnose it.

# A fixed seed makes the random patient measurements (and everything derived from
# them) identical on every Restart & Run All.
SEED = 42
N_PATIENTS = 50
rng = np.random.default_rng(SEED)

# One row per patient. The upper bound of rng.integers is exclusive, so e.g.
# BloodPressure falls in 80..179 and Smoking is a 0/1 flag.
patient_df = pd.DataFrame({
    'PatientID': range(1, N_PATIENTS + 1),
    'BloodPressure': rng.integers(80, 180, N_PATIENTS),
    'Smoking': rng.integers(0, 2, N_PATIENTS),
    'WhiteBloodCellCount': rng.integers(300, 800, N_PATIENTS),
    'CholesterolLevel': rng.integers(5, 30, N_PATIENTS),
})

# Blood-pressure bands: a reading is assigned the ID whose Low..High range contains it.
BloodPressure_range_df = pd.DataFrame({
    'BloodPressureDiagnosisID': range(3),
    'BloodPressureLow': [0, 121, 141],
    'BloodPressureHigh': [120, 140, 5000],
})

# Diagnosis text for each blood-pressure band ID.
BloodPressureDiagnosis_df = pd.DataFrame({
    'BloodPressureDiagnosisID': range(3),
    'BloodPressureDiagnosis': ['No treatment needed', 'Exercise Daily', "You're screwed!"],
})

# Diagnosis text keyed directly on the Smoking flag (no range lookup needed).
SmokingDiagnosis_df = pd.DataFrame({
    'Smoking': [0, 1],
    'SmokingDiagnosis': ['Good job!', 'STOP IT NOW DUMMY!'],
})

# White-blood-cell-count bands.
WhiteBloodCell_range_df = pd.DataFrame({
    'WhiteBloodCellDiagnosisID': [0, 1],
    'WhiteBloodCellLow': [0, 501],
    'WhiteBloodCellHigh': [500, 10000],
})

# Cholesterol-level bands.
Cholesterol_range_df = pd.DataFrame({
    'CholesterolDiagnosisID': [0, 1],
    'CholesterolLow': [0, 16],
    'CholesterolHigh': [15, 100],
})

# Diagnosis text for every (white-blood-cell band, cholesterol band) combination.
CombinedWhiteBloodCellAndCholesterolDiagnosis_df = pd.DataFrame({
    'WhiteBloodCellDiagnosisID': [0, 0, 1, 1],
    'CholesterolDiagnosisID': [0, 1, 0, 1],
    'CombinedWhiteBloodCellAndCholesterolDiagnosis': [
        'Nice',
        'Take more walks',
        'Eat more chocolate',
        'Strike first, strike hard, no mercy!',
    ],
})


In [3]:
# Preview the first rows only instead of dumping the whole patient table.
patient_df.head(10)

Unnamed: 0,PatientID,BloodPressure,Smoking,WhiteBloodCellCount,CholesterolLevel
0,1,152,0,659,10
1,2,135,0,660,14
2,3,138,0,481,13
3,4,117,1,361,26
4,5,164,0,469,26
5,6,83,0,652,26
6,7,97,1,394,5
7,8,159,0,330,7
8,9,120,1,386,23
9,10,105,0,634,21


In [4]:
# Blood-pressure bands: BloodPressureLow/BloodPressureHigh bounds for each BloodPressureDiagnosisID.
BloodPressure_range_df

Unnamed: 0,BloodPressureDiagnosisID,BloodPressureLow,BloodPressureHigh
0,0,0,120
1,1,121,140
2,2,141,5000


In [5]:
# Diagnosis text for each BloodPressureDiagnosisID.
BloodPressureDiagnosis_df

Unnamed: 0,BloodPressureDiagnosisID,BloodPressureDiagnosis
0,0,No treatment needed
1,1,Exercise Daily
2,2,You're screwed!


In [6]:
# Diagnosis text for each value of the Smoking flag (0 or 1).
SmokingDiagnosis_df

Unnamed: 0,Smoking,SmokingDiagnosis
0,0,Good job!
1,1,STOP IT NOW DUMMY!


In [7]:
# White-blood-cell-count bands: WhiteBloodCellLow/WhiteBloodCellHigh bounds for each WhiteBloodCellDiagnosisID.
WhiteBloodCell_range_df

Unnamed: 0,WhiteBloodCellDiagnosisID,WhiteBloodCellLow,WhiteBloodCellHigh
0,0,0,500
1,1,501,10000


In [8]:
# Cholesterol-level bands: CholesterolLow/CholesterolHigh bounds for each CholesterolDiagnosisID.
Cholesterol_range_df

Unnamed: 0,CholesterolDiagnosisID,CholesterolLow,CholesterolHigh
0,0,0,15
1,1,16,100


In [9]:
# Diagnosis text for each (WhiteBloodCellDiagnosisID, CholesterolDiagnosisID) pair.
CombinedWhiteBloodCellAndCholesterolDiagnosis_df

Unnamed: 0,WhiteBloodCellDiagnosisID,CholesterolDiagnosisID,CombinedWhiteBloodCellAndCholesterolDiagnosis
0,0,0,Nice
1,0,1,Take more walks
2,1,0,Eat more chocolate
3,1,1,"Strike first, strike hard, no mercy!"


In [10]:
# DataFrame.merge joins on equal key values, so the range lookups
# (value BETWEEN low AND high, both bounds inclusive) are done in SQL against a
# throwaway in-memory SQLite database. The equality lookups are left to pandas merges.

# Each LEFT JOIN tags a patient with the ID of the band containing the measurement;
# a measurement that falls in no band would get NULL for that ID.
query = """
SELECT patient_df.*,
       BloodPressure_range_df.BloodPressureDiagnosisID,
       WhiteBloodCell_range_df.WhiteBloodCellDiagnosisID,
       Cholesterol_range_df.CholesterolDiagnosisID
FROM patient_df
LEFT JOIN BloodPressure_range_df
       ON patient_df.BloodPressure
          BETWEEN BloodPressure_range_df.BloodPressureLow AND BloodPressure_range_df.BloodPressureHigh
LEFT JOIN WhiteBloodCell_range_df
       ON patient_df.WhiteBloodCellCount
          BETWEEN WhiteBloodCell_range_df.WhiteBloodCellLow AND WhiteBloodCell_range_df.WhiteBloodCellHigh
LEFT JOIN Cholesterol_range_df
       ON patient_df.CholesterolLevel
          BETWEEN Cholesterol_range_df.CholesterolLow AND Cholesterol_range_df.CholesterolHigh
"""

conn = sqlite3.connect(':memory:')
try:
    # Copy the frames the query needs into SQLite. index=False keeps the DataFrame
    # index out of the tables, so `patient_df.*` returns only the real columns.
    patient_df.to_sql("patient_df", conn, index=False)
    BloodPressure_range_df.to_sql("BloodPressure_range_df", conn, index=False)
    WhiteBloodCell_range_df.to_sql("WhiteBloodCell_range_df", conn, index=False)
    Cholesterol_range_df.to_sql("Cholesterol_range_df", conn, index=False)

    new_patient_df = pd.read_sql_query(query, conn)
finally:
    # The result is already materialised as a DataFrame; release the connection
    # (and the in-memory database) even if loading or querying failed.
    conn.close()

# A LEFT JOIN never drops patients, so a different row count means a measurement
# matched more than one band, i.e. the range tables overlap.
assert len(new_patient_df) == len(patient_df), "range tables overlap: a patient matched more than one band"

new_patient_df.head(10)

Unnamed: 0,PatientID,BloodPressure,Smoking,WhiteBloodCellCount,CholesterolLevel,BloodPressureDiagnosisID,WhiteBloodCellDiagnosisID,CholesterolDiagnosisID
0,1,152,0,659,10,2,1,0
1,2,135,0,660,14,1,1,0
2,3,138,0,481,13,1,0,0
3,4,117,1,361,26,0,0,1
4,5,164,0,469,26,2,0,1
5,6,83,0,652,26,0,1,1
6,7,97,1,394,5,0,0,0
7,8,159,0,330,7,2,0,0
8,9,120,1,386,23,0,0,1
9,10,105,0,634,21,0,1,1


In [11]:
# Attach the human-readable diagnoses with ordinary key-equality merges.
# validate='many_to_one' makes pandas raise a MergeError if a lookup table contains
# duplicate keys, which would otherwise silently duplicate patient rows.
diagnosed_patient_df = (
    new_patient_df
    .merge(
        BloodPressureDiagnosis_df,
        how='left',
        on=['BloodPressureDiagnosisID'],
        validate='many_to_one',
    )
    .merge(
        SmokingDiagnosis_df,
        how='left',
        on=['Smoking'],
        validate='many_to_one',
    )
    .merge(
        CombinedWhiteBloodCellAndCholesterolDiagnosis_df,
        how='left',
        on=['WhiteBloodCellDiagnosisID', 'CholesterolDiagnosisID'],
        validate='many_to_one',
    )
)

diagnosed_patient_df.head(10)

Unnamed: 0,PatientID,BloodPressure,Smoking,WhiteBloodCellCount,CholesterolLevel,BloodPressureDiagnosisID,WhiteBloodCellDiagnosisID,CholesterolDiagnosisID,BloodPressureDiagnosis,SmokingDiagnosis,CombinedWhiteBloodCellAndCholesterolDiagnosis
0,1,152,0,659,10,2,1,0,You're screwed!,Good job!,Eat more chocolate
1,2,135,0,660,14,1,1,0,Exercise Daily,Good job!,Eat more chocolate
2,3,138,0,481,13,1,0,0,Exercise Daily,Good job!,Nice
3,4,117,1,361,26,0,0,1,No treatment needed,STOP IT NOW DUMMY!,Take more walks
4,5,164,0,469,26,2,0,1,You're screwed!,Good job!,Take more walks
5,6,83,0,652,26,0,1,1,No treatment needed,Good job!,"Strike first, strike hard, no mercy!"
6,7,97,1,394,5,0,0,0,No treatment needed,STOP IT NOW DUMMY!,Nice
7,8,159,0,330,7,2,0,0,You're screwed!,Good job!,Nice
8,9,120,1,386,23,0,0,1,No treatment needed,STOP IT NOW DUMMY!,Take more walks
9,10,105,0,634,21,0,1,1,No treatment needed,Good job!,"Strike first, strike hard, no mercy!"
