In [2]:
import pandas as pd

In [12]:
# import the cleaned datasets
df_clinic = pd.read_csv("./Data Sets/clinic_cleaned.csv")
print(len(df_clinic))
df_household = pd.read_csv("./Data Sets/household_cleaned.csv")
print(len(df_household))
# Convert the date columns to datetime format
df_clinic['enrollment_date'] = pd.to_datetime(df_clinic['enrollment_date'], format='%m/%d/%Y')
df_clinic['discharge_date'] = pd.to_datetime(df_clinic['discharge_date'], format='%m/%d/%Y')
df_household['todate'] = pd.to_datetime(df_household['todate'], format='%m/%d/%Y')

6888
1040


# 1. Summary Tables

## a. Total number of male and female patients, by clinic 

In [13]:
df_clinic.groupby(['site', 'sex']).size().unstack()

sex,f,m
site,Unnamed: 1_level_1,Unnamed: 2_level_1
bani_gaye,564,541
birin_fulani,989,1034
dukku,859,816
galdamari,225,243
kwami,823,794


## b. Average absolute weight gain during the treatment program, by clinic

In [14]:
df_clinic['abs_weight_gain'] = df_clinic['discharge_weight']-df_clinic['enrollment_weight']

In [15]:
df_clinic.groupby('site')['abs_weight_gain'].mean().reset_index().set_index('site')

Unnamed: 0_level_0,abs_weight_gain
site,Unnamed: 1_level_1
bani_gaye,1.520182
birin_fulani,1.934587
dukku,1.672679
galdamari,1.746255
kwami,1.891685


In [16]:
# calculate the average absolute weight gain for only the dataset that has positive value of enrollment weight and discharge weight.
df_clinic_positive_gain = df_clinic[df_clinic['enrollment_weight']<df_clinic['discharge_weight']]
print(len(df_clinic_positive_gain))                                 
df_clinic_positive_gain.groupby('site')['abs_weight_gain'].mean().reset_index().set_index('site')

6750


Unnamed: 0_level_0,abs_weight_gain
site,Unnamed: 1_level_1
bani_gaye,1.576016
birin_fulani,2.013291
dukku,1.739781
galdamari,1.785086
kwami,1.964736


## c. Average relative weight gain, measured in grams per kilogram per day (g/kg/day), by clinic

In [17]:
# Calculate the days difference
df_clinic['days_difference'] = (df_clinic['discharge_date'] - df_clinic['enrollment_date']).dt.days

In [18]:
df_clinic['rel_weight_gain'] = df_clinic['abs_weight_gain']*1000/df_clinic['enrollment_weight']/df_clinic['days_difference']

In [19]:
df_clinic.groupby('site')['rel_weight_gain'].mean().reset_index().set_index('site')

Unnamed: 0_level_0,rel_weight_gain
site,Unnamed: 1_level_1
bani_gaye,5.310804
birin_fulani,6.461895
dukku,6.96656
galdamari,6.385592
kwami,7.768612


# 2. Discharge Criteria

## a. At each clinic, how many patients were erroneously discharged (i.e., they were discharged even though they did not meet the criterion specified above)? 

In [20]:
# Create the indicator for erroneously discharged patients
# Children who are >6 months should NOT be discharged from the treatment program if their weight is <4 kg. 
df_clinic['error_discharged'] = ((df_clinic['discharge_age'] > 6) & (df_clinic['discharge_weight'] < 4)).astype(int)

## b. Write a brief plan (5-10 bullet points) outlining a plan to reduce the number of erroneous discharges in 2025. 

- Identify the causes of erroneous discharges using the existing dataset.
    - For instance, we can compare the proportion of erroneous discharges across different sites, child sex, and the number of staff.
    - From the given dataset, 'Galdamari' turns out to be the site with the lowest erroneous discharge rate, so we can investigate which factors contribute to this and try to replicate them elsewhere.
- Determine if the erroneous discharges occur because the caregiver no longer wants the follow-up treatment.
    - If this is the case, we should investigate financial or cultural factors that might lead caregivers to make this decision.
- Improve the staff training system and ensure they recognize the discharge criteria.
- Digitalize the system to ensure that only children who meet specific criteria can be discharged. This will help staff easily identify children eligible for discharge.
- Ensure that more than two staff members cross-check the data before discharging any child.

# 3. Comparing data

## a. How many discrepancies occurred in August, September, and October at each clinic? 

In [21]:
# Merge df_clinic to df_household based on the common column 'pid'
df_merged = pd.merge(df_household, df_clinic[['pid', 'discharge_date', 'discharge_weight', 'staffmember']], 
                     on='pid', how='inner')

In [22]:
# A discrepancy in weight is defined as a difference in weight >1.0 kg
# for two measurements occurring within seven days of each other.
df_merged['weight_difference'] = abs(df_merged['discharge_weight']-df_merged['weight'])
df_merged['days_from_discharge'] = (df_merged['todate'] - df_merged['discharge_date']).dt.days
df_merged['discrepancy'] = ((df_merged['weight_difference']> 1) & (df_merged['days_from_discharge'] <=7)).astype(int)

In [23]:
# Create a month varibale
df_merged['month'] = df_merged['todate'].dt.month

In [24]:
df_merged[df_merged['month'].isin([8,9,10])].groupby(['site','month'])['discrepancy'].sum().unstack()


month,8,9,10
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bani_gaye,1,5,3
birin_fulani,15,13,6
dukku,8,3,2
galdamari,1,0,0
kwami,10,5,1


## b. Rank the clinic staff members by number of discrepancies. 

In [25]:
df_merged.groupby('staffmember')['discrepancy'].sum().reset_index().sort_values(by='discrepancy', ascending=False)

Unnamed: 0,staffmember,discrepancy
3,5,20
4,6,8
5,7,8
8,14,8
0,1,6
1,2,5
11,23,5
7,10,4
9,17,4
10,20,4


## c. Write a brief plan (5-10 bullet points) outlining a plan to identify and address the cause(s) of discrepancies. 

1. Plan to Identify the Causes
- Perform a comprehensive review of the discrepancies, focusing on how each variable is associated with the number of discrepancies (e.g., number of staff members, sites, sex of the child).
- Validate the data for any errors or inconsistencies in entry that may be contributing to discrepancies (e.g., typos, incorrect values).
- Add variables when conducting new surveys to identify the suspected causes of discrepancies, especially related to staff performance (e.g., duration of training received by each staff member, test scores representing how well each staff member understands the survey process).
2. Plan to Address the Causes
- Implement a double-check system for data entry, where two staff members independently verify patient information before finalizing it.
- Integrate automated checks and validation rules into the data entry system.
- Provide regular training sessions to ensure that all staff members are up-to-date with the latest protocols and data entry procedures.