In [18]:
import os
import pandas as pd
import pytz
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
from collections import defaultdict
from django.apps import apps as django_apps
from django.db.models import F, Max, Case, When, Sum
from edc_constants.constants import POS, NEG
from flourish_caregiver.helper_classes import MaternalStatusHelper
from tqdm import tqdm

In [19]:
def check_exposure(subject_identifier):
    child_consent = CaregiverChildConsent.objects.filter(
        subject_identifier=subject_identifier).first()
    maternal_status = MaternalStatusHelper(subject_identifier=subject_identifier)
    child_dataset = getattr(child_consent, 'child_dataset', None)
    if child_dataset:
        return getattr(child_dataset, 'infant_hiv_exposed', None).upper()
    else:
        hiv_status = getattr(maternal_status, 'hiv_status', None)
        return f'ANC_{hiv_status}'
    

In [20]:
def convert_to_title_case(snake_case_string):
    title_case_string = snake_case_string.replace("_", " ").title()
    return title_case_string

In [21]:
# Enrolment cohort categorization
enrol_exposure_summary = defaultdict(lambda: defaultdict(int))
enrol_qs = Cohort.objects.filter(enrollment_cohort=True).values('subject_identifier', 'name')

# Overall counts
enrol_overall = enrol_qs.values('name').annotate(count=Count('name'))
overall_df = pd.DataFrame(list(enrol_overall))

# Exposure counts
for cohort in tqdm(enrol_qs):
    exposure = check_exposure(cohort.get('subject_identifier', None))
    cohort_name = convert_to_title_case(cohort.get('name', None))
    enrol_exposure_summary[cohort_name][exposure] += 1

enrol_summary = [{'cohort_name': name, 'exposure': exposure, 'count': count}
                 for name, exposures in enrol_exposure_summary.items()
                 for exposure, count in exposures.items()]
enrol_summary_df = pd.DataFrame(enrol_summary)

100%|██████████| 1193/1193 [00:49<00:00, 24.33it/s]


In [22]:
overall_df

Unnamed: 0,name,count
0,cohort_c_sec,124
1,cohort_b_sec,332
2,cohort_c,100
3,cohort_a,433
4,cohort_b,204


In [23]:
enrol_summary_df

Unnamed: 0,cohort_name,exposure,count
0,Cohort C Sec,EXPOSED,112
1,Cohort C Sec,UNEXPOSED,12
2,Cohort B Sec,EXPOSED,331
3,Cohort B Sec,UNEXPOSED,1
4,Cohort C,EXPOSED,63
5,Cohort C,UNEXPOSED,37
6,Cohort A,ANC_UNK,283
7,Cohort A,UNEXPOSED,34
8,Cohort A,EXPOSED,116
9,Cohort B,EXPOSED,130


In [24]:
# Current cohort categorization
current_exposure_summary = defaultdict(lambda: defaultdict(int))

qs = Cohort.objects.filter(
    subject_identifier=OuterRef('subject_identifier')).order_by('-assign_datetime')
latest_cohorts = Cohort.objects.filter(pk=Subquery(qs.values('pk')[:1]))

# Overall counts
current_overall = latest_cohorts.values('name').annotate(count=Count('name'))
curr_overall_df = pd.DataFrame(list(current_overall))

for cohort in tqdm(latest_cohorts):
    exposure = check_exposure(cohort.subject_identifier)
    cohort_name = convert_to_title_case(cohort.name)
    current_exposure_summary[cohort_name][exposure] += 1

cuurent_summary = [{'cohort_name': name, 'exposure': exposure, 'count': count}
                   for name, exposures in current_exposure_summary.items()
                   for exposure, count in exposures.items()]
curr_summary_df = pd.DataFrame(cuurent_summary)

100%|██████████| 1181/1181 [00:48<00:00, 24.52it/s]


In [25]:
curr_overall_df

Unnamed: 0,name,count
0,cohort_c_sec,249
1,cohort_b,223
2,cohort_c,222
3,cohort_a,318
4,cohort_b_sec,169


In [26]:
curr_summary_df

Unnamed: 0,cohort_name,exposure,count
0,Cohort C Sec,EXPOSED,248
1,Cohort C Sec,UNEXPOSED,1
2,Cohort C,EXPOSED,122
3,Cohort C,UNEXPOSED,100
4,Cohort A,ANC_UNK,283
5,Cohort A,EXPOSED,33
6,Cohort A,UNEXPOSED,2
7,Cohort B Sec,EXPOSED,166
8,Cohort B Sec,UNEXPOSED,3
9,Cohort B,EXPOSED,172


In [27]:
# Re-classification counts
final_classification = []
cohorts = ['cohort_a', 'cohort_b', 'cohort_c', 'cohort_a_sec', 'cohort_b_sec', 'cohort_c_sec']
# Get the latest cohort `name` for each subject_identifier
qs = Cohort.objects.filter(
    subject_identifier=OuterRef('subject_identifier')).order_by('-assign_datetime')
latest_names = Cohort.objects.filter(pk=Subquery(qs.values('pk')[:1])).values('subject_identifier', 'name')

for cohort in cohorts:
    enrol_cohort = cohort
    cohort_copy = cohorts.copy()
    cohort_copy.remove(enrol_cohort)
    for curr_cohort in cohort_copy:
        # Get subject identifiers for enrol cohort `name`
        enrol_cohort_sidx = Cohort.objects.filter(
            enrollment_cohort=True, name=enrol_cohort).values_list('subject_identifier', flat=True)

        # Filter subject identifiers that had enrol_cohort `name` → current_cohort `name`
        filtered_sidx = latest_names.filter(
            name=curr_cohort,
            subject_identifier__in=list(enrol_cohort_sidx)
        ).values_list('subject_identifier', flat=True)

        # Calculate the total count for these subject identifiers
        total_count = filtered_sidx.count()
        final_classification.append({'enrolment_cohort': enrol_cohort, 'current_cohort': curr_cohort, 'count': total_count})

In [28]:
pd.DataFrame(final_classification)

Unnamed: 0,enrolment_cohort,current_cohort,count
0,cohort_a,cohort_b,84
1,cohort_a,cohort_c,0
2,cohort_a,cohort_a_sec,0
3,cohort_a,cohort_b_sec,31
4,cohort_a,cohort_c_sec,0
5,cohort_b,cohort_a,0
6,cohort_b,cohort_c,77
7,cohort_b,cohort_a_sec,0
8,cohort_b,cohort_b_sec,0
9,cohort_b,cohort_c_sec,17
