In [1]:
import os
import pandas as pd
import pytz
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
from collections import defaultdict
from django.apps import apps as django_apps
from django.db.models import F, Max, Case, When, Sum
from edc_constants.constants import POS, NEG
from flourish_reports.views.enrolment.enrollment_report_mixin import EnrolmentReportMixin
from tqdm import tqdm

In [2]:
# Shared report helper: later cells read its `enrollment_report`, `current_report`
# and `get_sequence` attributes to build the summary tables.
report_mixin = EnrolmentReportMixin()

In [3]:
def convert_to_title_case(snake_case_string):
    """Turn a snake_case identifier into a space-separated Title Case label.

    Every underscore becomes a single space (consecutive underscores yield
    consecutive spaces) and the result is passed through ``str.title``.
    """
    words = snake_case_string.split("_")
    return " ".join(words).title()

In [4]:
# Cohort categorisation as it stood at enrolment, rendered as a table.
# `enrollment_report` is read as an attribute of the mixin and handed to pandas as-is.
enrol_total = report_mixin.enrollment_report
enrol_summary_df = pd.DataFrame(data=enrol_total)
enrol_summary_df

Unnamed: 0,cohort_name,unexposed,exposed
0,Cohort A,180,264
1,Cohort B,77,118
2,Cohort C,34,46
3,Cohort A Sec,0,0
4,Cohort B Sec,1,348
5,Cohort C Sec,12,116


In [5]:
# enrol_summary_df.to_excel("enrollment_summary_report.xlsx", index=False)

In [6]:
# Cohort categorisation as it stands now, rendered as a table.
# `current_report` is read as an attribute of the mixin and handed to pandas as-is.
current_total = report_mixin.current_report
current_summary_df = pd.DataFrame(data=current_total)
current_summary_df

Unnamed: 0,cohort_name,unexposed,exposed
0,Cohort A,148,180
1,Cohort B,55,200
2,Cohort C,100,103
3,Cohort A Sec,0,0
4,Cohort B Sec,0,139
5,Cohort C Sec,0,270


In [7]:
# current_summary_df.to_excel("current_summary_report.xlsx", index=False)

In [8]:
def convert_to_regular_dict(d):
    """Recursively replace ``defaultdict`` instances with plain ``dict`` objects.

    Only values that are themselves ``defaultdict`` are descended into; any
    other object (including a plain ``dict`` that happens to hold a
    ``defaultdict``) is returned unchanged, as the very same object.
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: convert_to_regular_dict(value) for key, value in d.items()}

In [9]:
# Re-classification counts: flatten the two-level mapping
# {enrolment cohort: {current cohort: count}} into one row per cohort pair.
sequence_dict = convert_to_regular_dict(report_mixin.get_sequence)
final_classification = [
    {'enrollment_cohort': enrol_cohort, 'current_cohort': curr_cohort, 'count': total}
    for enrol_cohort, movements in sequence_dict.items()
    for curr_cohort, total in movements.items()
]

recategorization_df = pd.DataFrame(final_classification)
recategorization_df

Unnamed: 0,enrollment_cohort,current_cohort,count
0,cohort_b,cohort_c,78
1,cohort_b,cohort_c_sec,15
2,cohort_b_sec,cohort_b,60
3,cohort_b_sec,cohort_c,34
4,cohort_b_sec,cohort_c_sec,139
5,cohort_c_sec,cohort_c,11
6,cohort_a,cohort_b,93
7,cohort_a,cohort_b_sec,23


In [10]:
# recategorization_df.to_excel("recategorization_report.xlsx", index=False)
# recategorization_df

In [13]:
# Per-participant enrolment vs. current cohort table.
# Despite the `cohort_a_df` name, this covers every enrolment cohort: the queryset
# below filters only on `enrollment_cohort=True`, not on the cohort name.
# NOTE(review): `Cohort` is not imported or defined in any visible cell, so this
# cell raises NameError after Restart & Run All — restore the cell that defines it.
enrol_cohorts = Cohort.objects.filter(enrollment_cohort=True)

# Load the current-cohort rows once instead of issuing one query per participant.
# Rows arrive newest-first within each subject, so the first row seen for a subject
# is the one a per-subject `.order_by('-assign_datetime').first()` would return.
latest_current_by_subject = {}
for current in Cohort.objects.filter(current_cohort=True).order_by(
        'subject_identifier', '-assign_datetime'):
    latest_current_by_subject.setdefault(current.subject_identifier, current)

records = []
for cohort in tqdm(enrol_cohorts):
    # None when the participant has no row flagged as current.
    curr_cohort = latest_current_by_subject.get(cohort.subject_identifier)
    consent = cohort.caregiver_child_consent
    study_identifier = consent.study_child_identifier
    protocol = consent.get_protocol
    records.append({'subject_identifier': cohort.subject_identifier,
                    'prev_study_identifier': study_identifier,
                    # A falsy protocol is reported under the 'ANC' label.
                    'prev_study_name': protocol or 'ANC',
                    'enrol_cohort': cohort.name,
                    'current_cohort': getattr(curr_cohort, 'name', None),
                    'exposure_status': cohort.exposure_status, })

cohort_a_df = pd.DataFrame(records)

100%|██████████| 1196/1196 [01:38<00:00, 12.19it/s]


In [14]:
# Write the per-participant table to an Excel file (path is relative to the
# kernel's working directory) without the index column, then display the frame.
# NOTE(review): the table carries participant identifiers — confirm the file and
# the rendered output are safe to share before committing the notebook.
cohort_a_df.to_excel("cohort_summary_table.xlsx", index=False)
cohort_a_df

Unnamed: 0,subject_identifier,prev_study_identifier,prev_study_name,enrol_cohort,current_cohort,exposure_status
0,B142-040990291-2-10,056-1984090-3-10,Mpepu,cohort_b,cohort_c_sec,EXPOSED
1,B142-040990747-3-10,B014260-2,Tshipidi,cohort_c,cohort_c,UNEXPOSED
2,B142-040990925-5-10,,ANC,cohort_a,cohort_a,UNEXPOSED
3,B142-040990367-0-10,085-40990296-6-10,Tshilo Dikotla,cohort_a,cohort_b,UNEXPOSED
4,B142-040990540-2-10,B005496-2,Mma Bana,cohort_c,cohort_c,EXPOSED
...,...,...,...,...,...,...
1191,B142-040990709-3-10,B006980-4,Mma Bana,cohort_c_sec,cohort_c_sec,EXPOSED
1192,B142-040990103-9-10,056-4995281-4-10,Mpepu,cohort_b,cohort_b,EXPOSED
1193,B142-040990445-4-10,,ANC,cohort_a,cohort_a,UNEXPOSED
1194,B142-040990921-4-10,,ANC,cohort_a,cohort_a,UNEXPOSED


In [75]:
# Enrollment `cohort_a` count by exposure and study
grouped_a = cohort_a_df.groupby(['prev_study_name', 'exposure_status'])
td_exposed = grouped_a.get_group(('Tshilo Dikotla', 'EXPOSED'))['subject_identifier'].count()
anc_exposed = grouped_a.get_group(('ANC', 'EXPOSED'))['subject_identifier'].count()
print(f'TD HEU, {td_exposed}')
print(f'ANC HEU, {anc_exposed}')
print(f'Total HEU, {td_exposed + anc_exposed}')

TD HEU, 116
ANC HEU, 148
Total HEU, 264


In [73]:
# Enrolment-level HUU (unexposed) counts per previous study.
# Relies on `grouped_a` still being the (prev_study_name, exposure_status) grouping
# from the preceding cell: the two-element keys below only match that grouping.
td_unexposed, anc_unexposed = (
    grouped_a.get_group((study, 'UNEXPOSED'))['subject_identifier'].count()
    for study in ('Tshilo Dikotla', 'ANC')
)
print(f'TD HUU, {td_unexposed}')
print(f'ANC HUU, {anc_unexposed}')
print(f'Total HUU, {td_unexposed + anc_unexposed}')

TD HUU, 34
ANC HUU, 146
Total HUU, 180


In [74]:
# Current `cohort_a` count by exposure and study
grouped_a = cohort_a_df.groupby(['prev_study_name', 'current_cohort', 'exposure_status'])
td_exposed = grouped_a.get_group(('Tshilo Dikotla', 'cohort_a', 'EXPOSED'))['subject_identifier'].count()
anc_exposed = grouped_a.get_group(('ANC', 'cohort_a', 'EXPOSED'))['subject_identifier'].count()
print(f'TD HEU, {td_exposed}')
print(f'ANC HEU, {anc_exposed}')
print(f'Total HEU, {td_exposed + anc_exposed}')

TD HEU, 32
ANC HEU, 148
Total HEU, 180


In [70]:
# HUU (unexposed) participants whose current cohort is `cohort_a`, per previous study.
# Relies on `grouped_a` being the three-key
# (prev_study_name, current_cohort, exposure_status) grouping from the preceding cell.
td_unexposed, anc_unexposed = (
    grouped_a.get_group((study, 'cohort_a', 'UNEXPOSED'))['subject_identifier'].count()
    for study in ('Tshilo Dikotla', 'ANC')
)
print(f'TD HUU, {td_unexposed}')
print(f'ANC HUU, {anc_unexposed}')
print(f'Total HUU, {td_unexposed + anc_unexposed}')

TD HUU, 2
ANC HUU, 146
Total HUU, 148


In [71]:
# Recategorization summary: for each previous study, count the participants who
# enrolled in cohort_a and now sit in a different cohort, split by exposure status.
study_names = ['Tshilo Dikotla', 'ANC']

# Keep only cohort_a enrolments that have moved, so the "cohort_a → …" label printed
# below is true by construction. Rows with no current cohort (None) are dropped
# rather than being treated as a destination cohort.
moved_from_a_df = cohort_a_df[
    (cohort_a_df['enrol_cohort'] == 'cohort_a')
    & cohort_a_df['current_cohort'].notna()
    & (cohort_a_df['current_cohort'] != 'cohort_a')
]

# Sorted so the printed order is stable between runs (set iteration order is not).
cohort_names = sorted(moved_from_a_df['current_cohort'].unique())

# {(study, current cohort, exposure): participant count}. Combinations with no rows
# are simply absent, so each lookup below falls back to 0 on its own — a missing
# EXPOSED group no longer prevents the UNEXPOSED count from being read.
movement_counts = (
    moved_from_a_df
    .groupby(['prev_study_name', 'current_cohort', 'exposure_status'])['subject_identifier']
    .count()
    .to_dict()
)

for study_name in study_names:
    for cohort in cohort_names:
        heu_count = movement_counts.get((study_name, cohort, 'EXPOSED'), 0)
        huu_count = movement_counts.get((study_name, cohort, 'UNEXPOSED'), 0)
        # Stay silent for destinations nobody from this study moved into.
        if heu_count or huu_count:
            print(f'HEU {study_name}: cohort_a → {cohort}: {heu_count}')
            print(f'HUU {study_name}: cohort_a → {cohort}: {huu_count}')
            print('\n')

HEU Tshilo Dikotla: cohort_a → cohort_b_sec: 23
HUU Tshilo Dikotla: cohort_a → cohort_b_sec: 0


HEU Tshilo Dikotla: cohort_a → cohort_b: 61
HUU Tshilo Dikotla: cohort_a → cohort_b: 32


