# Population-level trends for Client Outcome Measures

## Changes in average (over clients) scores for ATOM Assessment Survey questions.

## Steps:

1. Extract the data - from the database or from a pre-prepared parquet file
2. Processing: if not pulling from pre-cleaned data: Clean & Transform (incl. categorize) the data
    - Clean data - remove rows with missing data : PDCSubstanceOrGambling
    - Transform data - expand PDC, determine Program, categorize fields, drop Notes/Comments fields, rename PartitionKey to SLK.
    - Limit the data to the period of interest - i.e. only clients who have completed at least one survey during the period of interest.
    - Limit by only clients who have completed the survey at least three times (min-stage: 3)
3. Calculate the average score for each client for each stage and for each of the questions of interest.


In [2]:

# Step 0: Importing the libraries
from utils.df_xtrct_prep import extract_prep_data

# from statsutil.funcs import get_all_results
# from utils.io import write_results_to_files
# from graphing import get_chart_for_qna_list
from data_config import EstablishmentID_Program

In [3]:
import os
import pandas as pd

# List of column names in the CSV
column_names = ['ESTABLISHMENT IDENTIFIER', 'GEOGRAPHICAL LOCATION', 'PMSEpisodeID', 'PMSPersonID', 'DOB', 'DOB STATUS', 'SEX', 'COUNTRY OF BIRTH', 'INDIGENOUS STATUS', 'PREFERRED LANGUAGE', 'SOURCE OF INCOME', 'LIVING ARRANGEMENT', 'USUAL ACCOMMODATION', 'CLIENT TYPE', 'PRINCIPAL DRUG OF CONCERN', 'PDCSubstanceOfConcern', 'ILLICIT USE', 'METHOD OF USE PRINCIPAL DRUG', 'INJECTING DRUG USE', 'SETTING', 'CommencementDate', 'POSTCODE', 'SOURCE OF REFERRAL', 'MAIN SERVICE', 'EndDate', 'END REASON', 'REFERRAL TO ANOTHER SERVICE', 'FAMILY NAME', 'GIVEN NAME', 'MIDDLE NAME', 'TITLE', 'SLK', 'MEDICARE NUMBER', 'PROPERTY NAME', 'UNIT FLAT NUMBER', 'STREET NUMBER', 'STREET NAME', 'SUBURB']

# List of columns we care about
columns_of_interest = ['ESTABLISHMENT IDENTIFIER', 'GEOGRAPHICAL LOCATION', 'PMSEpisodeID', 'PMSPersonID', 'PDCSubstanceOfConcern', 'CommencementDate', 'EndDate', 'SLK']


def load_and_parse_csvs(directory):
    """Load every ``*.csv`` file in *directory* into one episodes DataFrame.

    Each file is read without a header row using ``column_names``, reduced to
    ``columns_of_interest``, and has its ``CommencementDate`` / ``EndDate``
    columns parsed as ``ddmmYYYY`` dates. Values that cannot be parsed become
    ``NaT`` and are counted in a printed warning.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A DataFrame with the ``columns_of_interest`` columns and a fresh
        0..n-1 index. If the folder contains no CSV files, an empty DataFrame
        with those columns is returned instead of raising.
    """
    date_columns = ('CommencementDate', 'EndDate')
    # Read these as text: numeric inference would strip the leading zero from
    # dates such as "07042020", turn date columns with blanks into floats, and
    # convert all-digit establishment identifiers to integers.
    text_columns = ('ESTABLISHMENT IDENTIFIER', 'CommencementDate', 'EndDate', 'SLK')

    frames = []

    # Sorted so the row order of the result is reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(directory, filename)
        raw = pd.read_csv(filepath, header=None, names=column_names,
                          dtype={col: str for col in text_columns})
        # Copy so the date assignments below never write into a view of `raw`.
        df = raw[columns_of_interest].copy()

        for col in date_columns:
            parsed = pd.to_datetime(df[col], format='%d%m%Y', errors='coerce')
            # errors='coerce' hides bad values, so report how many were lost.
            n_bad = int((df[col].notna() & parsed.isna()).sum())
            if n_bad:
                print(f"Warning: {n_bad} unparseable {col} value(s) in file {filename} were set to NaT")
            df[col] = parsed

        frames.append(df)

    if not frames:
        # pd.concat([]) would raise "No objects to concatenate".
        print(f"Warning: no CSV files found in {directory}")
        empty = pd.DataFrame(columns=columns_of_interest)
        for col in date_columns:
            empty[col] = pd.to_datetime(empty[col])
        return empty

    return pd.concat(frames, ignore_index=True)

In [4]:
# episodes data.
# from nada_load import  load_and_parse_csvs

ep_df = load_and_parse_csvs("./data/in/NSW_CSV/")

In [5]:
len(ep_df)

443

In [6]:
min(ep_df.CommencementDate), max(ep_df.CommencementDate)

(Timestamp('2020-04-07 00:00:00'), Timestamp('2023-07-30 00:00:00'))

In [13]:
df_end_dates  = ep_df[ep_df.EndDate.notna()]

In [15]:
min(df_end_dates.EndDate), max(df_end_dates.EndDate)

(Timestamp('2023-05-25 00:00:00'), Timestamp('2023-07-30 00:00:00'))

In [17]:
ep_df['Program'] = ep_df['ESTABLISHMENT IDENTIFIER'].map(EstablishmentID_Program)

In [18]:
ep_df['Program'].value_counts()

Program
EUROPATH    91
GOLBICE     83
MURMICE     63
SAPPHIRE    56
GOLBGNRL    55
MURMWIO     29
MURMPP      28
MONPATH     21
BEGAPATH    14
MURMHEAD     3
Name: count, dtype: int64

In [14]:
ep_df.PDCSubstanceOfConcern.unique()

array(['Alcohol', 'Cannabinoids', 'Methamphetamine', 'Amphetamine',
       'Heroin', 'Benzodiazepines, nec', 'Nicotine/tobacco', nan,
       'Cannabinoids and Related Drugs, nec', 'Diazepam', 'MDMA/Ecstasy',
       'Alprazolam', 'Oxycodone', 'Amphetamines, nec'], dtype=object)

In [6]:
# Create a new column holding the year-month of each episode's end date.
# The source must be ep_df itself: `df` only exists inside
# load_and_parse_csvs, so referring to it here relied on a stale kernel variable.
ep_df['EndYearMonth'] = ep_df['EndDate'].dt.to_period('M')
ep_df.groupby('EndYearMonth').size()


EndYearMonth
2023-05    17
2023-06    63
2023-07    23
Freq: M, dtype: int64

In [7]:
ep_df['EndDate'].isna().sum()

340

In [None]:
# def match_assessments(episodes_df, atoms_df, pdc_substance_mapping, establishment_program_mapping):
#     # Apply the mapping to the ESTABLISHMENT IDENTIFIER and PDCSubstanceOfConcern columns in episodes_df
#     episodes_df['ESTABLISHMENT IDENTIFIER'] = episodes_df['ESTABLISHMENT IDENTIFIER'].map(establishment_program_mapping)
#     episodes_df['PDCSubstanceOfConcern'] = episodes_df['PDCSubstanceOfConcern'].map(pdc_substance_mapping)

#     # Ensure that AssessmentDate is in datetime format
#     # atoms_df['AssessmentDate'] = pd.to_datetime(atoms_df['AssessmentDate'], errors='coerce')

#     # Merge the dataframes on SLK and Program
#     merged_df = pd.merge(episodes_df, atoms_df, how='inner', left_on=['SLK', 'ESTABLISHMENT IDENTIFIER'], right_on=['SLK', 'Program'])

#     # Filter rows where AssessmentDate falls within CommencementDate and EndDate (or after CommencementDate if EndDate is NaN)
#     matched_df = merged_df.loc[((merged_df['AssessmentDate'] >= merged_df['CommencementDate']) & 
#                                 (merged_df['AssessmentDate'] <= merged_df['EndDate'])) |
#                                ((merged_df['AssessmentDate'] >= merged_df['CommencementDate']) & 
#                                 (merged_df['EndDate'].isna()))]

#     # Check if PDCSubstanceOfConcern matches
#     mismatched_df = matched_df.loc[matched_df['PDCSubstanceOfConcern_x'] != matched_df['PDCSubstanceOfConcern_y']]

#     if len(mismatched_df) > 0:
#         print(f"There are {len(mismatched_df)} rows where PDCSubstanceOfConcern does not match.")
#         print(mismatched_df)

#     return matched_df


In [23]:
# Maps the 'ESTABLISHMENT IDENTIFIER' values of the episode data to program codes.
# NOTE(review): this rebinds the name imported from data_config earlier in the
# notebook - confirm which definition is meant to be authoritative.
EstablishmentID_Program = {
    '820002000': 'TSS',
    # NOTE(review): '82A000004' was listed twice. Python keeps only the last
    # value for a repeated key, so the 'ARCCOCO' entry below never took effect.
    # It is kept as a comment until the correct identifier for ARCCOCO is known.
    # '82A000004': 'ARCCOCO',
    '82A000004': 'ARCRESI',
    '12QQ03076': 'SAPPHIRE',
    '12QQ03062': 'EUROPATH',
    '12QQ03061': 'MONPATH',
    '12QQ03063': 'BEGAPATH',
    '13K034': 'MURMICE',
    '12QQ03022': 'GOLBGNRL',
    '13Q035': 'GOLBICE',
    '12KK03024': 'MURMPP',
    '12KK03025': 'MURMWIO',
    '12KK03023': 'MURMHEAD',
}


In [19]:
def match_assessments(episodes_df, atoms_df,  establishment_program_mapping):
    """Pair each assessment with the episode(s) it falls within.

    Episodes and assessments are inner-joined on client (``SLK``) and program,
    then only rows whose ``AssessmentDate`` lies inside the episode are kept.

    Args:
        episodes_df: Episode rows with 'ESTABLISHMENT IDENTIFIER', 'SLK',
            'CommencementDate' and 'EndDate' columns. It is not modified.
        atoms_df: Assessment rows with 'SLK', 'Program' and 'AssessmentDate'
            columns.
        establishment_program_mapping: Dict translating establishment
            identifiers into the program codes used in ``atoms_df['Program']``.

    Returns:
        The merged rows where the assessment is on or after the commencement
        date and either on or before the end date, or the episode has no end
        date. In the result, 'ESTABLISHMENT IDENTIFIER' holds the mapped
        program code (identifiers missing from the mapping become NaN and
        therefore never match).
    """
    # Map on a copy: overwriting the caller's column in place would make a
    # second call map already-mapped values to NaN and match nothing.
    episodes = episodes_df.assign(**{
        'ESTABLISHMENT IDENTIFIER':
            episodes_df['ESTABLISHMENT IDENTIFIER'].map(establishment_program_mapping)
    })

    # Merge the dataframes on SLK and Program
    merged_df = pd.merge(episodes, atoms_df, how='inner',
                         left_on=['SLK', 'ESTABLISHMENT IDENTIFIER'],
                         right_on=['SLK', 'Program'])

    # Comparisons against NaT are False, so open episodes (no EndDate) need
    # the explicit isna() alternative.
    started = merged_df['AssessmentDate'] >= merged_df['CommencementDate']
    not_ended = (merged_df['AssessmentDate'] <= merged_df['EndDate']) | merged_df['EndDate'].isna()

    matched_df = merged_df.loc[started & not_ended]

    return matched_df

In [26]:
# Global variables
extract_start_date = 20200101
extract_end_date = 20240101

fname = f"{extract_start_date}_{extract_end_date}_1_forNADA" # TODO :forNaada

active_clients_start_date ='2023-04-01' 
active_clients_end_date = '2023-06-30'

results_folder = "./data/out/"


# MIN_NUM_ATOMS_PER_CLIENT = 3
# MIN_NUM_COL_VALUES = 3

### Step 1 & 2: Extract & Process

#### Extract the data - from the database or from a pre-prepared parquet file

1. *Processed data*:
  - if processed-parquet file is not present, *get the raw data* and process it and cache it into the parquet file.
  - if yes, load the data from the parquet file.
  
2. If *Raw data* doesn't exist in the data/in/ folder as a parquet file:
  - load it from the database (Azure)
  - otherwise from the parquet file.
 
 (cache=True => try to load from a parquet file, if not present, load from the database and cache it into a parquet file)

In [27]:
# Extract & Process
# TODO : Exclude step: limit_min_num_assessments
processed_df = extract_prep_data(extract_start_date, extract_end_date
                                 , active_clients_start_date
                                 , active_clients_end_date
                                 , fname)

INFO: No processed data found, loading from raw data.
INFO: No cached data found, loading from DB
fixing PDCMethodOfUse Ingests to Ingest
fixing PDCMethodOfUse Injects to Inject
fixing PDCMethodOfUse Smokes to Smoke
fixing Past4WkDailyLivingImpacted Once or twice a week to Once or twice per week
fixing Past4WkDailyLivingImpacted Three or four times a week to Three or four times per week
fixing DoYouFeelSafeWhereYouLive Often feel unsafe. Occasionally experience violence to Often feel unsafe / Occasionally experience violence
fixing DoYouFeelSafeWhereYouLive Never feel safe. Constantly exposed to violence to Never feel safe / Constantly exposed to violence


In [34]:
len(processed_df)

1820

In [31]:
atom_df  = processed_df


### Limit ATOMs to just the period (quarter) of interest

In [36]:
atom_df = atom_df.loc[ (atom_df.AssessmentDate>= active_clients_start_date ) & (atom_df.AssessmentDate <= active_clients_end_date)]

In [40]:

# Exclude programs that are out of scope for this extract.
# A boolean mask is used instead of drop(..., inplace=True): atom_df is itself a
# filtered slice at this point, and drop() removes by index label, which would
# also remove any other row sharing a label if the index is not unique.
atom_df = atom_df.loc[~atom_df.Program.isin(['TSS', 'ARCA', 'COCO', 'PSYNSW'])].copy()

In [41]:
len(ep_df), len(atom_df) 

(443, 374)

In [42]:
atom_df.Program.value_counts()

Program
EUROPATH          72
MURMICE           71
GOLBICE           57
SAPPHIRE          56
GOLBGNRL          36
MURMPP            28
MURMWIO           22
BEGAPATH          13
ALONGSIDE         12
MONPATH            4
MURMHEAD           3
TSS                0
COCO               0
ARCA               0
PSYNSW             0
SO2LI131219633     0
Name: count, dtype: int64

In [43]:
matched_df = match_assessments(ep_df, atom_df, EstablishmentID_Program)

In [44]:
len(matched_df)

275

In [46]:
matched_df['ESTABLISHMENT IDENTIFIER'].value_counts()

ESTABLISHMENT IDENTIFIER
EUROPATH    54
MURMICE     50
SAPPHIRE    43
GOLBICE     42
GOLBGNRL    30
MURMPP      24
MURMWIO     17
BEGAPATH     9
MONPATH      4
MURMHEAD     2
Name: count, dtype: int64

In [47]:
ep_df.to_csv(f"{results_folder}ep_df.csv", index=False)
atom_df.to_csv(f"{results_folder}atom_df.csv", index=False)
matched_df.to_csv(f"{results_folder}matched_df.csv", index=False)

In [43]:
len(matched_df.SLK.unique())

237

In [30]:
len(matched_df.loc[matched_df.Program == 'SAPPHIRE'])

43

In [38]:
from utils.group_utils import chrono_rank_within_clientgroup

df_q = chrono_rank_within_clientgroup(matched_df)

In [41]:
df_brief = df_q[['SLK', 'AssessmentDate', 'survey_rank']]

In [42]:
df_brief.survey_rank.value_counts()

survey_rank
1.0    237
2.0     35
3.0      3
Name: count, dtype: int64

In [11]:
list(d.PDCSubstanceOrGambling.unique())

['Ethanol',
 'Cannabinoids and Related Drugs, n.f.d.',
 'Methamphetamine',
 'Heroin',
 'Pharmaceutical Opioids, n.f.d.',
 'Caffeine',
 'Amphetamines, n.f.d.',
 'Nicotine',
 'Benzodiazepines, n.f.d.',
 'Cannabinoids',
 'GHB type Drugs and Analogues, n.e.c.',
 'Opioid Antagonists, n.e.c.',
 'Methadone',
 'MDMA/Ecstasy',
 'Gamma-hydroxybutyrate',
 'Cocaine',
 'Zolpidem']

In [10]:
sapphire = d.loc[d.Program == 'SAPPHIRE']

In [11]:
len(sapphire)

28

### Step 3 : Calculate the average score for each client for each stage and for each of the questions of interest.

In [28]:
# Chronologically Rank the Assessments for each client
# df_q = chrono_rank_within_clientgroup(processed_df)  # adds 'survey_rank' column
# g = col_df.groupby('SLK')
# col_df.loc[:,'survey_rank'] = g['AssessmentDate'].rank(method='min')

In [12]:
chosen_surveys = [1, 3 ,6]

In [5]:
from filters import get_filters, apply_filters #, get_outfilename_for_filters

In [13]:
orig_filter1 = {
    'FunderName': 'Coordinaire'
}
orig_filter2 = {
   'Program':['EUROPATH']
}


orig_filter = orig_filter1



filters = get_filters(orig_filter)
# print ("before :" , processed_df.Program.value_counts())

new_df = apply_filters(processed_df, filters)
# print ("After :" , new_df.Program.value_counts())

all_results = get_all_results(new_df, chosen_surveys, filters)

# outfile_name = get_outfilename_for_filters(filters)

write_results_to_files(all_results, f"{results_folder}{fname}.csv")#, orig_filter)

# for results in all_results:
#   title_for_file = results['title'].replace(" ", "_")
#   results_filepath = f"{results_folder}{fname}_{title_for_file}.csv"
#   df = results['data']
#   write_df_to_csv(df, results_filepath)

Wellbeing measures
NRecords For Col(Past4WkPhysicalHealth): 829)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkMentalHealth): 829)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkQualityOfLifeScore): 823)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
Substance Use
NRecords For Col(PDCHowMuchPerOccasion): 739)#, Total:2478, 2020-01-31 00:00:00, 2023-07-06 00:00:00
NRecords For Col(PDCDaysInLast28): 1041)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
Problems in Life Domains
NRecords For Col(Past4WkDailyLivingImpacted): 933)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkHowOftenPhysicalHealthCausedProblems): 934)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkHowOftenMentalHealthCausedProblems): 934)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkUseLedToProblemsWithFamilyFriend): 934)#, Total:2478, 2020-01-07 0

In [6]:
def do_all(df, chosen_surveys, orig_filter, filename:str=""):
    """Filter *df*, compute the results for it and write them to a CSV file.

    The filter spec *orig_filter* is expanded with ``get_filters`` and applied
    with ``apply_filters``; the filtered frame, *chosen_surveys* and the
    expanded filters are then handed to ``get_all_results``. The outcome is
    written with ``write_results_to_files`` to
    ``{results_folder}{filename}.csv`` and also returned.
    """
    expanded_filters = get_filters(orig_filter)
    results = get_all_results(
        apply_filters(df, expanded_filters),
        chosen_surveys,
        expanded_filters,
    )

    # results_folder is the notebook-level output directory.
    out_path = f"{results_folder}{filename}.csv"
    write_results_to_files(results, out_path)

    return results


In [15]:

# orig_filter1 = {'FunderName': 'Coordinaire'}
# all_results = do_all(processed_df, chosen_surveys, orig_filter1, filename=orig_filter1['FunderName'])

# orig_filter1 = {'FunderName': 'NSW Ministry of Health'}
# all_results = do_all(processed_df, chosen_surveys, orig_filter1, filename=orig_filter1['FunderName'])


# orig_filter1 = {'FunderName': 'Murrumbidgee PHN'}
# all_results = do_all(processed_df, chosen_surveys, orig_filter1, filename=orig_filter1['FunderName'])

# orig_filter1 = {'FunderName': 'ACT Health'}
# all_results = do_all(processed_df, chosen_surveys, orig_filter1, filename=orig_filter1['FunderName'])

Wellbeing measures
NRecords For Col(Past4WkPhysicalHealth): 829)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkMentalHealth): 829)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkQualityOfLifeScore): 823)#, Total:2478, 2020-01-23 00:00:00, 2023-07-06 00:00:00
Substance Use
NRecords For Col(PDCHowMuchPerOccasion): 739)#, Total:2478, 2020-01-31 00:00:00, 2023-07-06 00:00:00
NRecords For Col(PDCDaysInLast28): 1041)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
Problems in Life Domains
NRecords For Col(Past4WkDailyLivingImpacted): 933)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkHowOftenPhysicalHealthCausedProblems): 934)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkHowOftenMentalHealthCausedProblems): 934)#, Total:2478, 2020-01-07 00:00:00, 2023-07-06 00:00:00
NRecords For Col(Past4WkUseLedToProblemsWithFamilyFriend): 934)#, Total:2478, 2020-01-07 0

In [10]:


chart , points = get_chart_for_qna_list(question_list, answers_df, title)

chart + points

#### Write results to CSV

In [18]:
import os
from datetime import datetime

title_for_file = title.replace(" ", "_")
results_filepath = f"{results_folder}{fname}_{title_for_file}.csv"

# Stamp this batch of rows so repeated runs can be told apart in the file.
answers_df['ResultsTimestamp'] = datetime.now().replace(microsecond=0)

# The file is opened in append mode, so only write the header row when the
# file is new or empty; otherwise every re-run would insert another header
# line in the middle of the data.
write_header = (not os.path.exists(results_filepath)
                or os.path.getsize(results_filepath) == 0)
answers_df.to_csv(results_filepath, index=False, mode='a', header=write_header)


In [8]:

# chosen_surveys = [1, 3 ,6] 
# answer_list = get_nmeans_for_questions( question_list, processed_df, chosen_surveys)

NRecords For Col(Past4WkHowOftenPhysicalHealthCausedProblems): 931, Total:2434, 2020-01-07 00:00:00, 2023-06-29 00:00:00
Past4WkHowOftenPhysicalHealthCausedProblems,1,125,1.38
Past4WkHowOftenPhysicalHealthCausedProblems,3,125,1.31
Past4WkHowOftenPhysicalHealthCausedProblems,6,125,1.43
NRecords For Col(Past4WkHowOftenMentalHealthCausedProblems): 931, Total:2434, 2020-01-07 00:00:00, 2023-06-29 00:00:00
Past4WkHowOftenMentalHealthCausedProblems,1,125,2.04
Past4WkHowOftenMentalHealthCausedProblems,3,125,1.66
Past4WkHowOftenMentalHealthCausedProblems,6,125,1.72
NRecords For Col(Past4WkUseLedToProblemsWithFamilyFriend): 931, Total:2434, 2020-01-07 00:00:00, 2023-06-29 00:00:00
Past4WkUseLedToProblemsWithFamilyFriend,1,125,0.74
Past4WkUseLedToProblemsWithFamilyFriend,3,125,0.61
Past4WkUseLedToProblemsWithFamilyFriend,6,125,0.44
NRecords For Col(Past4WkDifficultyFindingHousing): 915, Total:2434, 2020-01-07 00:00:00, 2023-06-29 00:00:00
Past4WkDifficultyFindingHousing,1,123,0.24
Past4WkDifficu

In [9]:

title = "Problems in Life Domains"
#'Changes in average scores for "Past 4 weeks: Use led to problems in various Life domains" '
chart , points = get_chart_for_qna_list(question_list, answer_list, chosen_surveys, title)

chart + points

NameError: name 'get_chart_for_qna_list' is not defined

In [25]:
# col_df1[col_df1['survey_rank'] == 1].Past4WkPhysicalHealth.count() #.value_counts(dropna=False)
# len(col_df1[col_df1['survey_rank'] == 1].SLK.unique() )
# len(df_q[df_q['survey_rank'] == 1].SLK.unique() )


528

In [26]:
# col_df1[col_df1['survey_rank'] == 6].Past4WkPhysicalHealth.count()

# len(col_df1[col_df1['survey_rank'] == 6].SLK.unique() )
# len(df_q[df_q['survey_rank'] == 6].SLK.unique() )



138

In [12]:
## client_groups_forcol = col_df.groupby('SLK')
# from graphing import get_chart_for_means

# question_list = [question]
# assessment_tags= chosen_surveys
# means = averages

# # contribs = [first_assess_contribs,fourth_assess_contribs, seventh_assess_contribs ]
# chart = get_chart_for_means(question_list, assessment_tags, means, nth_assessment_contribs)
# chart