# Population-level trends for Client Outcome Measures

## Comparing average scores for a particular set of ATOM survey questions: first stage vs. any later stage

## Steps:

1. Extract the data - from the database or from a pre-prepared parquet file
2. Processing: if not pulling from pre-cleaned data: Clean & Transform (incl. categorize) the data
    - Clean data - remove rows where `PDCSubstanceOrGambling` is missing.
    - Transform data - expand PDC, determine Program, categorize fields, drop Notes/Comments fields, rename PartitionKey to SLK.
    - Limit the data to the period of interest - i.e. only clients who have completed at least one survey during the period of interest.
    - Limit to clients who have completed the survey at least three times (min-stage: 3).
3. Calculate the average score for each client for each stage and for each of the questions of interest.


In [1]:
# Step 0: Importing the libraries

from utils.df_ops import read_parquet

In [2]:
# Notebook-wide settings

# Thresholds. MIN_NUM_ATOMS_PER_CLIENT is handed to limit_min_num_assessments
# when the processed dataset is built.
# NOTE(review): MIN_NUM_COL_VALUES is not referenced by any cell visible here - confirm it is still needed.
MIN_NUM_ATOMS_PER_CLIENT = 3
MIN_NUM_COL_VALUES = 3

# Extraction window passed to get_data; the values look like YYYYMMDD integers.
# The pair also forms the stem of the raw and processed parquet file names.
extract_start_date = 20200101
extract_end_date = 20240101
fname = f"{extract_start_date}_{extract_end_date}"

# Period in which a client must have been active to be kept in the analysis
active_clients_start_date = '2022-07-01'
active_clients_end_date = '2023-06-30'

# Location of the cached, fully processed dataset
processed_filepath = f"./data/processed/processed_all_cols_{fname}.parquet"


### Step 1 & 2: Extract & Process

In [3]:
# Step 1: Load the dataset

## Try the cached, already-processed parquet file first
processed_df = read_parquet(processed_filepath)

if processed_df is None or processed_df.empty:
  # Nothing usable in the cache: fetch the raw extract and rebuild the processed data.
  print("INFO: No processed data found, loading from DB")
  from utils.io import get_data
  raw_df = get_data(
    extract_start_date,
    extract_end_date,
    f"./data/in/{fname}.parquet",
    cache=True,
  )
  if raw_df is None or raw_df.empty:
    print("ERROR: No data found")
    exit(1)

  from data_prep import prep_dataframe, limit_min_num_assessments, limit_clients_active_inperiod

  # Step 2: Clean and transform the dataset
  # prep_dataframe applies a single filter: PDCSubstanceOrGambling has to have a value
  processed_df = prep_dataframe(raw_df)

  # Keep only clients that were active in the period of interest ...
  processed_df = limit_clients_active_inperiod(
    processed_df, active_clients_start_date, active_clients_end_date
  )
  # ... and, of those, only clients with at least MIN_NUM_ATOMS_PER_CLIENT assessments.
  processed_df = limit_min_num_assessments(processed_df, MIN_NUM_ATOMS_PER_CLIENT)

  # Cache the processed data so the next run can skip the rebuild.
  # NOTE(review): the cache file name depends only on the extract dates, not on the
  # active-period or minimum-assessment settings - confirm a stale cache cannot be picked up.
  processed_df.to_parquet(processed_filepath)


In [4]:
# Number of rows in the processed dataset (loaded from the cache or rebuilt)
len(processed_df)

2434

### Step 3 : Calculate the average score for each client for each stage and for each of the questions of interest.

In [5]:
from utils.group_utils import getrecs_w_min_numvals_forcol, chrono_rank_within_clientgroup

def get_mean_xcontribs_of_nth_assessment_for_question(df, nth, question):
  """Return the mean answer to `question` over all nth assessments, and their count.

  Args:
    df: DataFrame with a 'survey_rank' column and a `question` column.
    nth: Value of 'survey_rank' selecting which assessment of each client to use.
    question: Name of the column to average.

  Returns:
    A tuple (mean_rounded, count): the mean rounded to 2 decimals, and the number
    of rows whose 'survey_rank' equals `nth`. The count includes rows whose answer
    is missing. The mean is NaN when there is no non-missing answer.

  For a categorical column the mean is taken over the integer category codes,
  so it is only meaningful when the category order reflects the answer scale.
  """
  nth_surveys = df[df['survey_rank'] == nth]
  answers = nth_surveys[question]

  if answers.dtype.name == 'category':
    codes = answers.cat.codes
    # pandas encodes a missing categorical value as code -1; averaging that
    # sentinel would drag the mean down, so leave those rows out, mirroring
    # how the numeric branch's mean() skips NaN.
    values = codes[codes >= 0]
  else:
    values = answers

  return round(values.mean(), 2), len(nth_surveys)

In [6]:
# Survey question (column) under analysis; 'SDS_Score' was noted as an alternative
col = 'Past4WkHowOftenPhysicalHealthCausedProblems'

# Marks the chosen column as categorical.
# NOTE(review): no cell visible here reads this flag - confirm it is still used.
is_categorical = True

In [7]:

# Restrict the processed data to the records selected by the helper for `col`.
# NOTE(review): no minimum-count argument is passed, so the helper presumably
# applies its own default - confirm in utils.group_utils.
col_df = getrecs_w_min_numvals_forcol(processed_df, col)

# Rows kept for this question, rows in the processed data, and the first and
# last assessment dates in the processed data
(
  len(col_df),
  len(processed_df),
  min(processed_df['AssessmentDate']),
  max(processed_df['AssessmentDate']),
)

(2256,
 2434,
 Timestamp('2020-01-02 00:00:00'),
 Timestamp('2023-06-30 00:00:00'))

In [10]:
# Chronologically rank the assessments for each client.
# NOTE(review): the per-stage mean calculation filters on a 'survey_rank' column,
# so this helper is presumably what adds it - confirm in utils.group_utils.
col_df = chrono_rank_within_clientgroup(col_df)
# Earlier inline version, kept for reference (presumably what the helper replaced - verify):
# g = col_df.groupby('SLK')
# col_df.loc[:,'survey_rank'] = g['AssessmentDate'].rank(method='min')

In [11]:
# Mean answer and number of contributing assessments at each client's
# first, fourth and seventh assessment
(
  (mean1, first_assess_contribs),
  (mean4, fourth_assess_contribs),
  (mean7, seventh_assess_contribs),
) = (
  get_mean_xcontribs_of_nth_assessment_for_question(col_df, stage, col)
  for stage in (1, 4, 7)
)

# TODO: Clients with no change : treat as outliers and remove ?
# TODO: Clients with only zeros ?

# Display the (mean, count) pairs in stage order
mean1, first_assess_contribs, mean4, fourth_assess_contribs, mean7, seventh_assess_contribs

(1.42, 488, 1.4, 299, 1.39, 70)

In [12]:
from graphing import get_chart_for_means

# One (label, mean, contributor count) triple per stage, transposed into the
# three parallel lists that get_chart_for_means takes
assessment_tags, means, contribs = map(list, zip(
  ('First', mean1, first_assess_contribs),
  ('Fourth', mean4, fourth_assess_contribs),
  ('Seventh', mean7, seventh_assess_contribs),
))

chart = get_chart_for_means(assessment_tags, means, contribs)
chart