# Data Extract and upload for NADAbase

## Steps

1. Extract Episodes for the period from Communicare
2. Extract ATOM data
3. Match Assessments with Episodes
4. Generate the upload Survey.txt file.

In [8]:
# import os

import pandas as pd

# import numpy as np
import mylogger
# from nada_load import load_and_parse_csv
from data_config import EstablishmentID_Program
from utils.df_xtrct_prep import  load_and_parse_csv
from utils.fromstr import convert_format_datestr

logger = mylogger.get(__name__)
# # List of column names in the CSV
# column_names = ['ESTABLISHMENT IDENTIFIER', 'GEOGRAPHICAL LOCATION', 'PMSEpisodeID', 'PMSPersonID', 'DOB', 'DOB STATUS', 'SEX', 'COUNTRY OF BIRTH', 'INDIGENOUS STATUS', 'PREFERRED LANGUAGE', 'SOURCE OF INCOME', 'LIVING ARRANGEMENT', 'USUAL ACCOMMODATION', 'CLIENT TYPE', 'PRINCIPAL DRUG OF CONCERN', 'PDCSubstanceOfConcern', 'ILLICIT USE', 'METHOD OF USE PRINCIPAL DRUG', 'INJECTING DRUG USE', 'SETTING', 'CommencementDate', 'POSTCODE', 'SOURCE OF REFERRAL', 'MAIN SERVICE', 'EndDate', 'END REASON', 'REFERRAL TO ANOTHER SERVICE', 'FAMILY NAME', 'GIVEN NAME', 'MIDDLE NAME', 'TITLE', 'SLK', 'MEDICARE NUMBER', 'PROPERTY NAME', 'UNIT FLAT NUMBER', 'STREET NUMBER', 'STREET NAME', 'SUBURB']




## Episode Data

In [9]:
# Reporting window for active clients, as '%Y-%m-%d' strings.
active_clients_start_date = '2023-07-01'
active_clients_end_date = '2023-12-31'

# The same two dates converted from '%Y-%m-%d' to '%d%m%Y'; they are
# interpolated into the episode extract's file name below.
start_dt, end_dt = (
    convert_format_datestr(iso_date, '%Y-%m-%d', '%d%m%Y')
    for iso_date in (active_clients_start_date, active_clients_end_date)
)

# Episode extract for the window (an earlier run used the name
# 'DATS_Annual_AllPrograms_FY22-23').
ep_source_filename = f'DATS_AllPrograms_{start_dt}-{end_dt}'
ep_datasource_file_path = f"./data/in/NSW_CSV/{ep_source_filename}.csv"

# Folder for generated result files.
results_folder = "./data/out/"

In [10]:
# Columns of the episode extract to keep, named by their raw CSV headers.
columns_of_interest = [
    'ESTABLISHMENT IDENTIFIER',
    'GEOGRAPHICAL LOCATION',
    'EPISODE ID',
    'PERSON ID',
    'SPECIFY DRUG OF CONCERN',
    'PRINCIPAL DRUG OF CONCERN',
    'START DATE',
    'END DATE',
    'SLK',
]

# Raw CSV header -> column name used by the rest of the notebook.
rename_columns = dict([
    ('SPECIFY DRUG OF CONCERN', 'PDCSubstanceOfConcern'),
    ('PRINCIPAL DRUG OF CONCERN', 'PDCCode'),
    ('START DATE', 'CommencementDate'),
    ('END DATE', 'EndDate'),
    ('EPISODE ID', 'PMSEpisodeID'),
    ('PERSON ID', 'PMSPersonID'),
])

In [11]:
# Read the episode extract, keeping/renaming the configured columns and asking
# the loader to treat the two raw date headers as dates.
episode_date_columns = ['START DATE', 'END DATE']
ep_df = load_and_parse_csv(
    ep_datasource_file_path,
    rename_columns,
    columns_of_interest,
    date_cols=episode_date_columns,
)

# Label every episode with a program via the EstablishmentID_Program lookup.
program_by_row = ep_df['ESTABLISHMENT IDENTIFIER'].map(EstablishmentID_Program)
ep_df['Program'] = program_by_row

In [12]:
ep_df.head(10)

Unnamed: 0,ESTABLISHMENT IDENTIFIER,GEOGRAPHICAL LOCATION,PMSEpisodeID,PMSPersonID,PDCCode,PDCSubstanceOfConcern,CommencementDate,EndDate,SLK,Program
0,12QQ03076,10550,1852,170,2101,Alcohol,2021-03-19,2024-01-10,USEUK190519821,SAPPHIRE
1,12QQ03076,10550,23373,143,2101,Alcohol,2021-12-16,2024-01-10,AMOAT241019791,SAPPHIRE
2,12QQ03076,10550,39191,100,3103,Methamphetamine,2022-04-12,2024-01-10,OUDES271119791,SAPPHIRE
3,12QQ03076,10550,48350,5324,2101,Alcohol,2022-08-03,2024-01-10,UCELE131119691,SAPPHIRE
4,12QQ03076,10550,51655,8876,2101,Alcohol,2022-09-08,2023-11-28,RO2AC191120001,SAPPHIRE
5,12QQ03076,10550,51318,7331,2101,Alcohol,2022-09-06,2023-12-20,RAHAR221019911,SAPPHIRE
6,12QQ03076,10550,53719,139,2101,Alcohol,2022-09-15,2024-01-10,OL2IR141219712,SAPPHIRE
7,12QQ03076,10550,54450,5526,2101,Alcohol,2022-10-06,2024-01-10,AYULA290319921,SAPPHIRE
8,12QQ03076,10550,59206,5158,2101,Alcohol,2022-11-25,2024-01-10,MIHAR301119831,SAPPHIRE
9,12QQ03076,10550,61651,242,2101,Alcohol,2022-10-11,2024-01-10,URIYL100819842,SAPPHIRE


In [13]:
len(ep_df), min(ep_df.CommencementDate), max(ep_df.CommencementDate)

(787, datetime.date(2016, 11, 11), datetime.date(2023, 12, 22))

In [14]:
# Derive 'Program' from the establishment identifier, then discard the raw
# identifier column.
# Guarded so the cell is safe to re-run: once the identifier column has been
# dropped, both the lookup and the drop would otherwise raise KeyError.
if 'ESTABLISHMENT IDENTIFIER' in ep_df.columns:
    ep_df['Program'] = ep_df['ESTABLISHMENT IDENTIFIER'].map(EstablishmentID_Program)
    # Non-inplace drop: rebinding the name avoids mutating a frame that an
    # earlier cell may still be displaying.
    ep_df = ep_df.drop(columns=['ESTABLISHMENT IDENTIFIER'])

In [21]:
ep_df.columns

Index(['GEOGRAPHICAL LOCATION', 'PMSEpisodeID', 'PMSPersonID', 'PDCCode',
       'PDCSubstanceOfConcern', 'CommencementDate', 'EndDate', 'SLK',
       'Program'],
      dtype='object')

## ATOM Data


In [2]:
from utils.environment import MyEnvironmentConfig
from utils.df_xtrct_prep import extract_prep_atom_data

In [3]:
# Select the 'prod' environment configuration before extracting.
MyEnvironmentConfig().setup('prod')

# to adjust the ODC parser
extract_start_date = 20230701
extract_end_date = 20231231

# NOTE(review): these two dates are also assigned in the episode config cell;
# the values must be kept identical.
active_clients_start_date = '2023-07-01'
active_clients_end_date = '2023-12-31'

# Suffix shared by the files written for this extract.  TODO :forNaada
fname = f"{extract_start_date}-{extract_end_date}"

processed_df = extract_prep_atom_data(
    extract_start_date,
    extract_end_date,
    active_clients_start_date,
    active_clients_end_date,
    fname,
    purpose='NADA',
)

In [3]:
# processed_df = ['SLK',	'RowKey','ClientType',	'AssessmentDate', PDCSubstanceOrGambling	PDCMethodOfUse	PDCDaysInLast28	PDCUnits	PDCHowMuchPerOccasion	PDCAgeLastUsed	PDCGoals	PDC]
processed_df.columns

Index(['Program', 'RowKey', 'SLK', 'AssessmentType', 'Staff', 'AssessmentDate',
       'SDSIsAODUseOutOfControl', 'SDSDoesMissingFixMakeAnxious',
       'SDSHowMuchDoYouWorryAboutAODUse', 'SDSDoYouWishToStop',
       'SDSHowDifficultToStopOrGoWithout', 'K10Q01', 'K10Q02', 'K10Q03',
       'K10Q04', 'K10Q05', 'K10Q06', 'K10Q07', 'K10Q08', 'K10Q09', 'K10Q10',
       'K10Q11', 'K10Q12', 'K10Q13', 'K10Q14', 'Past4WkNumInjectingDays',
       'Past4WkHadCaregivingResponsibilities', 'Past4WkBeenArrested',
       'Past4WkHaveYouViolenceAbusive', 'Past4WkMentalHealth',
       'Past4WkPhysicalHealth', 'Past4WkQualityOfLifeScore',
       'Alcohol_PerOccassionUse', 'Alcohol_DaysInLast28',
       'Heroin_PerOccassionUse', 'Heroin_DaysInLast28',
       'Other Opioids_PerOccassionUse', 'Other Opioids_DaysInLast28',
       'Cocaine_PerOccassionUse', 'Cocaine_DaysInLast28',
       'Cannabis_PerOccassionUse', 'Cannabis_DaysInLast28',
       'Amphetamines_PerOccassionUse', 'Amphetamines_DaysInLast28',
  

In [4]:
processed_df.loc[processed_df['Alcohol_PerOccassionUse'].notna()][['Alcohol_DaysInLast28' ,  'Alcohol_PerOccassionUse']]


Unnamed: 0,Alcohol_DaysInLast28,Alcohol_PerOccassionUse
448,28,9.0; standard drinks units.
580,20,17.0; standard drinks units.
548,26,17.0; standard drinks units.
472,11,
1116,0,
...,...,...
105,14,
288,10,8.0; standard drinks units.
897,3,3.0; standard drinks units.
817,28,6.0; standard drinks units.


In [7]:
processed_df.K10Q12.value_counts()

K10Q12
18    8
25    3
21    2
1     2
5     2
10    2
15    2
2     2
8     2
13    1
28    1
14    1
3     1
20    1
23    1
Name: count, dtype: int64

In [4]:
# Write the prepared ATOM frame to CSV (index included).
# NOTE(review): an earlier parquet write via utils.io.write_parquet is disabled;
# the error recorded next to it was
#   ArrowTypeError: ("Expected bytes, got a 'int' object", 'Conversion failed for column K10Q12 with type object')
processed_csv_path = f"./data/processed/atom_NADA_{fname}.csv"
processed_df.to_csv(processed_csv_path)

In [16]:
atom_df  = processed_df.copy()
atom_df

Unnamed: 0,Program,RowKey,SLK,AssessmentType,Staff,AssessmentDate,SDSIsAODUseOutOfControl,SDSDoesMissingFixMakeAnxious,SDSHowMuchDoYouWorryAboutAODUse,SDSDoYouWishToStop,...,Another Drug_PerOccassionUse,Another Drug_DaysInLast28,Nicotine_PerOccassionUse,Nicotine_DaysInLast28,Gambling_PerOccassionUse,Gambling_DaysInLast28,PaidWorkDays,StudyDays,PrimaryCaregiver_0-5,PrimaryCaregiver_5-15
448,MONPATH,MONPATH_INAS_20230703,ES2ES111119821,InitialAssessment,Helen.Waite,2023-07-03,2.0,2.0,2.0,2.0,...,,,4.0; cigarettes / darts units.,28,,,10,,,
593,BUTTITOUT,BUTTITOUT_INAS_20230703,IE2ET081119601,InitialAssessment,Augustine.Fordjour,2023-07-03,,,,,...,,,,,,,,,,
580,TSS,TSS_INAS_20230703,ID2IC271019501,InitialAssessment,Mimi.Avila,2023-07-03,2.0,2.0,3.0,2.0,...,,,,,,,,,,
548,BEGAPATH,BEGAPATH_ITSP_20230703,HOAIC190919782,ITSPReview,Liz.Scott,2023-07-03,1.0,3.0,1.0,1.0,...,,,,,,,23,,False,False
472,EUROPATH,EUROPATH_INAS_20230703,EWNIM060119762,InitialAssessment,Paul.Crowe,2023-07-03,0.0,0.0,0.0,0.0,...,,,,,,,,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1104,ALONGSIDE,ALONGSIDE_ITSP_20231222,RIGNN170519632,ITSPReview,Lexxie.Jury,2023-12-22,3.0,2.0,2.0,3.0,...,,,,15,,,0,0,,
817,SAPPHIRE,SAPPHIRE_INAS_20231222,ODEMI271219932,InitialAssessment,Kylie.Wood,2023-12-22,2.0,3.0,3.0,3.0,...,,,3.0; cigarettes / darts units.,28,54.5; $$$ units.,10,0,,False,False
629,TSS,TSS_ITSP_20231222,ILION150419561,ITSPReview,Emiltus.Emmanuel,2023-12-22,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1164,GOLBGNRL,GOLBGNRL_ITSP_20231222,TEARO220820081,ITSPReview,James.Costello,2023-12-22,0.0,1.0,1.0,1.0,...,,,,,,,,,,


In [17]:
# Keep only assessments dated inside the reporting window, both end dates
# included. The bounds are parsed to Timestamps once, and the upper bound is
# moved to the start of the following day: comparing against the bare
# end-date string would cut off at midnight and drop any assessment that
# carries a time-of-day on the final date.
window_start = pd.Timestamp(active_clients_start_date)
window_end_exclusive = pd.Timestamp(active_clients_end_date) + pd.Timedelta(days=1)
atom_df = atom_df.loc[
    (atom_df.AssessmentDate >= window_start)
    & (atom_df.AssessmentDate < window_end_exclusive)
]

In [18]:
atom_df['AssessmentDate'].info()  #describe()

<class 'pandas.core.series.Series'>
Index: 1319 entries, 448 to 799
Series name: AssessmentDate
Non-Null Count  Dtype         
--------------  -----         
1319 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 20.6 KB


In [10]:
# import pandas as pd
# # Ensure that AssessmentDate is in datetime format
# atom_df['AssessmentDate'] = pd.to_datetime(atom_df['AssessmentDate'], errors='coerce')

In [19]:
# Remove assessments that belong to the non-NSW programs listed here.
non_nsw_programs = ['TSS', 'ARCA', 'COCO', 'PSYNSW']
is_non_nsw = atom_df['Program'].isin(non_nsw_programs)
atom_df = atom_df[~is_non_nsw]


In [20]:
len(ep_df), len(atom_df) 

(787, 818)

In [20]:
atom_df.Program.value_counts()

Program
SAPPHIRE     157
MURMICE      136
EUROPATH     127
GOLBICE      114
GOLBGNRL      66
MONPATH       55
MURMWIO       51
MURMPP        33
BUTTITOUT     30
ALONGSIDE     23
BEGAPATH      16
BegaMHHub     10
Name: count, dtype: int64

## NADA matching

In [21]:
# from match_audit import match_assessments

matching_ndays_slack = 7

In [30]:
# def match_assessments(episodes_df, atoms_df): #pdc_substance_mapping
#     # Apply the mapping to the ESTABLISHMENT IDENTIFIER and PDCSubstanceOfConcern columns in episodes_df
#     # episodes_df['ESTABLISHMENT IDENTIFIER'] = episodes_df['ESTABLISHMENT IDENTIFIER'].map(establishment_program_mapping)
#     # episodes_df['PDCSubstanceOfConcern'] = episodes_df['PDCSubstanceOfConcern'].map(pdc_substance_mapping)

#     # Merge the dataframes on SLK and Program
#     merged_df = pd.merge(episodes_df, atoms_df, how='inner', left_on=['SLK', 'Program'], right_on=['SLK', 'Program'])

#     # Filter rows where AssessmentDate falls within CommencementDate and EndDate (or after CommencementDate if EndDate is NaN)
#     matched_df = merged_df.loc[((merged_df['AssessmentDate'] >= merged_df['CommencementDate']) & 
#                                 (merged_df['AssessmentDate'] <= merged_df['EndDate'])) |
#                                ((merged_df['AssessmentDate'] >= merged_df['CommencementDate']) & 
#                                 (merged_df['EndDate'].isna()))]

#     # Check if PDCSubstanceOfConcern matches
#     # mismatched_df = matched_df.loc[matched_df['PDCSubstanceOfConcern_x'] != matched_df['PDCSubstanceOfConcern_y']]

#     # if len(mismatched_df) > 0:
#     #     logger.info(f"There are {len(mismatched_df)} rows where PDCSubstanceOfConcern does not match.")
#     #     logger.info(mismatched_df)

#     return matched_df

In [23]:
def _to_date(value):
    """Return ``value`` as a ``datetime.date``, or ``None`` if it is missing (None/NaN/NaT)."""
    if pd.isna(value):
        return None
    return pd.Timestamp(value).date()


def get_mask_datefit(row, slack_days=7):
    """Tell whether a row's assessment date falls inside its episode, with slack.

    The assessment fits when it is no earlier than ``CommencementDate`` minus
    ``slack_days`` and no later than ``EndDate`` plus ``slack_days``. A missing
    ``EndDate`` (None/NaN/NaT) is treated as an episode that is still open, so
    only the lower bound is checked in that case.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'AssessmentDate', 'CommencementDate' and 'EndDate'. Each value may
            be a ``datetime.date``, a datetime/Timestamp, or missing.
        slack_days: Number of days of tolerance applied to both bounds.

    Returns:
        bool: True if the assessment date fits the (slack-widened) episode.
        False if it does not, or if the assessment or commencement date is
        missing.
    """
    # Normalise everything to plain dates so that Timestamp and date values can
    # be compared with each other.
    assessment_date = _to_date(row['AssessmentDate'])
    commencement_date = _to_date(row['CommencementDate'])
    end_date = _to_date(row['EndDate'])

    # Without both of these there is nothing meaningful to compare.
    if assessment_date is None or commencement_date is None:
        return False

    slack_td = pd.Timedelta(days=slack_days)

    if assessment_date < commencement_date - slack_td:
        return False

    # Open episode: previously NaT arithmetic here either raised or never matched.
    if end_date is None:
        return True

    return assessment_date <= end_date + slack_td


def match_assessments(episodes_df, atoms_df, matching_ndays_slack: int):
    """Pair assessments with episodes that share the same SLK and Program.

    The two frames are inner-joined on ``SLK`` and ``Program``; every joined
    row is then passed to ``get_mask_datefit`` and kept only if that check
    returns a truthy value.

    Args:
        episodes_df: Episode frame; must contain 'SLK' and 'Program' plus the
            date columns ``get_mask_datefit`` reads.
        atoms_df: Assessment frame; must contain 'SLK' and 'Program' plus the
            assessment date column ``get_mask_datefit`` reads.
        matching_ndays_slack: Slack, in days, forwarded to
            ``get_mask_datefit`` as ``slack_days``.

    Returns:
        The subset of joined rows that passed the date check.
    """
    join_keys = ['SLK', 'Program']
    candidate_pairs = pd.merge(episodes_df, atoms_df, how='inner', on=join_keys)

    # Row-wise date check; yields one boolean per candidate pair.
    fits_episode = candidate_pairs.apply(
        get_mask_datefit, axis=1, slack_days=matching_ndays_slack
    )

    # NOTE(review): an earlier draft also logged rows whose
    # PDCSubstanceOfConcern differed between the two sources; that check is
    # not performed here.
    return candidate_pairs[fits_episode]

In [24]:

matched_df = match_assessments(ep_df, atom_df, matching_ndays_slack)

In [25]:
# matched_df[['SLK', 'CommencementDate', 'AssessmentDate', 'EndDate']].head(30)
matched_df

Unnamed: 0,GEOGRAPHICAL LOCATION,PMSEpisodeID,PMSPersonID,PDCCode,PDCSubstanceOfConcern,CommencementDate,EndDate,SLK,Program,RowKey,...,Another Drug_PerOccassionUse,Another Drug_DaysInLast28,Nicotine_PerOccassionUse,Nicotine_DaysInLast28,Gambling_PerOccassionUse,Gambling_DaysInLast28,PaidWorkDays,StudyDays,PrimaryCaregiver_0-5,PrimaryCaregiver_5-15
0,10550,1852,170,2101,Alcohol,2021-03-19,2024-01-10,USEUK190519821,SAPPHIRE,SAPPHIRE_ITSP_20230801,...,,,,,,,1,0,False,True
1,10550,51655,8876,2101,Alcohol,2022-09-08,2023-11-28,RO2AC191120001,SAPPHIRE,SAPPHIRE_ITSP_20230809,...,,,12.0; cigarettes / darts units.,28,,,0,,,
2,10550,51655,8876,2101,Alcohol,2022-09-08,2023-11-28,RO2AC191120001,SAPPHIRE,SAPPHIRE_ITSP_20230814,...,,,12.0; cigarettes / darts units.,28,,,0,,,
3,10550,51655,8876,2101,Alcohol,2022-09-08,2023-11-28,RO2AC191120001,SAPPHIRE,SAPPHIRE_ITSP_20231011,...,,,12.0; cigarettes / darts units.,28,,,13,,,
4,10550,51318,7331,2101,Alcohol,2022-09-06,2023-12-20,RAHAR221019911,SAPPHIRE,SAPPHIRE_INAS_20230809,...,,,,0,,,6,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,17751,84006,9752,2101,Alcohol,2023-08-21,2024-01-10,OLYES140519952,MURMPP,MURMPP_INAS_20230821,...,,,5.0; cigarettes / darts units.,28,,,20,,,
757,17751,89462,9893,3103,Methamphetamine,2023-10-03,2023-12-13,HOPOB040719811,MURMPP,MURMPP_INAS_20231003,...,,,,,,,,,False,False
758,17751,93785,9322,3103,Methamphetamine,2023-11-07,2024-01-10,RAFRA030519901,MURMPP,MURMPP_INAS_20231107,...,,,12.0; cigarettes / darts units.,28,,,20,,,
759,17751,97444,2183,7101,Cannabinoids,2023-12-07,2024-01-10,AR2EL091019852,MURMPP,MURMPP_INAS_20231207,...,,,,,,,,,,


In [39]:
# PDCSubstanceOrGambling atom_df.columns
# ep_df.columns # PDCSubstanceOfConcern # PDCCode <- from episode

# matched_df.PDC <- from ATOM
# matched_df.PDCSubstanceOfConcern <- from Episode

0              Alcohol
1              Alcohol
2              Alcohol
3              Alcohol
4              Alcohol
            ...       
738            Alcohol
739    Methamphetamine
740    Methamphetamine
741       Cannabinoids
742    Methamphetamine
Name: PDCSubstanceOfConcern, Length: 690, dtype: object

In [26]:
# list(matched_df.PDC.unique())
# list(ep_df.PDCSubstanceOfConcern.unique())
ep_df.PDCSubstanceOfConcern.value_counts()

PDCSubstanceOfConcern
Alcohol                                331
Methamphetamine                        275
Cannabinoids                           102
Cannabinoids and Related Drugs, nec     16
Nicotine/tobacco                         6
Heroin                                   6
Amphetamine                              5
Benzodiazepines, nec                     5
Cocaine                                  3
MDMA/Ecstasy                             3
Pharmaceutical opioids, nfd              2
Other Volatile Solvents, nec             2
Gambling                                 2
Opioid Antagonists, nec                  1
Dexamphetamine                           1
Alprazolam                               1
Cannabinoid agonists                     1
Buprenorphine                            1
Name: count, dtype: int64

In [46]:
# import numpy as np

# conditions = [
#     # matched_df['PDCSubstanceOfConcern'].isin(['Ethanol', 'Alcohols, n.e.c.']),
#      matched_df['PDCSubstanceOfConcern'].isin(['Alcohol']),
#     matched_df['PDCCode'].between(7100, 7199),
#     matched_df['PDCCode'].between(3100, 3199),
#     matched_df['PDCSubstanceOfConcern'].isin(['Pharmaceutical Opioids, n.f.d.']),
#     matched_df['PDCCode'].between(2400, 2499),
#     matched_df['PDCCode'] == 1202,
#     matched_df['PDCCode'].between(1100, 1399),
#     ~matched_df['PDCSubstanceOfConcern'].isin(['Nicotine', 'Cocaine'])
# ]
# # 'Alcohol', 'Cannabinoids and Related Drugs, nec', 'Heroin',
# #        'Cannabinoids', 'Benzodiazepines, nec', 'Methamphetamine',
# #        'Nicotine/tobacco', 'Cocaine', nan, 'Dexamphetamine',
# #        'Amphetamine', 'Other Volatile Solvents, nec', 'MDMA/Ecstasy',
# #        'Gambling', 'Alprazolam', 'Cannabinoid agonists', 'Buprenorphine'

# choices = [
#     'Alcohol',
#     'Cannabis',
#     'Amphetamines',
#     'Other Opioid Use',
#     'Tranquilisers',
#     'Heroin',
#     'Other Opioid Use',
#     'Another drug use'
# ]

# matched_df['PDCFinal'] = np.select(conditions, choices, default=matched_df['PDCSubstanceOfConcern'])

In [27]:
# An assessment is identified here by (SLK, RowKey) and an episode by
# PMSEpisodeID. Count the distinct episodes each assessment was matched to,
# then keep the assessments that were matched to more than one.
grouped = matched_df.groupby(['SLK', 'RowKey'])['PMSEpisodeID'].nunique()
duplicates = grouped.loc[grouped.gt(1)]


In [28]:
duplicates

SLK             RowKey                
AHYAT141219641  SAPPHIRE_ITSP_20231128    2
Name: PMSEpisodeID, dtype: int64

In [32]:
# SLK + Program
onedupe = matched_df.loc[(matched_df.SLK =='AHYAT141219641') & (matched_df.Program == 'SAPPHIRE')]

In [29]:
onedupe [['Program', 'SLK', 'CommencementDate', 'AssessmentDate', 'EndDate', 'PMSEpisodeID', 'RowKey']]

NameError: name 'onedupe' is not defined

In [34]:

# (SLK, RowKey) pairs held in `duplicates`, i.e. assessments matched to more
# than one episode.
duplicate_keys = duplicates.index

# Select every matched row whose (SLK, RowKey) pair is one of those keys.
keyed_matches = matched_df.set_index(['SLK', 'RowKey'])
is_duplicate_row = keyed_matches.index.isin(duplicate_keys)
duplicate_rows_df = matched_df[is_duplicate_row]

# Show all rows in the resulting DataFrame
# pd.set_option('display.max_rows', None)  # Set this to display all rows



In [35]:
duplicate_rows_df[['SLK','RowKey',  'Program'  ,  'CommencementDate',   'EndDate' , 'PMSEpisodeID', 'AssessmentDate' ]]

Unnamed: 0,SLK,RowKey,Program,CommencementDate,EndDate,PMSEpisodeID,AssessmentDate
65,AHYAT141219641,SAPPHIRE_ITSP_20231128,SAPPHIRE,2023-07-11,2023-11-27,79760,2023-11-28
68,AHYAT141219641,SAPPHIRE_ITSP_20231128,SAPPHIRE,2023-11-28,2024-01-08,99200,2023-11-28


In [30]:
matched_df['Program'].value_counts()

Program
SAPPHIRE    146
MURMICE     125
EUROPATH    124
GOLBICE     112
GOLBGNRL     64
MONPATH      54
MURMWIO      45
MURMPP       30
BEGAPATH      9
Name: count, dtype: int64

In [43]:
# matched_df.to_csv(f"{results_folder}matched_df.csv", index=False)

In [31]:
matched_df.PMSEpisodeID.unique().shape

(510,)

In [37]:
atom_df[ (atom_df.AssessmentDate >= active_clients_start_date ) & (atom_df.AssessmentDate <= active_clients_end_date)].count()

Program                                                                            818
RowKey                                                                             818
SLK                                                                                818
AssessmentType                                                                     807
Staff                                                                              818
AssessmentDate                                                                     818
SDSIsAODUseOutOfControl                                                            756
SDSDoesMissingFixMakeAnxious                                                       756
SDSHowMuchDoYouWorryAboutAODUse                                                    756
SDSDoYouWishToStop                                                                 756
SDSHowDifficultToStopOrGoWithout                                                   756
K10Q01                                     

In [32]:
len(atom_df[ (atom_df.AssessmentDate >= active_clients_start_date ) & (atom_df.AssessmentDate <= active_clients_end_date)])

818

In [35]:
# atom_df[atom_df['SLK'] =='OLOAC010820061']['Nicotine_PerOccassionUse']

868    nan; cigarettes / darts units.
869    nan; cigarettes / darts units.
Name: Nicotine_PerOccassionUse, dtype: object

In [33]:
len(matched_df.PMSEpisodeID.unique())

510

In [40]:
len(ep_df)


1302

In [34]:
matched_df.to_csv('./data/out/NADA_Matched_20230701-2023-12-31.csv')

In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('./data/out/NADA_Matched_20230701-2023-12-31.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'GEOGRAPHICAL LOCATION', 'PMSEpisodeID', 'PMSPersonID',
       'PDCCode', 'PDCSubstanceOfConcern', 'CommencementDate', 'EndDate',
       'SLK', 'Program', 'RowKey', 'AssessmentType', 'Staff', 'AssessmentDate',
       'SDSIsAODUseOutOfControl', 'SDSDoesMissingFixMakeAnxious',
       'SDSHowMuchDoYouWorryAboutAODUse', 'SDSDoYouWishToStop',
       'SDSHowDifficultToStopOrGoWithout', 'K10Q01', 'K10Q02', 'K10Q03',
       'K10Q04', 'K10Q05', 'K10Q06', 'K10Q07', 'K10Q08', 'K10Q09', 'K10Q10',
       'K10Q11', 'K10Q12', 'K10Q13', 'K10Q14', 'Alcohol_PerOccassionUse',
       'Alcohol_DaysInLast28', 'Cannabis_PerOccassionUse',
       'Cannabis_DaysInLast28', 'Heroin_PerOccassionUse',
       'Heroin_DaysInLast28', 'Other Opioids_PerOccassionUse',
       'Other Opioids_DaysInLast28', 'Cocaine_PerOccassionUse',
       'Cocaine_DaysInLast28', 'Amphetamines_PerOccassionUse',
       'Amphetamines_DaysInLast28', 'Tranquilliser_PerOccassionUse',
       'Tranquilliser_DaysInLast28', 'A

In [44]:
atom_df

Unnamed: 0,SLK,AssessmentDate,Program,Staff,RowKey,PDC
0,ABCDT210719812,2023-10-23,BegaMHHub,Aftab.Jalal,BegaMHHub_INAS_20231023,"Benzodiazepines, n.f.d."
6,ACRAN280320011,2023-11-21,MURMWIO,Craig.Maxwell,MURMWIO_INAS_20231121,"Cannabinoids and Related Drugs, n.f.d."
11,ADIRO150619781,2023-10-23,GOLBICE,Simona.Angeli,GOLBICE_INAS_20231023,"Cannabinoids and Related Drugs, n.f.d."
16,AGIAN010119792,2023-10-05,SAPPHIRE,Molly.Reynolds,SAPPHIRE_ITSP_20231005,Ethanol
21,AHYAT141219641,2023-11-28,SAPPHIRE,Michelle.Wilkie,SAPPHIRE_ITSP_20231128,Ethanol
...,...,...,...,...,...,...
1115,YKSLA240819972,2023-11-17,ALONGSIDE,Lexxie.Jury,ALONGSIDE_ITSP_20231117,Ethanol
1118,YOBLI011019882,2023-10-30,GOLBICE,Anthony.Williams,GOLBICE_INAS_20231030,Methamphetamine
1124,YREAT240819911,2023-11-13,MURMWIO,Michele.Lumb,MURMWIO_INAS_20231113,Methamphetamine
1125,YREMY040119792,2023-10-06,MURMICE,Tina.Raccanello,MURMICE_INAS_20231006,Ethanol


In [44]:
# matched_df['Past4WkEngagedInOtheractivities.Paid Work'][0:2].apply(lambda x : x['Days'] if 'Days' in x)

0    1
1    0
Name: Past4WkEngagedInOtheractivities.Paid Work, dtype: object