# Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
import boto3
import datetime
import math
from collections import Counter
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

# Constants

In [None]:
# State being processed; also used as the prefix of every output path
STATE = "***"

# Input locations: one list of files per source table
BASE_PATH = "***"
ELG_PATHS = [f"{BASE_PATH}***"]
IND_PATHS = [f"{BASE_PATH}***"]
PROVIDER_PATHS = [f"{BASE_PATH}***"]
INDC_PATHS = [f"{BASE_PATH}***"]

RX_PATHS = [f"{BASE_PATH}***"]
CLAIM_COMMON_PATHS = [f"{BASE_PATH}***"]
INSTITUTIONAL_PATHS = [f"{BASE_PATH}***"]
DX_PATHS = [f"{BASE_PATH}***.txt"]

# Observation period and buffers (buffers are expressed in days, 30 per "month")
START_OF_RECORDS = datetime.datetime(2015, 1, 1)
END_OF_RECORDS = datetime.datetime(2019, 9, 18)
BACK_BUFFER = datetime.timedelta(days=0 * 30)     # currently zero days
FORWARD_BUFFER = datetime.timedelta(days=9 * 30)
ACUTE_BUFFER = datetime.timedelta(days=3 * 30)

# Columns kept from each source table
RX_VARIABLES = [
    'CLAIM_ID', 'DAYS_SUPPLY_NUM', 'DISPENSE_QTY_AMT', 'FILL_NUM',
    'PRODUCT_CD', 'RX_NUM', 'MA_BILLED_AMT', 'MA_PAID_AMT',
    'RX_WRITTEN_DT', 'ICN_NUM', 'REFILL_AUTH_AMT',
]
INSTITUTIONAL_VARIABLES = [
    'CLAIM_ID', 'ADMIT_DIAG_CD', 'ADMIT_DT', 'ADMIT_TYPE_CD',
    'BILL_TYPE_CD', 'DISCHARG_DT', 'PROCEDURE_CD', 'PTNT_STATUS_CD',
    'CLAIM_DOS_YR', 'ELIG_IP_DAYS_NUM', 'HOSPITAL_ADMIT_CD', 'ADMIT_HR',
    'ADMIT_SOURCE_CD', 'PRIN_DIAG_CODE', 'ICN_NUM', 'SURGERY_DT',
    'TRAUMA_IND',
]
DX_VARIABLES = [
    'CLAIM_ID', 'SEQ_NUM', 'DIAG_CD', 'DIAG_QUALIFY_CD',
    'CLAIM_DOS_YR', 'DIAG_TYPE_CD', 'ICN_NUM',
]
IND_VARIABLES = [
    'INDV_ID', 'MA_NUM', 'DOB_DT', 'MA_START_DT', 'MA_STOP_DT',
    'GENCD_RF', 'EGPCD_RF', 'AIDCAT_RF', 'LNGCD_RF',
]
INDC_VARIABLES = ['INDV_ID', 'CITY_TXT', 'ZIP_CD']
PROVIDER_VARIABLES = [
    'CLIENT_CD', 'PROVIDER_ID', 'MA_PROVIDER_ID', 'PRVTP_RF', 'SPTCD_RF',
    'PROVIDER_ORG_NM', 'ORIG_PROV_SPEC_CD', 'ORIG_PROV_TYPE_CD', 'NPI_NUM',
    'ORIG_PROVIDER_ID', 'PROVIDER_FILE_TYPE',
]
COMMON_VARIABLES = [
    'SRVC_PROVIDER_NPI_ID', 'CLAIM_ID', 'ICN_NUM', 'ADJUST_IND',
    'ADJUST_REASON_CD', 'CLAIM_FROM_DT', 'CLAIM_TYPE_CD', 'MA_BILLED_AMT',
    'MA_NUM', 'MA_PAID_AMT', 'PTNT_DOB_DT', 'PTNT_GENDER_CD',
    'REF_PROVIDER_ID', 'SRVC_PROVIDER_ID', 'ORIG_SRVC_PROV_TYPE_CD',
    'SRVC_PROV_TYPE_CD', 'PTNT_PREG_IND', 'POLICY_NUM', 'PTNT_COPAY_AMT',
    'CLINICAL_SIGNIF_CD', 'COUNTY_CD', 'COVERED_AMT', 'DEDUCTABLE_AMT',
    'PLACE_OF_SRVC_CD', 'AID_CATEGORY_CD', 'TRANS_STATUS_CD',
]

In [None]:
 try:
    opioid_naive_info_known
except NameError:
    pickle_in = open(STATE + "/opioid_naive_info_known","rb")
    opioid_naive_info_known = pickle.load(pickle_in)
    

common_filter = pd.read_csv(STATE + "/common_filter.csv", usecols = ['CLAIM_ID','CLAIM_TYPE_CD','SRVC_PROVIDER_ID','ORIG_SRVC_PROV_TYPE_CD','PTNT_PREG_IND','COUNTY_CD','PLACE_OF_SRVC_CD','OUTCOME','MA_NUM','CLAIM_FROM_DT'])
common_filter.CLAIM_FROM_DT = common_filter.CLAIM_FROM_DT.map(lambda x: datetime.datetime.fromisoformat(x))


#Filter for prescriptions that are before the rx date and after the 6 months before rx date
common_filter_pre_period = common_filter[([x[0][2] for x in common_filter.MA_NUM.map(opioid_naive_info_known)] > common_filter.CLAIM_FROM_DT)
                                & ([x[0][2]  - BACK_BUFFER for x in common_filter.MA_NUM.map(opioid_naive_info_known)] < common_filter.CLAIM_FROM_DT)]
common_filter_pre_period = common_filter_pre_period.loc[common_filter_pre_period.MA_NUM.isin(opioid_naive_info_known.keys())]

del common_filter

#### NDC Codes

In [None]:
# Load both reference tables as raw NumPy matrices:
#  - opioid drugs and their metadata, as determined by the CDC
#  - opioid drugs with less abuse potential (abuse-deterrent), as determined by the CDC
code_reference = pd.read_csv("~/Resources/CDC_Opioids.csv").values
abuse_deter_reference = pd.read_csv("Resources/abuse_deterent.csv").values

# The column at index 1 of each table holds the NDC codes
numeric_opioid_ndc = code_reference[:, 1]
numeric_deter_ndc = abuse_deter_reference[:, 1]

# Every NDC code that counts as an opioid for the rest of the notebook
opioid_numeric_total = set(numeric_opioid_ndc) | set(numeric_deter_ndc)
len(opioid_numeric_total)

First, we need to find all opioid prescriptions in the RX claims table. Ideally, we could then identify all individuals in the records with an opioid prescription history (Hx). However, there is no individual identifier as of now, so the process is broken down into three steps: 
1. Identify all claims
2. Map out claims to individual identifier (MA_NUM as of now)
3. Filter tables based on these individual identifiers

# 0. Identify all RX claims

Go through the RX table and record all claims that have an NDC code for an opioid

In [None]:
# Collect the CLAIM_ID of every RX row whose product code is an opioid NDC.
opioid_rx_claims = set()
CHUNKSIZE = 100000
est_rx_processed = 0

for RX_PATH in RX_PATHS:
    for gm_chunk in pd.read_csv(RX_PATH, sep = '~', chunksize = CHUNKSIZE, usecols = ['PRODUCT_CD','CLAIM_ID'],
                                dtype = {'PRODUCT_CD':str, 'CLAIM_ID':str}):
        # Strip every non-digit before converting the product code to a number.
        # regex=True is required: on pandas >= 2.0 the pattern is otherwise
        # treated as the literal text "\D" and nothing is removed.
        numeric_product_cd = pd.to_numeric(
            gm_chunk.PRODUCT_CD.str.replace(r'\D', '', regex = True),
            errors = 'coerce',
        )
        opioid_rx_claims.update(gm_chunk.loc[numeric_product_cd.isin(opioid_numeric_total), 'CLAIM_ID'])
        # Count the rows actually read; the last chunk is usually shorter than CHUNKSIZE.
        est_rx_processed += len(gm_chunk)

print("Rows processed: {}".format(est_rx_processed))
print("Opioid Claims identified: {}".format(len(opioid_rx_claims)))

# Persist the set so later steps can reload it in a fresh session.
with open(STATE + "/opioid_rx_claims", "wb") as opioid_rx_claims_file:
    pickle.dump(opioid_rx_claims, opioid_rx_claims_file)

# 1. Identify all MA's

In [None]:
# Reuse the in-memory set of opioid claim ids, or reload the pickled copy.
try:
    opioid_rx_claims
except NameError:
    with open(STATE + "/opioid_rx_claims", "rb") as pickle_in:
        opioid_rx_claims = pickle.load(pickle_in)


CHUNKSIZE = 1000000
chunks_processed = 0
rows_processed = 0
rx_mas_columns = ['MA_NUM','CLAIM_ID','CLAIM_FROM_DT']

# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = rx_mas_columns).to_csv(STATE + '/opioid_rx_mas.csv', index = False)

for CLAIM_COMMON_PATH in CLAIM_COMMON_PATHS:
    for gm_chunk in pd.read_csv(CLAIM_COMMON_PATH, sep = '~', chunksize = CHUNKSIZE,
                                usecols = rx_mas_columns,
                                dtype={'MA_NUM': str, 'CLAIM_ID':str, 'CLAIM_FROM_DT':str}):
        # Keep only the claims previously identified as opioid prescriptions.
        gm_chunk.loc[gm_chunk.CLAIM_ID.isin(opioid_rx_claims)].to_csv(
            STATE + '/opioid_rx_mas.csv', columns = rx_mas_columns, mode='a', index = False, header = False)

        chunks_processed += 1
        rows_processed += len(gm_chunk)
        if chunks_processed % 100 == 0:
            print("Done with {} chunks".format(chunks_processed))

# Report rows, not chunks: the label says "Rows".
print("Rows processed: {}".format(rows_processed))


In [None]:
# MA_NUM is read as text so it keeps matching the string identifiers used by
# the later steps instead of being inferred as a number.
opioid_rx_mas = pd.read_csv(STATE + "/opioid_rx_mas.csv", dtype = {'MA_NUM': str})
# One row per opioid claim; an individual may appear on several rows.
print("Total opioid claim rows: {}".format(len(opioid_rx_mas)))
print("Total opioid hx individuals: {}".format(opioid_rx_mas.MA_NUM.nunique()))

# 2 - Map MAs to list of opioid claims

In [None]:
# Reuse the in-memory table, or reload it from the CSV written by step 1.
try:
    opioid_rx_mas
except NameError:
    # MA_NUM as text: the eligibility step matches these keys against
    # POLICY_NUM values that are read as strings.
    opioid_rx_mas = pd.read_csv(STATE + "/opioid_rx_mas.csv", dtype = {'MA_NUM': str})

# MA_NUM -> list of (CLAIM_ID, CLAIM_FROM_DT) for every opioid claim of that MA.
mas_to_opioid_rx = {}
for row in opioid_rx_mas.itertuples(index = False):
    mas_to_opioid_rx.setdefault(row.MA_NUM, []).append((row.CLAIM_ID, row.CLAIM_FROM_DT))

# Order each individual's claims by claim date so that element 0 is the
# earliest opioid prescription.
for ma_claims in mas_to_opioid_rx.values():
    ma_claims.sort(key = lambda tup: tup[1])

with open(STATE + "/mas_to_opioid_rx", "wb") as mas_to_opioid_rx_file:
    pickle.dump(mas_to_opioid_rx, mas_to_opioid_rx_file)

In [None]:
# Distinct MA_NUM values among the opioid claim rows
n_opioid_individuals = opioid_rx_mas.MA_NUM.nunique()
print(f"Number of individuals identified with an opioid prescription: \n{n_opioid_individuals}")

In [None]:
# Number of opioid claims recorded for each individual
rx_counts_per_ma = [len(ma_claims) for ma_claims in mas_to_opioid_rx.values()]
print(f"Average number of opioid prescriptions: {np.mean(rx_counts_per_ma)}")

In [None]:
# Histogram of the number of opioid claims per individual
rx_counts_per_ma = [len(ma_claims) for ma_claims in mas_to_opioid_rx.values()]
plt.hist(rx_counts_per_ma)
plt.show()

# 3 - Get Eligibility info for MAs

In [None]:
# Reuse the in-memory mapping, or reload the copy pickled by step 2.
try:
    mas_to_opioid_rx
except NameError:
    with open(STATE + "/mas_to_opioid_rx", "rb") as pickle_in:
        mas_to_opioid_rx = pickle.load(pickle_in)


# MA -> list of (elig_start, elig_end, initial_rx_date, claim, chronic_flag, aidcat),
# one entry per eligibility row that covers the individual's first opioid claim.
# chronic_flag:
#   -2  covers the initial rx, but not the BACK_BUFFER window before it
#   -1  back window is covered, but not the FORWARD_BUFFER window after it
#    0  both windows covered, no later opioid claim in
#       [initial + ACUTE_BUFFER, initial + FORWARD_BUFFER)
#    1  both windows covered and at least one such later opioid claim
opioid_naive_info = {}
chunks = 0
num_elig_start_nans = 0
num_elig_stop_nans = 0

# Stand-in used when a policy date is missing.
missing_policy_dt = datetime.datetime(datetime.MAXYEAR, 1, 1)

for ELG_PATH in ELG_PATHS:
    for gm_chunk in pd.read_csv(ELG_PATH, sep = '~', chunksize = 1000000,
                                usecols = ['POLICY_NUM','POLICY_START_DT','POLICY_END_DT','AIDCT_RF'],
                                dtype={'POLICY_NUM': str, 'POLICY_START_DT':str, 'POLICY_END_DT':str, 'AIDCT_RF':str}):
        curr_filter = gm_chunk.loc[gm_chunk.POLICY_NUM.isin(mas_to_opioid_rx)]
        for row in curr_filter.itertuples():
            ma = row.POLICY_NUM
            ma_claims = mas_to_opioid_rx[ma]
            # Claims are stored sorted by date, so element 0 is the first opioid claim.
            claim = ma_claims[0][0]
            aidcat = row.AIDCT_RF
            initial_rx_date = datetime.datetime.fromisoformat(ma_claims[0][1])

            if pd.isna(row.POLICY_START_DT):
                # A missing start becomes the far future, so the coverage test below fails.
                elig_start = missing_policy_dt
                num_elig_start_nans += 1
            else:
                elig_start = datetime.datetime.fromisoformat(row.POLICY_START_DT)

            if pd.isna(row.POLICY_END_DT):
                # A missing end becomes the far future, i.e. an open-ended period.
                elig_end = missing_policy_dt
                num_elig_stop_nans += 1
            else:
                elig_end = datetime.datetime.fromisoformat(row.POLICY_END_DT)

            # Only eligibility periods that contain the initial prescription are recorded.
            if not (elig_start <= initial_rx_date <= elig_end):
                continue

            back_limit = initial_rx_date - BACK_BUFFER
            forward_limit = initial_rx_date + FORWARD_BUFFER
            chronic_from = initial_rx_date + ACUTE_BUFFER

            chronic_flag = -2
            if elig_start <= back_limit and START_OF_RECORDS <= back_limit:
                chronic_flag = -1
                if elig_end >= forward_limit and END_OF_RECORDS >= forward_limit:
                    has_late_rx = any(
                        chronic_from <= datetime.datetime.fromisoformat(rx_date) < forward_limit
                        for _, rx_date in ma_claims[1:]
                    )
                    chronic_flag = 1 if has_late_rx else 0

            opioid_naive_info.setdefault(ma, []).append(
                (elig_start, elig_end, initial_rx_date, claim, chronic_flag, aidcat))

        chunks += 1
        if chunks % 50 == 0:
            print("Done with {} chunks".format(chunks))

print('num_elig_start_nans = {}'.format(num_elig_start_nans))
print('num_elig_stop_nans = {}'.format(num_elig_stop_nans))

with open(STATE + "/opioid_naive_info", "wb") as opioid_naive_info_file:
    pickle.dump(opioid_naive_info, opioid_naive_info_file)

In [None]:
# Tally the flag (field 4) of the first record stored for every individual.
flag_counts = Counter(periods[0][4] for periods in opioid_naive_info.values())
missing_prior = flag_counts[-2]
missing_post = flag_counts[-1]
acute = flag_counts[0]
chronic = flag_counts[1]

print("Number of eligible opioid naive individuals = {}".format(len(opioid_naive_info)))
for label, count in (("Missing Prior", missing_prior),
                     ("Missing Post", missing_post),
                     ("acute", acute),
                     ("chronic", chronic)):
    print("{} = {}".format(label, count))

In [None]:
# Keep the individuals whose first record carries a non-negative flag (0 or 1).
opioid_naive_info_known = {
    ma: periods
    for ma, periods in opioid_naive_info.items()
    if periods[0][4] >= 0
}

In [None]:
# Display the number of individuals kept in opioid_naive_info_known
len(opioid_naive_info_known)

In [None]:
print("Most common medicaid categories before eligibility requirement:")
# Share of individuals per category (field 5 of each individual's first record), top 10
aidcat_counts_all = Counter(periods[0][5] for periods in opioid_naive_info.values())
["{}: {:.1%}".format(aidcat, count / len(opioid_naive_info)) for aidcat, count in aidcat_counts_all.most_common(10)]

In [None]:
print("Most common medicaid categories after eligibility requirement:")
# Share of individuals per category (field 5 of each individual's first record), top 10
aidcat_counts_known = Counter(periods[0][5] for periods in opioid_naive_info_known.values())
["{}: {:.1%}".format(aidcat, count / len(opioid_naive_info_known)) for aidcat, count in aidcat_counts_known.most_common(10)]

In [None]:
# Persist the filtered dictionary; the context manager closes the file even
# if pickling fails part-way.
with open(STATE + "/opioid_naive_info_known", "wb") as opioid_naive_info_known_file:
    pickle.dump(opioid_naive_info_known, opioid_naive_info_known_file)

# Create a filtered Claim to MA dict

In [None]:
# Reuse the in-memory objects, or reload them from disk in a fresh session.
try:
    opioid_rx_mas
except NameError:
    opioid_rx_mas = pd.read_csv(STATE + "/opioid_rx_mas.csv", dtype = {'MA_NUM': str})

try:
    opioid_naive_info_known
except NameError:
    with open(STATE + "/opioid_naive_info_known", "rb") as pickle_in:
        opioid_naive_info_known = pickle.load(pickle_in)

# Filter on the dictionary this cell guarantees is loaded. The previous
# reference to opioid_naive_info raised NameError in a fresh session.
filter_opioid_claim_mas = opioid_rx_mas.loc[opioid_rx_mas.MA_NUM.isin(opioid_naive_info_known)]
opioid_claim_ma_dict = dict(zip(filter_opioid_claim_mas.CLAIM_ID, filter_opioid_claim_mas.MA_NUM))

# Write the dictionary out so it exists as a pickled file for later steps.
with open(STATE + "/opioid_claim_ma_dict", "wb") as opioid_claim_ma_dict_file:
    pickle.dump(opioid_claim_ma_dict, opioid_claim_ma_dict_file)


# 4. Filter tables 
Using only the MAs that passed the eligibility check, strip the other tables down to the rows that are needed.

Pickle in

In [None]:
# Reuse the in-memory dictionary, or reload the pickled copy; the context
# manager makes sure the file handle is closed.
try:
    opioid_naive_info_known
except NameError:
    with open(STATE + "/opioid_naive_info_known", "rb") as pickle_in:
        opioid_naive_info_known = pickle.load(pickle_in)

# Claim Common Filter

In [None]:
# Filter the claim-common table down to the individuals in
# opioid_naive_info_known; also write the list of their service providers.
CHUNKSIZE = 1000000
i = 0
rows_processed = 0
pd.DataFrame(columns = COMMON_VARIABLES).to_csv(STATE + "/common_filter.csv", index = False)
relevant_mas = set(opioid_naive_info_known.keys())
for CLAIM_COMMON_PATH in CLAIM_COMMON_PATHS:
    # MA_NUM is forced to text: it is matched against, and later used to index,
    # the keys of opioid_naive_info_known. Per-chunk dtype inference could
    # otherwise turn it into a number and silently match nothing.
    for gm_chunk in pd.read_csv(CLAIM_COMMON_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False,
                                usecols = COMMON_VARIABLES, dtype = {'MA_NUM': str}):
        gm_chunk.loc[gm_chunk.MA_NUM.isin(relevant_mas),COMMON_VARIABLES].to_csv(
            STATE + "/common_filter.csv", columns = COMMON_VARIABLES, mode='a', index = False, header = False)

        i += 1
        rows_processed += len(gm_chunk)
        if i % 10 == 0:
            print("Done with {} chunks".format(i))

# Actual number of rows read (the last chunk is usually shorter than CHUNKSIZE).
print("Chunking complete: Total Records processed: {}".format(rows_processed))
common_filter = pd.read_csv(STATE + "/common_filter.csv", dtype = {'MA_NUM': str})
common_filter.SRVC_PROVIDER_ID.to_csv(STATE + "/provider_ids.csv",index = False)
# Attach flag (field 4) and aid category (field 5) of each individual's first record.
common_filter['OUTCOME'] = [opioid_naive_info_known[ma][0][4] for ma in common_filter.MA_NUM]
common_filter['AIDCT_RF'] = [opioid_naive_info_known[ma][0][5] for ma in common_filter.MA_NUM]
common_filter.to_csv(STATE + "/common_filter.csv",index = False)

common_filter = common_filter.loc[:,['CLAIM_ID','MA_NUM']]

# CLAIM_ID -> MA_NUM for every retained claim; used to filter the other tables.
full_claim_ma_dict = dict(zip(common_filter.CLAIM_ID, common_filter.MA_NUM))
with open(STATE + "/full_claim_ma_dict", "wb") as full_claim_ma_dict_file:
    pickle.dump(full_claim_ma_dict, full_claim_ma_dict_file)

del common_filter

Refine dictionary for only claims related to eligible opioid naive ma's

### RX Filter

In [None]:
# Reuse the in-memory claim -> MA dictionary, or reload the pickled copy;
# the context manager makes sure the file handle is closed.
try:
    full_claim_ma_dict
except NameError:
    with open(STATE + "/full_claim_ma_dict", "rb") as pickle_in:
        full_claim_ma_dict = pickle.load(pickle_in)

In [None]:
# Keep the RX rows whose claim belongs to a retained individual.
CHUNKSIZE = 1000000
i = 0
# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = RX_VARIABLES).to_csv(STATE + "/rx_filter.csv", index = False)

relevant_claims = set(full_claim_ma_dict.keys())
for RX_PATH in RX_PATHS:
    for gm_chunk in pd.read_csv(RX_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False, usecols = RX_VARIABLES):
        gm_chunk.loc[gm_chunk.CLAIM_ID.isin(relevant_claims),RX_VARIABLES].to_csv(
            STATE + "/rx_filter.csv", columns = RX_VARIABLES, mode='a', index = False, header = False)
        i += 1
        if i % 10 == 0:
            print("Done with {} chunks".format(i))

# Re-read the filtered rows and attach the individual and their flag
# (field 4 of the individual's first record).
rx_filter = pd.read_csv(STATE + "/rx_filter.csv")
rx_filter['MA_NUM'] = [full_claim_ma_dict[claim] for claim in rx_filter.CLAIM_ID]
rx_filter['OUTCOME'] = [opioid_naive_info_known[ma][0][4] for ma in rx_filter.MA_NUM]
rx_filter.to_csv(STATE + "/rx_filter.csv",index = False)

del rx_filter

### Institutional Filter

In [None]:
# Reuse the in-memory claim -> MA dictionary, or reload the pickled copy;
# the context manager makes sure the file handle is closed.
try:
    full_claim_ma_dict
except NameError:
    with open(STATE + "/full_claim_ma_dict", "rb") as pickle_in:
        full_claim_ma_dict = pickle.load(pickle_in)

In [None]:
# Keep the institutional rows whose claim belongs to a retained individual.
CHUNKSIZE = 1000000
i = 0
# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = INSTITUTIONAL_VARIABLES).to_csv(STATE + "/institutional_filter.csv", index = False)
relevant_claims = set(full_claim_ma_dict.keys())

for INSTITUTIONAL_PATH in INSTITUTIONAL_PATHS:
    for gm_chunk in pd.read_csv(INSTITUTIONAL_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False, usecols = INSTITUTIONAL_VARIABLES):
        gm_chunk.loc[gm_chunk.CLAIM_ID.isin(relevant_claims),INSTITUTIONAL_VARIABLES].to_csv(
            STATE + "/institutional_filter.csv", columns = INSTITUTIONAL_VARIABLES, mode='a', index = False, header = False)

        i += 1
        if i % 10 == 0:
            print("Done with {} chunks".format(i))

# Re-read the filtered rows and attach the individual and their flag
# (field 4 of the individual's first record).
institutional_filter = pd.read_csv(STATE + "/institutional_filter.csv")
institutional_filter['MA_NUM'] = [full_claim_ma_dict[claim] for claim in institutional_filter.CLAIM_ID]
institutional_filter['OUTCOME'] = [opioid_naive_info_known[ma][0][4] for ma in institutional_filter.MA_NUM]
# Show the per-outcome means. The bare expression was discarded mid-cell, and
# without numeric_only=True pandas >= 2.0 raises on the non-numeric columns.
print(institutional_filter.groupby('OUTCOME').mean(numeric_only = True))
institutional_filter.to_csv(STATE + "/institutional_filter.csv",index = False)

del institutional_filter

### DX Filter

In [None]:
# Reuse the in-memory claim -> MA dictionary, or reload the pickled copy;
# the context manager makes sure the file handle is closed.
try:
    full_claim_ma_dict
except NameError:
    with open(STATE + "/full_claim_ma_dict", "rb") as pickle_in:
        full_claim_ma_dict = pickle.load(pickle_in)

In [None]:
# Keep the diagnosis rows whose claim belongs to a retained individual.
CHUNKSIZE = 1000000
i = 0
dx_out_path = STATE + "/dx_filter.csv"
# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = DX_VARIABLES).to_csv(dx_out_path, index = False)
relevant_claims = set(full_claim_ma_dict)

for DX_PATH in DX_PATHS:
    dx_reader = pd.read_csv(DX_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False, usecols = DX_VARIABLES)
    for gm_chunk in dx_reader:
        is_relevant = gm_chunk.CLAIM_ID.isin(relevant_claims)
        gm_chunk.loc[is_relevant, DX_VARIABLES].to_csv(
            dx_out_path, columns = DX_VARIABLES, mode='a', index = False, header = False)

        # Progress is reported after every chunk.
        i += 1
        print("Done with {} chunks".format(i))

# Re-read the filtered rows and attach the individual and their flag
# (field 4 of the individual's first record).
dx_filter = pd.read_csv(dx_out_path)
dx_filter['MA_NUM'] = [full_claim_ma_dict[claim] for claim in dx_filter.CLAIM_ID]
dx_filter['OUTCOME'] = [opioid_naive_info_known[ma][0][4] for ma in dx_filter.MA_NUM]

dx_filter.to_csv(dx_out_path, index = False)

del dx_filter

### IND Filter

In [None]:
# Keep the individual-table rows of the individuals in opioid_naive_info_known.
CHUNKSIZE = 1000000
i = 0
# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = IND_VARIABLES).to_csv(STATE + "/ind_filter.csv", index = False)
relevant_mas = set(opioid_naive_info_known.keys())
for IND_PATH in IND_PATHS:
    # MA_NUM is forced to text: it is matched against, and later used to index,
    # the keys of opioid_naive_info_known. Dtype inference could otherwise
    # turn it into a number and silently match nothing.
    for gm_chunk in pd.read_csv(IND_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False,
                                usecols = IND_VARIABLES, dtype = {'MA_NUM': str}):
        gm_chunk.loc[gm_chunk.MA_NUM.isin(relevant_mas),IND_VARIABLES].to_csv(
            STATE + "/ind_filter.csv", columns = IND_VARIABLES, mode='a', index = False, header = False)

        # Progress is reported after every chunk.
        i += 1
        print("Done with {} chunks".format(i))

# Re-read the filtered rows and attach the flag (field 4 of the individual's first record).
ind_filter = pd.read_csv(STATE + "/ind_filter.csv", dtype = {'MA_NUM': str})
ind_filter['OUTCOME'] = [opioid_naive_info_known[ma][0][4] for ma in ind_filter.MA_NUM]
# Show the per-outcome means. The bare expression was discarded mid-cell, and
# without numeric_only=True pandas >= 2.0 raises on the non-numeric columns.
print(ind_filter.groupby('OUTCOME').mean(numeric_only = True))
ind_filter.to_csv(STATE + "/ind_filter.csv",index = False)

del ind_filter

# INDC Filter

In [None]:
# Keep the INDC rows of the individuals present in ind_filter.csv.
CHUNKSIZE = 1000000
i = 0
indc_out_path = STATE + "/indc_filter.csv"
# Start from a header-only file; matching rows are appended chunk by chunk.
pd.DataFrame(columns = INDC_VARIABLES).to_csv(indc_out_path, index = False)
indv_ids = set(pd.read_csv(STATE + "/ind_filter.csv").INDV_ID)
for INDC_PATH in INDC_PATHS:
    indc_reader = pd.read_csv(INDC_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False, usecols = INDC_VARIABLES)
    for gm_chunk in indc_reader:
        is_relevant = gm_chunk.INDV_ID.isin(indv_ids)
        gm_chunk.loc[is_relevant, INDC_VARIABLES].to_csv(
            indc_out_path, columns = INDC_VARIABLES, mode='a', index = False, header = False)

        i += 1
        if not i % 10:
            print("Done with {} chunks".format(i))
print("DONE")

# Provider Filter

In [None]:
# Service provider ids written by the claim-common step; duplicates are
# dropped once here so each chunk is not matched against repeated values.
provider_ids = pd.read_csv(STATE + "/provider_ids.csv").SRVC_PROVIDER_ID.drop_duplicates()

CHUNKSIZE = 1000000
i = 0
# Collect the matching rows per chunk and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
provider_chunks = []
for PROVIDER_PATH in PROVIDER_PATHS:
    for gm_chunk in pd.read_csv(PROVIDER_PATH, sep = '~', chunksize = CHUNKSIZE, low_memory=False, usecols = PROVIDER_VARIABLES):
        provider_chunks.append(gm_chunk.loc[gm_chunk.PROVIDER_ID.isin(provider_ids),PROVIDER_VARIABLES])

        i += 1
        if i % 10 == 0:
            print("Done with {} chunks".format(i))

if provider_chunks:
    provider_filter = pd.concat(provider_chunks, ignore_index=True)
else:
    # No input chunks at all: still write a header-only file.
    provider_filter = pd.DataFrame(columns = PROVIDER_VARIABLES)

provider_filter.to_csv(STATE + "/provider_filter.csv",index = False)

del provider_filter

# Files you should end up with at the end of this step:
provider_filter.csv <br />
provider_ids.csv <br />
common_filter.csv <br />
ind_filter.csv <br />
indc_filter.csv <br />
dx_filter.csv <br />
institutional_filter.csv <br />
rx_filter.csv <br />
opioid_claim_ma_dict (pickled dictionary of opioid claims to their ma's) <br />
opioid_naive_info (pickled dictionary of opioid naive ma's to a list of their eligibility records: eligibility start, eligibility end, initial opioid rx date, initial rx claim, outcome flag and aid category) <br />
mas_to_opioid_rx (pickled dict of mas to their opioid claims, stored as (claim id, claim date) pairs sorted by date) <br />
opioid_rx_mas.csv (opioid claim to ma csv file) <br />
opioid_rx_claims (pickled set of opioid claims) <br />
full_claim_ma_dict (pickled dict of claim id to ma for every claim in common_filter.csv, i.e. all claims of the ma's kept in opioid_naive_info_known)