In [1]:
# create a dataframe
import boto3
import io
import pandas as pd
import time
from io import StringIO
import numpy as np

In [2]:
def process_partD(path, year):
    '''
    Input: source path to raw file and year
    Process: import table as a pd.DatafFrame and add `year` column
    Output: pd.DatafFrame
    '''
    chunksize = 10 ** 6
    chunks_df = []
    for chunk in pd.read_csv(path, sep="\t", low_memory=False, chunksize=chunksize):
        # append a year column
        chunk['year'] = pd.Series([year] * len(chunk))
        chunks_df.append(chunk)
    df = pd.concat(chunks_df, axis=0)
    return df

In [3]:
s3 = boto3.client('s3')
bucket='dast1healthcare' # Or whatever you called your bucket

path_tuples = [("PartD_Prescriber_PUF_NPI_Drug_13.txt", 2013),
               ("PartD_Prescriber_PUF_NPI_Drug_14.txt", 2014),
               ("PartD_Prescriber_PUF_NPI_Drug_15.txt", 2015),
               ("PartD_Prescriber_PUF_NPI_Drug_16.txt", 2016)]

df_list = []
for path in path_tuples:
    data_key = path[0] # Where the file is within your bucket
    data_location = 's3://{}/{}'.format(bucket, data_key)
    df_list.append(process_partD(data_location, path[1]))
df = pd.concat(df_list, axis=0)

In [None]:
'''
# save the huge merged part D data frame
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'medicare_part_d.csv').put(Body=csv_buffer.getvalue())
'''

In [5]:
# find unique ids
unique_id = np.array(list(set(zip(df["npi"].values,
                                  df["nppes_provider_last_org_name"].values, 
                                  df["nppes_provider_first_name"].values,
                                  df["specialty_description"].values,
                                  df["nppes_provider_city"].values,
                                  df["nppes_provider_state"].values)))).T
id_df = pd.DataFrame(unique_id.T, columns=["npi","last/org name","first name","specialty","city","state"])

In [6]:
'''
# save unique ids file
csv_buffer = StringIO()
id_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'identification_part_d.csv').put(Body=csv_buffer.getvalue())
'''

{'ResponseMetadata': {'RequestId': '26B7F00800D4F572',
  'HostId': 'n+2X9eyEwTCPJqIquVAbLUmur2cv30QEiy6IaSCZ1I8oYcMIcWREPjA67jwRoccj8Aj6yfTD9zE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'n+2X9eyEwTCPJqIquVAbLUmur2cv30QEiy6IaSCZ1I8oYcMIcWREPjA67jwRoccj8Aj6yfTD9zE=',
   'x-amz-request-id': '26B7F00800D4F572',
   'date': 'Thu, 09 Aug 2018 03:31:40 GMT',
   'etag': '"caccba3b387b33f199be5bafe71e49e4"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"caccba3b387b33f199be5bafe71e49e4"'}

---
## Start building a feature matrix

### What's going to be in the rows?

In [7]:
unique_ids = list(set(zip(*id_df.drop(columns=['npi']).as_matrix().T.tolist())))
unique_names = list(set(zip(*id_df[['last/org name','first name']].as_matrix().T.tolist())))
print("length of unique items: ",id_df.shape[0])
print("length of unique ids: ",len(unique_ids))
print("length of unique names: ",len(unique_names))
print("length of unique npi: ", len(set(id_df['npi'].values)))

length of unique items:  1296170
length of unique ids:  1295207
length of unique names:  907377
length of unique npi:  1075934


**We're going to go with unique npis in the rows for now!**

In [8]:
df1 = df.drop(columns=['nppes_provider_last_org_name',
                       'nppes_provider_first_name',
                       'nppes_provider_city',
                       'nppes_provider_state',
                       'specialty_description'])

In [9]:
df1.head()

Unnamed: 0,npi,description_flag,drug_name,generic_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,year
0,1003000126,S,ISOSORBIDE MONONITRATE ER,ISOSORBIDE MONONITRATE,,11,11.0,307,171.59,,*,,*,,,,2013.0
1,1003000126,S,LEVOFLOXACIN,LEVOFLOXACIN,26.0,26,26.0,165,227.1,15.0,,15.0,,15.0,106.0,159.72,2013.0
2,1003000126,S,LISINOPRIL,LISINOPRIL,17.0,19,19.0,570,100.37,,#,,#,,,,2013.0
3,1003000126,S,METOPROLOL TARTRATE,METOPROLOL TARTRATE,28.0,30,31.0,916,154.65,,#,,#,,,,2013.0
4,1003000126,S,PREDNISONE,PREDNISONE,14.0,14,14.0,133,44.72,,*,,*,,,,2013.0


### What's going to be in the columns?

In [10]:
unique_generic_names = list(set(df1["generic_name"].values))
unique_drug_names = list(set(df1["drug_name"].values))
print("number of unique drugs by generic_name: ", len(unique_generic_names))
print("number of unique drugs by drug_name: ", len(unique_drug_names))

number of unique drugs by generic_name:  2000
number of unique drugs by drug_name:  3394


**We're going to go with generic_name in the columns for now**

In [11]:
df2 = df1.drop(columns=['drug_name'])
df2.head()

Unnamed: 0,npi,description_flag,generic_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,year
0,1003000126,S,ISOSORBIDE MONONITRATE,,11,11.0,307,171.59,,*,,*,,,,2013.0
1,1003000126,S,LEVOFLOXACIN,26.0,26,26.0,165,227.1,15.0,,15.0,,15.0,106.0,159.72,2013.0
2,1003000126,S,LISINOPRIL,17.0,19,19.0,570,100.37,,#,,#,,,,2013.0
3,1003000126,S,METOPROLOL TARTRATE,28.0,30,31.0,916,154.65,,#,,#,,,,2013.0
4,1003000126,S,PREDNISONE,14.0,14,14.0,133,44.72,,*,,*,,,,2013.0


In [12]:
description_flags = list(set(df2['description_flag'].values))
bene_count_ge65_suppress_flags = list(set(df2['bene_count_ge65_suppress_flag'].values))
ge65_suppress_flags = list(set(df2['ge65_suppress_flag'].values))
print('description_flags: ', description_flags)
print('bene_count_ge65_suppress_flags: ', bene_count_ge65_suppress_flags)
print('ge65_suppress_flags: ', ge65_suppress_flags)

description_flags:  ['T', 'S']
bene_count_ge65_suppress_flags:  [nan, '#', '*']
ge65_suppress_flags:  [nan, '#', '*']


**We're going to drop `description_flag`,`bene_count_ge65_suppress_flags`,and `ge65_suppress_flags`.**

In [13]:
df3 = df2.drop(columns=['description_flag','bene_count_ge65_suppress_flag','ge65_suppress_flag'])
df3.head()

Unnamed: 0,npi,generic_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,total_claim_count_ge65,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,year
0,1003000126,ISOSORBIDE MONONITRATE,,11,11.0,307,171.59,,,,,,2013.0
1,1003000126,LEVOFLOXACIN,26.0,26,26.0,165,227.1,15.0,15.0,15.0,106.0,159.72,2013.0
2,1003000126,LISINOPRIL,17.0,19,19.0,570,100.37,,,,,,2013.0
3,1003000126,METOPROLOL TARTRATE,28.0,30,31.0,916,154.65,,,,,,2013.0
4,1003000126,PREDNISONE,14.0,14,14.0,133,44.72,,,,,,2013.0


### Let's start building the matrix
> 1. merge npi and year
> 2. build feature matrix
> 3. split feature matrix by year

In [1]:
# merging npi and year
df3['year']

NameError: name 'df3' is not defined

In [None]:
metrics = list(df2013)[2:]
metric = metrics[0]