In [1]:
# imports used throughout the notebook
import io
import os
import tempfile
import time
from io import StringIO

import boto3
import numpy as np
import pandas as pd

In [3]:
def process_partD(path, year, chunksize=10 ** 6):
    '''
    Load one raw Part D file and tag every row with its reporting year.

    Input:
        path: location of the raw tab-separated file; anything `pd.read_csv`
            accepts (path, URL, or open file-like object).
        year: value stored in the new `year` column of every row.
        chunksize: number of rows parsed per chunk (default 10 ** 6).
    Process: read the table chunk by chunk, add a `year` column to each
        chunk, and concatenate the chunks row-wise.
    Output: pd.DataFrame holding all rows of the file plus the `year` column.
    '''
    chunks_df = []
    for chunk in pd.read_csv(path, sep="\t", low_memory=False, chunksize=chunksize):
        # Assign the scalar so pandas broadcasts it to every row. Wrapping it
        # in a fresh pd.Series (indexed 0..len-1) would be aligned against the
        # chunk's own index, which keeps counting across chunks, so every
        # chunk after the first would end up with NaN in `year`.
        chunk['year'] = year
        chunks_df.append(chunk)
    df = pd.concat(chunks_df, axis=0)
    return df

In [5]:
# S3 client handle and the bucket that holds the raw Part D extracts.
s3 = boto3.client('s3')
bucket = 'dast1healthcare'  # Or whatever you called your bucket

# One (file name inside the bucket, reporting year) pair per yearly extract.
path_tuples = [
    ("PartD_Prescriber_PUF_NPI_Drug_13.txt", 2013),
    ("PartD_Prescriber_PUF_NPI_Drug_14.txt", 2014),
    ("PartD_Prescriber_PUF_NPI_Drug_15.txt", 2015),
    ("PartD_Prescriber_PUF_NPI_Drug_16.txt", 2016),
]

# Read every yearly file from S3, then stack the frames row-wise.
df_list = []
for path in path_tuples:
    data_key, file_year = path  # key within the bucket, year to tag rows with
    data_location = 's3://{}/{}'.format(bucket, data_key)
    yearly_df = process_partD(data_location, file_year)
    df_list.append(yearly_df)
df = pd.concat(df_list, axis=0)

In [None]:
# save the huge merged part D data frame
# Spool the CSV to a temporary file on disk and hand it to boto3's managed
# transfer. Building the whole CSV in a StringIO and calling .getvalue() holds
# two full copies of the text in memory, and a single PutObject request is
# limited to 5 GB; upload_file switches to multipart upload for large files.
# NOTE(review): this needs enough free disk space for the full CSV.
csv_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', newline='',
                                       encoding='utf-8', delete=False)
try:
    with csv_file:
        # Same serialization as before (index column included).
        df.to_csv(csv_file)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket, 'medicare_part_d.csv').upload_file(csv_file.name)
finally:
    # Remove the spool file whether or not the upload succeeded.
    os.remove(csv_file.name)

In [75]:
# find unique ids
# Columns of `df` that together identify a prescriber, and the names they get
# in the identification table (same order in both lists).
id_source_columns = ["npi",
                     "nppes_provider_last_org_name",
                     "nppes_provider_first_name",
                     "specialty_description",
                     "nppes_provider_city",
                     "nppes_provider_state"]
id_output_columns = ["npi", "last/org name", "first name", "specialty", "city", "state"]

# drop_duplicates keeps the first occurrence of each distinct row, so the
# result is in a reproducible order (a Python set has no stable order) and
# each column keeps its own dtype instead of being coerced into one array
# dtype by np.array. Missing values are treated as equal to each other.
id_df = df[id_source_columns].drop_duplicates().reset_index(drop=True)
id_df.columns = id_output_columns

# Kept for any later cell that still uses the array form: one row per field,
# one column per unique id.
unique_id = id_df.values.T

In [82]:
# Serialize the identification table to CSV in memory and upload it to S3.
csv_buffer = StringIO()
id_df.to_csv(csv_buffer)
csv_payload = csv_buffer.getvalue()

s3_resource = boto3.resource('s3')
id_object = s3_resource.Object(bucket, 'identification_part_d.csv')
# Last expression of the cell, so the S3 response is displayed as output.
id_object.put(Body=csv_payload)

{'ResponseMetadata': {'RequestId': '5D5B888591494B16',
  'HostId': '5njqxaJoQqCGBl4z1/dmlrn6lsHw0Hm2PCXwwLYQCymC2UEFDevALVcw3cELu+G8AhecS48u5Ow=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '5njqxaJoQqCGBl4z1/dmlrn6lsHw0Hm2PCXwwLYQCymC2UEFDevALVcw3cELu+G8AhecS48u5Ow=',
   'x-amz-request-id': '5D5B888591494B16',
   'date': 'Tue, 07 Aug 2018 23:59:33 GMT',
   'etag': '"6bc9ebc9d5b047ee1a2051a27783fa03"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"6bc9ebc9d5b047ee1a2051a27783fa03"'}