# Purpose
The purpose of this file is to clean and manage data from the 2008-2010 Medicare Claims Synthetic Public Use Files (SynPUFs).

# Setup
First, we'll import the packages we'll need for data management tasks.

In [None]:
# import packages
import pandas as pd
import numpy as np

Next, we will load the Beneficiary Summary files into Python. There is one file for each year (2008, 2009, 2010).

In [None]:
# Read the three yearly Beneficiary Summary CSVs (Sample 1) in one pass,
# binding one DataFrame per calendar year.
demo_08, demo_09, demo_10 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/2008_Beneficiary_Summary_File_Sample_1.csv",
        "data/2009_Beneficiary_Summary_File_Sample_1.csv",
        "data/2010_Beneficiary_Summary_File_Sample_1.csv",
    )
)

Now we'll examine summary information about the datasets, including the number of rows/columns, the data types of each variable, and the first few rows of each dataset.

In [None]:
# Map a human-readable label to each yearly Beneficiary Summary DataFrame.
# Every label begins with the four-digit year of the file it describes.
datasets = {
    "2008 Beneficiary Summary File": demo_08,
    "2009 Beneficiary Summary File": demo_09,
    "2010 Beneficiary Summary File": demo_10,
}

# Loop through each Beneficiary Summary file and report its dimensions,
# column names, column data types, and first few rows.
for name, data in datasets.items():
    rows, cols = data.shape
    print(f"Dataset name: {name}")
    print(f"Number of rows: {rows}")
    print(f"Number of columns: {cols}")
    print("Variable (column) names:", data.columns.tolist())
    # to_string() prints every column's dtype instead of eliding long output
    print("Data types:")
    print(data.dtypes.to_string())
    print("First rows:")
    print(data.head())
    print("")  # add a blank line between datasets to improve clarity

Now we want to combine the three Beneficiary Summary files into a single file.

In [None]:
# Tag every row of each yearly file with the year that file covers.
for title, frame in datasets.items():
    # each dictionary key starts with the four-character year, e.g. "2008";
    # the value is stored as a string, not an integer
    frame["year"] = title[:4]

In [None]:
# preview the first five rows of the 2008 file
demo_08.head(n=5)

In [None]:
# Categorical columns to tabulate, mapped to a short description used as the
# heading of each printed frequency table.
categorical_vars = {
    "BENE_RACE_CD": "Race/ethnicity",
    "BENE_SEX_IDENT_CD": "Sex",
    "SP_STATE_CODE": "US State FIPS Code",
}

# Print how often each code occurs in the 2008 file, one table per column.
for var in categorical_vars:
    label = categorical_vars[var]
    frequencies = demo_08[var].value_counts()
    # order the table by the code values rather than by frequency
    summary = frequencies.sort_index()
    print(f"{label} ({var}):")
    print(summary)
    print("\n")