# Reduce dimensions
---
> 1. Load and prep data
> 2. Focus on prescriptions that make up a significant annual dollar volume per prescriber
> 3. Focus on states with highest exclusion rates


In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from multiprocessing import Pool
!pip install tqdm
from tqdm import tqdm_notebook
import itertools

# import own scripts
import sys
sys.path.insert(0, '/healthcare-fraud/src/')
import scripts as src

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/7d/e6/19dfaff08fcbee7f3453e5b537e65a8364f1945f921a36d08be1e2ff3475/tqdm-4.24.0-py2.py3-none-any.whl (43kB)
[K    100% |████████████████████████████████| 51kB 13.6MB/s ta 0:00:01
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.24.0
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# import data: identifier frame, feature frame, and label frame, all read
# through the project's EFS helper (the label file is read with header=None)
df_id, X, y = (
    src.read_from_efs('df_id.csv'),
    src.read_from_efs('X.csv'),
    src.read_from_efs('y.csv', header=None),
)

  mask |= (ar1 == a)


In [3]:
# Report the dimensions of the identifier frame, then show its first rows
# (the bare last expression is what the notebook renders as the cell output).
print('df_id shape: ', df_id.shape)
df_id.head()

df_id shape:  (90624128, 7)


Unnamed: 0,year,npi,nppes_provider_last_org_name,nppes_provider_first_name,specialty_description,nppes_provider_city,nppes_provider_state
0,2013,1003000126,ENKESHAFI,ARDALAN,Internal Medicine,CUMBERLAND,MD
1,2013,1003000126,ENKESHAFI,ARDALAN,Internal Medicine,CUMBERLAND,MD
2,2013,1003000126,ENKESHAFI,ARDALAN,Internal Medicine,CUMBERLAND,MD
3,2013,1003000126,ENKESHAFI,ARDALAN,Internal Medicine,CUMBERLAND,MD
4,2013,1003000126,ENKESHAFI,ARDALAN,Internal Medicine,CUMBERLAND,MD


In [4]:
# Report the dimensions of the feature frame, then show its first rows
# (the bare last expression is what the notebook renders as the cell output).
print('X shape: ', X.shape)
X.head()

X shape:  (90624128, 15)


Unnamed: 0,generic_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,total_claim_count_ge65,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,total_drug_cost_per_beneficiary,total_drug_cost_per_claim,total_drug_cost_per_beneficiary_ge65,total_drug_cost_per_claim_ge65
0,ISOSORBIDE MONONITRATE,0.0,11,11.0,307,171.59,0.0,0.0,0.0,0.0,0.0,inf,15.599091,,
1,LEVOFLOXACIN,26.0,26,26.0,165,227.1,15.0,15.0,15.0,106.0,159.72,8.734615,8.734615,10.648,10.648
2,LISINOPRIL,17.0,19,19.0,570,100.37,0.0,0.0,0.0,0.0,0.0,5.904118,5.282632,,
3,METOPROLOL TARTRATE,28.0,30,31.0,916,154.65,0.0,0.0,0.0,0.0,0.0,5.523214,5.155,,
4,PREDNISONE,14.0,14,14.0,133,44.72,0.0,0.0,0.0,0.0,0.0,3.194286,3.194286,,


In [5]:
# Report the dimensions of the label frame, then show its first rows
# (the bare last expression is what the notebook renders as the cell output).
print('y shape: ', y.shape)
y.head()

y shape:  (90624128, 1)


Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,False
1,False
2,False
3,False
4,False


## Compare `excluded` and `not excluded` prescribers "apples to apples"
---
> 1. consider records only by common `specialty_description` and
> 2. by common `generic_name`

### Start w/ `specialty_description`
---

In [29]:
# build out `excluded` vs `not_excluded`
# `y` is a single-column frame (shape (n, 1)); flatten its values so the flag
# is stored as an ordinary 1-D column rather than relying on pandas coercing
# a 2-D array into a column.
df_id['excluded'] = y.values.ravel()
df_excl = df_id[df_id['excluded']]
df_ok = df_id[~df_id['excluded']]

# group by `specialty`: record count per specialty, most frequent first.
# Only the `npi` count is ever used, so count that one column instead of
# aggregating every column of the frame.
spec_excl = (
    df_excl.groupby('specialty_description')['npi']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
# NOTE: the non-excluded side is limited to its 10 most frequent specialties,
# so "common" below means "seen among excluded rows AND in the non-excluded top 10".
spec_ok = (
    df_ok.groupby('specialty_description')['npi']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# find common specialties
# (the misspelled name is kept on purpose: the clean-up cell below `del`s it by this name)
common_sepcialties = list(
    set(spec_excl['specialty_description']) & set(spec_ok['specialty_description'])
)

# reduce data sets by common specialties
# Build the row mask once and apply it to both frames; `X` is filtered with a
# mask derived from `df_id`, which relies on the two frames sharing an index.
in_common_specialty = df_id['specialty_description'].isin(common_sepcialties)
df_id_spec = df_id[in_common_specialty]
X_spec = X[in_common_specialty]
assert len(df_id_spec) == len(X_spec), 'df_id and X went out of row alignment'

# the mask is as long as the full frame; release it right away
del in_common_specialty

In [None]:
# merge dataframes: identifier columns and feature columns side by side
df = pd.concat(objs=[df_id_spec, X_spec], axis='columns')

# clear memory: drop the raw inputs and every intermediate built on the way to `df`
del df_id, X, y
del df_excl, df_ok
del spec_excl, spec_ok, common_sepcialties
del df_id_spec, X_spec

### Now, reduce by `generic_name`
---

In [56]:
# find common `generic_names`: names that occur in both excluded and non-excluded rows.
# De-duplicate inside pandas (`unique`) first so the Python sets are built from the
# distinct names only, not from every row; `.loc` replaces the chained
# column-then-mask indexing. The resulting set is the same.
common_generic_names = set(
    df.loc[df['excluded'], 'generic_name'].unique()
).intersection(
    df.loc[~df['excluded'], 'generic_name'].unique()
)

In [None]:
# Disabled shortcut: reload a previously saved specialty-reduced frame instead of
# rebuilding `df` in the cells above.
# NOTE(review): nothing visible in this notebook writes 'df_spec.csv' — confirm the
# file exists (and is current) before enabling this line, or remove the cell.
# df = src.read_from_efs('df_spec.csv')

In [None]:
# keep only the rows whose `generic_name` is in `common_generic_names`,
# and renumber the index from 0 so it no longer has gaps from the filtering
df_clean = df[df['generic_name'].isin(common_generic_names)].reset_index(drop=True)
print('df_clean shape: ', df_clean.shape)

# persist the reduced data set through the project's EFS helper
src.save_to_efs(df_clean, 'df_clean.csv')

# preview: a notebook only renders a bare expression when it is the LAST
# statement of the cell, so `head()` has to come after the save to be shown
df_clean.head()