# Reduce dimensions
---
> 1. Load and prep data
> 2. Focus on prescriptions that make up a significant annual dollar volume per prescriber
> 3. Focus on states with highest exclusion rates


In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from multiprocessing import Pool
!pip install tqdm
from tqdm import tqdm_notebook
import itertools

# import own scripts
import sys
sys.path.insert(0, '/healthcare-fraud/src/')
import scripts as src

distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [None]:
# Load the target/exclusion table and collect its unique NPIs.
# The frame is bound to `LEIE` because the two lines below read that name;
# the original bound it to `df`, so the cell failed with a NameError.
# NOTE(review): presumably 'df_trgt.csv' holds the LEIE exclusion records - confirm.
LEIE = src.read_from_efs('df_trgt.csv')
print('LEIE shape: ', LEIE.shape)
excl_npi = LEIE['npi'].drop_duplicates().tolist()

In [None]:
# Load 'df.csv' through the project helper, print its dimensions and
# preview the first rows. Later cells read the `npi` column of this frame.
df = src.read_from_efs('df.csv')
print('df shape: ', df.shape)
df.head()

In [None]:
# Unique NPIs present in `df`, in order of first appearance.
unique_npi = df['npi'].drop_duplicates()
npi = unique_npi.tolist()

In [None]:
# NPIs that appear both in the exclusion list and in `df`.
set(excl_npi) & set(npi)

In [None]:
# import data
# Load 'df_clean.csv' through the project helper; later cells read its
# `npi` and `excluded` columns.
df_clean = src.read_from_efs('df_clean.csv')

# Report the dimensions and preview the first rows.
print('df_clean shape: ', df_clean.shape)
df_clean.head()

In [None]:
# Split the unique NPIs by the `excluded` flag and print both group sizes.
# assumes `excluded` is a boolean column - TODO confirm
is_excluded = df_clean['excluded']
npi_excl = df_clean.loc[is_excluded, 'npi'].drop_duplicates()
npi_ok = df_clean.loc[~is_excluded, 'npi'].drop_duplicates()
print(len(npi_excl), len(npi_ok))

## Exclude extremely rare prescriptions
---
> Rank each `generic_name` by prescription volume, using the sum of `total_30_day_fill_count` and `total_30_day_fill_count_ge65`, and keep only the most-prescribed drugs.

In [None]:
# Clean NaNs and infinities and convert them to 0.
# Both +inf and -inf are replaced; the original only replaced +inf, which
# left -inf values in the feature matrix.
X = X.fillna(0).replace([np.inf, -np.inf], 0)

# Total fill counts per drug. The two count columns are selected *before*
# aggregating so that no other column of X (possibly non-numeric) is summed
# only to be thrown away afterwards.
fill_cols = ['total_30_day_fill_count', 'total_30_day_fill_count_ge65']
generic_name_cnt = X.groupby('generic_name')[fill_cols].sum().reset_index()

# Ranking key: the sum of both fill-count columns.
generic_name_cnt['sum'] = (generic_name_cnt['total_30_day_fill_count']
                           + generic_name_cnt['total_30_day_fill_count_ge65'])

# Most-prescribed drugs first, with a fresh 0..n-1 index so that positional
# slicing in later cells picks the top rows.
generic_name_cnt.sort_values(by='sum', ascending=False, inplace=True)
generic_name_cnt.reset_index(drop=True, inplace=True)
print('generic_name_cnt shape: ', generic_name_cnt.shape)
generic_name_cnt.head()

In [None]:
# Plot the per-drug prescription counts (already sorted in descending order)
# on log-log axes, passing the cut-off used to select the target drugs.
threshold = 250
data = generic_name_cnt['sum']
src.log_plot(
    data,
    threshold,
    'Prescription count `generic_name` in descending order',
    'Prescription Count',
    'generic_name',
    logx=True,
    logy=True,
    figsize=(8, 5),
)
plt.show()

In [None]:
# Names of the first `threshold` drugs in the ranked table.
target_drugs = generic_name_cnt['generic_name'].iloc[:threshold]

# get indices for target_drugs
def find_drug_index(drug):
    """Return the index labels of the rows of the global ``X`` whose
    ``generic_name`` equals ``drug``, as a plain list."""
    is_match = X['generic_name'] == drug
    return X.index[is_match.values].tolist()

In [None]:
# Print a label, then let the shell count the lines of /proc/cpuinfo that
# contain "processor" (IPython `!` shell escape; requires a Linux host).
print("Number of available CPU cores: ")
!cat /proc/cpuinfo | grep processor | wc -l

In [None]:
# find target indices for target drugs using multiprocessing
# `processors` is an upper bound on the pool size.
processors = 64
if __name__ == '__main__':
    tasks = target_drugs
    function = find_drug_index
    # Never start more workers than there are tasks, and always at least one
    # (Pool rejects a process count of 0).
    n_workers = max(1, min(processors, len(tasks)))
    # NOTE(review): the workers read the global X used by find_drug_index;
    # this presumably relies on fork-style process start - confirm on non-Linux hosts.
    with Pool(processes=n_workers) as p:
        # imap yields the per-drug index lists in task order; tqdm only adds a progress bar.
        results = list(tqdm_notebook(p.imap(function, tasks), total=len(tasks)))

    # Flatten the per-drug lists into one list of row indices. Kept inside the
    # guard so it is only evaluated when `results` has actually been produced.
    trgt_idx = list(itertools.chain.from_iterable(results))

In [None]:
# rebuild data with target `generic_names`
# NOTE(review): trgt_idx holds index labels taken from X but is applied
# positionally here - assumes X and df_id share a default 0..n-1 index; confirm.
df_id_250drugs = df_id.iloc[trgt_idx]
df_id_250drugs = df_id_250drugs.reset_index(drop=True)
print("df_id_250drugs shape: ", df_id_250drugs.shape)
df_id_250drugs.head()

In [None]:
# Apply the same row selection to the feature matrix and to the labels,
# renumbering the rows from 0, and print the resulting dimensions.
X_250drugs = X.iloc[trgt_idx]
X_250drugs = X_250drugs.reset_index(drop=True)
y_250drugs = y.iloc[trgt_idx]
y_250drugs = y_250drugs.reset_index(drop=True)

print("X_250drugs shape: ", X_250drugs.shape)
print("y_250drugs shape: ", y_250drugs.shape)

In [None]:
# Renumber the rows from 0. The cell that builds df_id_250drugs already
# resets its index, so this repeats that step without changing the frame.
df_id_250drugs = df_id_250drugs.reset_index(drop=True)

In [None]:
# Renumber the rows from 0. Both frames already had their index reset when
# they were built, so these calls repeat that step without changing them.
X_250drugs = X_250drugs.reset_index(drop=True)
y_250drugs = y_250drugs.reset_index(drop=True)

### **Label Data (y)**
---

In [None]:
# print('y shape: ', y.shape)
# y.head()
# Drop the names of the unreduced frames; the cells below only use the
# *_250drugs subsets (presumably done to free memory).
del df_id, X, y

## Focus on states with highest exclusion rates
---

In [None]:
# Attach the labels to the identification frame so they can be grouped by state.
df_id_250drugs['exclusion'] = y_250drugs

by_state = df_id_250drugs.groupby('nppes_provider_state')

# Sum of the `exclusion` label per state, largest first.
exclusions_by_state = by_state[['exclusion']].sum()
exclusions_by_state = exclusions_by_state.sort_values('exclusion', ascending=False).astype(int)

# Number of non-null `npi` entries (rows) per state, largest first.
# NOTE(review): this counts rows, not distinct NPIs - confirm that is intended.
providers_by_state = by_state[['npi']].count()
providers_by_state = providers_by_state.sort_values('npi', ascending=False).astype(int)

# Combine both counts on the state index (left join on providers_by_state).
counts_by_state = providers_by_state.join(exclusions_by_state)

# Exclusion ratio per state, highest ratio first.
counts_by_state['exclusion_ratio'] = counts_by_state['exclusion'] / counts_by_state['npi']
counts_by_state = counts_by_state.sort_values('exclusion_ratio', ascending=False)
counts_by_state.head()

In [None]:
# Build `state_ratio_df` with cumulative counts, following the row order of
# `counts_by_state` (states sorted by descending exclusion ratio).
running_totals = counts_by_state[['npi', 'exclusion']].cumsum()
state_ratio_df = running_totals.rename(columns={'npi': 'cumm_provider_cnt',
                                                'exclusion': 'cumm_exclusion_cnt'})
# Ratio of cumulative exclusions to cumulative provider rows.
state_ratio_df['exclusion_ratio'] = (state_ratio_df['cumm_exclusion_cnt']
                                     / state_ratio_df['cumm_provider_cnt'])

In [None]:
# Plot the cumulative chart, sorted by exclusion ratio, passing the number
# of states used as the cut-off.
threshold_value = 20
src.plot_multi(
    state_ratio_df,
    threshold_value,
    'States',
    figsize=(8, 5),
    title='Exclusion ratio sorted by State',
)
plt.show()

In [None]:
# The first `threshold_value` states of `state_ratio_df`, whose rows follow
# the descending exclusion-ratio order of `counts_by_state`.
target_states = state_ratio_df.index[:threshold_value].tolist()
# Print the list that was just built; the original printed `top_states`,
# a name that is never defined, so the cell failed with a NameError.
print('Top states by exclusion ratio: ', target_states)

In [None]:
# get indices for target states
def find_state_index(state):
    """Return the index labels of the rows of the global ``df_id_250drugs``
    whose ``nppes_provider_state`` equals ``state``, as a plain list."""
    in_state = df_id_250drugs['nppes_provider_state'] == state
    return df_id_250drugs.index[in_state.values].tolist()

In [None]:
# find target indices for target states using multiprocessing
# `processors` is an upper bound on the pool size.
processors = 64
if __name__ == '__main__':
    tasks = target_states
    function = find_state_index
    # Never start more workers than there are states to look up, and always
    # at least one (Pool rejects a process count of 0).
    n_workers = max(1, min(processors, len(tasks)))
    # NOTE(review): the workers read the global df_id_250drugs used by
    # find_state_index; this presumably relies on fork-style process start - confirm.
    with Pool(processes=n_workers) as p:
        # imap yields the per-state index lists in task order; tqdm only adds a progress bar.
        results = list(tqdm_notebook(p.imap(function, tasks), total=len(tasks)))

    # Flatten the per-state lists into one list of row indices. Kept inside the
    # guard so it is only evaluated when `results` has actually been produced.
    trgt_idx = list(itertools.chain.from_iterable(results))

In [None]:
# reduce data sets by top states: take the same rows from all three frames
# and renumber them from 0
df_id_reduced, X_reduced, y_reduced = (
    frame.iloc[trgt_idx].reset_index(drop=True)
    for frame in (df_id_250drugs, X_250drugs, y_250drugs)
)

In [None]:
# Remove the 'exclusion' label column from df_id_reduced; the labels are
# held separately in y_reduced.
df_id_reduced = df_id_reduced.drop('exclusion', axis=1)

## Results:
---

### Reduced Identification Data (df_id)
> For consistency, the `exclusion` column is removed from `df_id`; the labels are kept separately as `y_reduced`.

In [None]:
# Report the dimensions of the reduced identification data and preview its first rows.
print('df_id_reduced shape: ', df_id_reduced.shape)
df_id_reduced.head()

#### Reduced Feature Matrix (X)
---

In [None]:
# Report the dimensions of the reduced feature matrix and preview its first rows.
print('X_reduced shape: ', X_reduced.shape)
X_reduced.head()

#### Reduced Label Data `y_reduced`
---

In [None]:
# Report the dimensions of the reduced label data and preview its first rows.
print('y_reduced shape: ', y_reduced.shape)
y_reduced.head()

### Save to efs

In [None]:
# Persist the three reduced data sets through the project helper, in the
# order labels, features, identification data.
outputs = (
    (y_reduced, 'y_reduced.csv'),
    (X_reduced, 'X_reduced.csv'),
    (df_id_reduced, 'df_id_reduced.csv'),
)
for frame, filename in outputs:
    src.save_to_efs(frame, filename)