In [1]:
import pandas as pd
#pd.set_option('display.max_columns', None)
import sklearn
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn import metrics
import numpy as np
import os

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)


In [2]:
import json
dataset_filename = 'data.jsonl'
def iter_dataset():
    with open(dataset_filename, 'rt') as f:
        for line in f:
            ex = json.loads(line)
            yield (ex['cms_prescription_counts'],
                   ex['provider_variables'])

def merge_dicts(*dicts: dict):
    merged_dict = dict()
    for dictionary in dicts:
        merged_dict.update(dictionary)
    return merged_dict

data = [merge_dicts(x, y) for x, y in iter_dataset()]


In [3]:
df = pd.DataFrame(data)
df.fillna(0, inplace=True)

df.drop(columns='gender', inplace=True)
df.drop(columns='region', inplace=True)
df.drop(columns='settlement_type', inplace=True)
df.drop(columns='years_practicing', inplace=True)

drugName = "specialty-Pulmonary Diagnostics"

encode_text_dummy(df, 'specialty')

df.head()

Unnamed: 0,1ST TIER UNIFINE PENTIPS,ABACAVIR,ABELCET,ABILIFY,ABILIFY DISCMELT,ABILIFY MAINTENA,ABRAXANE,ABSTRAL,ACAMPROSATE CALCIUM,ACANYA,...,specialty-Vascular & Interventional Radiology,specialty-Vascular Neurology,specialty-Vascular Sonography,specialty-Vascular Specialist,specialty-Vascular Surgery,specialty-Vision Therapy,specialty-Women,specialty-Women's Health,"specialty-Women's Health Care, Ambulatory",specialty-Wound Care
0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Encode to a 2D matrix for training
x,y = to_xy(df, drugName)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45) 

#regressor = LinearRegression()
regressor = LinearRegression()

# Fit/train linear regression
regressor.fit(x_train,y_train)

# Predict
pred = regressor.predict(x_test)

# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))

print("Final score (RMSE): {}".format(score))

Final score (RMSE): 0.0001403924252372235


In [5]:
names = list(df.columns.values)
print(names)

['1ST TIER UNIFINE PENTIPS', 'ABACAVIR', 'ABELCET', 'ABILIFY', 'ABILIFY DISCMELT', 'ABILIFY MAINTENA', 'ABRAXANE', 'ABSTRAL', 'ACAMPROSATE CALCIUM', 'ACANYA', 'ACARBOSE', 'ACCOLATE', 'ACCUNEB', 'ACCUPRIL', 'ACEBUTOLOL HCL', 'ACETAMINOPH-CAFF-DIHYDROCODEIN', 'ACETAMINOPHEN-BUTALBITAL', 'ACETAMINOPHEN-CODEINE', 'ACETAZOLAMIDE', 'ACETIC ACID', 'ACETIC ACID-ALUMINUM', 'ACETYLCYSTEINE', 'ACIPHEX', 'ACITRETIN', 'ACTEMRA', 'ACTIGALL', 'ACTIMMUNE', 'ACTIQ', 'ACTIVELLA', 'ACTONEL', 'ACTOPLUS MET', 'ACTOPLUS MET XR', 'ACTOS', 'ACYCLOVIR', 'ACZONE', 'ADACEL TDAP', 'ADALAT CC', 'ADAPALENE', 'ADCIRCA', 'ADDERALL', 'ADDERALL XR', 'ADRUCIL', 'ADVAIR DISKUS', 'ADVAIR HFA', 'ADVICOR', 'ADVOCATE PEN NEEDLES', 'AFEDITAB CR', 'AFINITOR', 'AGGRENOX', 'AGRYLIN', 'AIMSCO ULTRA THIN II', 'AK-POLY-BAC', 'ALBENZA', 'ALBUTEIN', 'ALBUTEROL SULFATE', 'ALCLOMETASONE DIPROPIONATE', 'ALCOHOL PADS', 'ALCOHOL PREP PADS', 'ALCOHOL PREP SWABS', 'ALCOHOL SWAB', 'ALCOHOL SWABS', 'ALCOHOL WIPES', 'ALDACTAZIDE', 'ALDACTONE',

In [6]:
regressor.coef_

array([ 6.1351204e-07,  1.1510107e-06, -7.2696034e-06, ...,
       -1.0019116e-03, -8.5179921e-04,  2.6145394e-04], dtype=float32)

In [7]:
regressor.intercept_

0.0010049464

In [8]:
%matplotlib inline    
from IPython.display import display   

names.remove(drugName)


def report_coef(names,coef,intercept):
    r = pd.DataFrame( { 'coef': coef, 'positive': coef>0.4  }, index = names )
    r = r.sort_values(by=['coef'])
    
    badRows = r[(r['positive'] == False)].index
    #badCoef = r[(r['coef'] <= 1.00e-02)].index
    r.drop(badRows, inplace=True)
    #r.drop(badCoef, inplace=True)
    
    display(r)
    print("Intercept: {}".format(intercept))
    r['coef'].plot(kind='barh', color=r['positive'].map({True: 'b', False: 'r'}))

In [9]:
report_coef(
  names,
  (regressor.coef_ * 1.9),
  regressor.intercept_)

Unnamed: 0,coef,positive


Intercept: 0.001004946418106556


TypeError: Empty 'DataFrame': no numeric data to plot