# ML Model training and evaluation

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce
import preprocessors as pp
# import pipeline as pipe

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import config

In [2]:
train_df = pd.read_csv(config.TRAIN_DATA_PATH)
test_df = pd.read_csv(config.TEST_DATA_PATH)

In [3]:
X_train, y_train = train_df.drop(config.TARGET,axis=1), train_df[config.TARGET]
X_test, y_test = test_df.drop(config.TARGET,axis=1), test_df[config.TARGET]

In [4]:
X_train.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VENTOLIN.HFA,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE
0,1609063734,M,AZ,MD,Internal Medicine,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1699072314,F,CA,D.D.S.,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1073576526,M,OH,DO,General Practice,0,0,0,13,0,...,0,15,0,0,0,57,0,0,0,11
3,1598830333,F,WI,M.D.,Family Practice,0,13,0,26,0,...,0,0,18,0,0,111,0,0,0,33
4,1184627390,M,FL,M.D.,Cardiology,0,0,0,0,0,...,0,15,0,0,13,313,124,78,0,0


### Building Pipeline

In [5]:
pipe = Pipeline([
    ('drop',pp.FeaturesToDrop(config.DROP_COLS)),
    ('ohe',pp.OneHotCatEncoder(cols=config.BINARY_CAT_VARIABLES,drop='if_binary')),
    ('rare',pp.RareLabelCatEncoder(cols=config.MULTI_CAT_VARIABLES)),
    ('freq',pp.FrequencyCatEncoder(cols=config.MULTI_CAT_VARIABLES)),
    ('pca',pp.PCATransformer(cols=config.NUM_COLS,n_components=0.8)),
    ('scaler',MinMaxScaler((0, 100))),
    ('clf',GradientBoostingClassifier())
])

In [6]:
pipe.fit(X_train,y_train)

  return f(**kwargs)


Pipeline(steps=[('drop', FeaturesToDrop()), ('ohe', OneHotCatEncoder()),
                ('rare', RareLabelCatEncoder()),
                ('freq', FrequencyCatEncoder()), ('pca', PCATransformer()),
                ('scaler', MinMaxScaler(feature_range=(0, 100))),
                ('clf', GradientBoostingClassifier())])

In [7]:
pipe.predict(X_test.sample(5,random_state=1))

array([1, 0, 0, 0, 1])

In [8]:
y_test.sample(5,random_state=1).reset_index(drop=True)

Unnamed: 0,Opioid.Prescriber
0,1
1,0
2,0
3,0
4,1
