# ML Model training and evaluation

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce
import preprocessors as pp
# import pipeline as pipe

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import config

In [2]:
train_df = pd.read_csv(config.TRAIN_DATA_PATH)
test_df = pd.read_csv(config.TEST_DATA_PATH)

In [3]:
X_train, y_train = train_df.drop(config.TARGET,axis=1), train_df[config.TARGET]
X_test, y_test = test_df.drop(config.TARGET,axis=1), test_df[config.TARGET]

In [4]:
X_train.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VENTOLIN.HFA,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE
0,1609063734,M,AZ,MD,Internal Medicine,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1699072314,F,CA,D.D.S.,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1073576526,M,OH,DO,General Practice,0,0,0,13,0,...,0,15,0,0,0,57,0,0,0,11
3,1598830333,F,WI,M.D.,Family Practice,0,13,0,26,0,...,0,0,18,0,0,111,0,0,0,33
4,1184627390,M,FL,M.D.,Cardiology,0,0,0,0,0,...,0,15,0,0,13,313,124,78,0,0


### Categorical Encoder

In [5]:
# ohe_enc = ColumnTransformer([
#     ('OHE',OneHotEncoder(drop='if_binary',dtype='int'),config.BINARY_CAT_VARIABLES)
# ],remainder='passthrough')

In [6]:
# pipe_cat = Pipeline([
#     ('select_cat_cols',pp.FeaturesToKeep(cols=config.CAT_COLS)),
#     ('count_enc',ce.CountEncoder(cols=config.MULTI_CAT_VARIABLES ,normalize=True)),
#     ('ohe',ohe_enc)
# ])

In [7]:
# pipe_cat.fit_transform(X_train,y_train)

### Numerical Transformers

In [8]:
# pipe_num = Pipeline([
#     ('pca',pp.PCATransformer(variables=config.NUM_COLS,n_components=0.8)),
#     ('scaler',MinMaxScaler(feature_range=(-1,1)))
# ])

In [9]:
# out = pipe_num.fit_transform(X_train)

In [10]:
# out.shape

In [11]:
# out

### Main Pipeline

In [12]:
# preprocessor = FeatureUnion([
#     ('cat',pipe_cat),
#     ('num',pipe_num)
# ])

In [13]:
# preprocessor.fit_transform(X_train,y_train)

In [14]:
# pipe_main = Pipeline([
#     ('preprocessor',preprocessor),
#     ('classifier',LogisticRegression())
# ])

In [15]:
# pipe_main.fit(X_train,y_train);

In [16]:
# y_pred = pipe_main.predict(X_test)

In [17]:
# X_test.values

In [18]:
# print(classification_report(y_pred,y_test))

### Single line pipeline

In [20]:
pca_enc = pp.PCATransformer(cols=config.NUM_COLS,n_components=0.8)

In [21]:
pca_enc.fit_transform(X_train,y_train)

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
0,1609063734,M,AZ,MD,Internal Medicine,-120.785395,7.346413,-6.179289,24.847485,2.415422,-9.389932,-1.763559,-8.302811,-2.398511,-8.373487,-21.112739,12.210484,-0.294592,8.146047,4.582395
1,1699072314,F,CA,D.D.S.,Dentist,-154.552362,-11.938337,2.840597,3.173967,-2.977278,-9.780071,0.527900,-4.410619,1.787577,-2.824101,0.858873,8.168814,-2.468351,-0.857065,5.343977
2,1073576526,M,OH,DO,General Practice,169.489664,-43.161840,-60.421631,-9.370860,28.360406,-6.626448,58.537109,19.793902,51.722169,42.708138,-28.438872,-80.073397,40.114726,4.916226,16.238277
3,1598830333,F,WI,M.D.,Family Practice,260.361704,40.349066,26.060115,-31.274980,-34.801225,-17.397325,49.906118,15.097001,46.801253,-5.265057,50.689229,-25.994695,11.765565,-0.646844,18.024620
4,1184627390,M,FL,M.D.,Cardiology,611.677391,-445.193169,-48.150695,416.081109,468.423279,51.749981,27.214466,45.773605,-62.275995,-16.505278,10.107403,66.698790,-30.097493,10.553646,-68.962205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1437361037,M,NY,"MD, MS",Gynecological/Oncology,-142.880883,25.345619,-13.361391,14.415163,0.734601,-8.932778,1.737924,-2.386511,6.585589,-13.556997,-7.524030,0.155484,-10.775992,-12.899824,12.160506
19996,1861546038,M,WA,DDS,Dentist,-154.395432,-11.827451,2.728051,3.102199,-3.126266,-9.876671,0.514506,-4.330079,1.789078,-3.279426,0.839424,8.186904,-2.624431,-0.970808,5.602125
19997,1780856427,M,CA,M.D.,Diagnostic Radiology,-154.624237,-12.052810,2.924976,3.183481,-2.987927,-9.674182,-0.659387,-3.442254,2.331975,-2.265799,0.891795,8.161215,-2.205487,-0.747991,4.949229
19998,1811089873,M,CT,D.O.,Internal Medicine,-77.114122,-34.118641,-6.359361,-7.582830,-14.939301,-12.001344,14.162299,12.954119,-30.564731,-1.379303,-34.936371,20.075103,-23.549844,5.952001,-12.647827


In [22]:
pipe = Pipeline([
    ('drop',pp.FeaturesToDrop(config.DROP_COLS)),
#     ('ohe',pp.DummyCatEncoder(cols=config.BINARY_CAT_VARIABLES,drop_first=True)),
#     ('rare',pp.RareLabelCatEncoder(cols=config.MULTI_CAT_VARIABLES)),
    ('freq',pp.FrequencyCatEncoder(cols=config.MULTI_CAT_VARIABLES)),
#     ('pca',pp.PCATransformer(cols=config.NUM_COLS,n_components=0.8)),
#     ('scaler',MinMaxScaler((0, 100))),
#     ('clf',LogisticRegression())
])

In [23]:
# pipe.fit(X_train[config.CAT_COLS],y_train)

In [24]:
# pipe.transform(X_train[config.CAT_COLS])

In [25]:
# pipe.transform(X_train[config.CAT_COLS].sample(2))

In [26]:
# X_train[config.CAT_COLS].sample(2)

In [29]:
test = np.arange(50).reshape(-1,5)
test

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49]])

In [32]:
pd.DataFrame(test).add_prefix('t_')

Unnamed: 0,t_0,t_1,t_2,t_3,t_4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [33]:
ohe = pp.OneHOtCatEncoder(cols='Gender')

In [36]:
# ohe.fit_transform(X_train,y_train)

In [56]:
ohe_sk = OneHotEncoder(sparse=False)

In [58]:
ohe_sk.fit_transform(X_train['Gender','State'].values.reshape(-1,1))#.todense()

KeyError: ('Gender', 'State')