In [1]:
import pandas as pd
import numpy as np

## Read in response variable:

In [2]:
response_variable = pd.read_csv('response_variable_v2.csv')
response_variable = response_variable.iloc[:,[0,2]]

## Matthew's features:

In [3]:
matthew_frame = pd.read_csv('features_matthew_v2.csv')

## Chris's features:

In [14]:
chris_frame_pc = pd.read_csv('pc_diagnosis.csv')

# Confirm that Matthew's model is running still:

In [17]:
df = response_variable.merge(matthew_frame, on='id')
df = df.set_index('id')

# Training/test split

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X = df.iloc[:,1:]
X = X[['SUPPLY_CNT_on_day0', 'PAYABLE_QTY_on_day0', 'total_SUPPLY_CNT_prior', 'total_PAYABLE_QTY_prior']]
y = df.iloc[:,0].map(lambda x: 1 if x == True else 0 )

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=33)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

neigh = KNeighborsClassifier(n_neighbors=135)

cv_results = cross_validate(neigh, X_train, y_train, cv=10,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8665545465491604

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, max_depth=7,
                                  random_state=100)

cv_results = cross_validate(rf_model, X_train, y_train, cv=10,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8693006222087076

# Let's see if adding some principal components helps at all:

In [73]:
df = response_variable.merge(matthew_frame, on='id')
df = pd.merge(df, chris_frame_pc, on = 'id', how = 'left')
df = df.set_index('id')

In [74]:
#restrict to first four PCs
df = df.iloc[:,:-46]
df

Unnamed: 0_level_0,LTOT_v2,MME_on_day0,SUPPLY_CNT_on_day0,PAYABLE_QTY_on_day0,max_MME_prior,avg_MME_prior,total_SUPPLY_CNT_prior,total_PAYABLE_QTY_prior,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ID10010854159,False,15.000000,5.0,15.0,0.000000,0.000000,0.0,0.0,-0.008862,-0.001108,-0.009221,-0.006047
ID10013863216,True,10.000000,90.0,180.0,0.000000,0.000000,0.0,0.0,,,,
ID10024447278,True,50.000000,3.0,20.0,22.500000,20.000000,15.0,40.0,,,,
ID1002482139,True,60.000000,30.0,120.0,90.000000,37.254464,789.0,2895.0,-0.011015,0.008488,-0.002925,-0.006238
ID1003386406,False,20.000000,15.0,60.0,50.000000,50.000000,3.0,20.0,,,,
ID10036289882,False,37.500000,2.0,15.0,0.000000,0.000000,0.0,0.0,,,,
ID10036954381,False,37.500000,2.0,15.0,37.500000,37.500000,2.0,15.0,-0.009564,-0.001091,-0.011219,-0.007729
ID10055739763,True,15.000000,5.0,15.0,30.000000,30.000000,3.0,12.0,-0.008927,-0.001142,-0.009317,-0.006180
ID10074598346,False,20.000000,5.0,20.0,40.000000,25.842105,452.0,1480.0,-0.009202,-0.001098,-0.009828,-0.006526
ID10081072715,False,20.000000,5.0,20.0,0.000000,0.000000,0.0,0.0,0.192232,0.001349,0.010324,0.003655


In [66]:
#do this to just look at PCs

#df = pd.concat([pd.DataFrame(df.iloc[:,0]), df.iloc[:,8:10]], axis = 1)

In [75]:
for col in df.select_dtypes(include=np.number):
    df[col] = df[col].fillna(df[col].median())
    
X = df.iloc[:,1:]
y = df.iloc[:,0].map(lambda x: 1 if x == True else 0 )

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=33)

rf_model = RandomForestClassifier(n_estimators=100, max_depth=10,
                                  random_state=100)

cv_results = cross_validate(rf_model, X_train, y_train, cv=3,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8714024371081931

In [26]:
df

Unnamed: 0_level_0,LTOT_v2,MME_on_day0,SUPPLY_CNT_on_day0,PAYABLE_QTY_on_day0,max_MME_prior,avg_MME_prior,total_SUPPLY_CNT_prior,total_PAYABLE_QTY_prior,PC 1,PC 2,...,PC 41,PC 42,PC 43,PC 44,PC 45,PC 46,PC 47,PC 48,PC 49,PC 50
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID10010854159,False,15.000000,5.0,15.0,0.000000,0.000000,0.0,0.0,-0.008862,-0.001108,...,-0.004087,-2.172048e-03,-0.003111,6.529389e-03,-0.001689,0.001002,0.004276,0.005178,0.010506,0.000314
ID10013863216,True,10.000000,90.0,180.0,0.000000,0.000000,0.0,0.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10024447278,True,50.000000,3.0,20.0,22.500000,20.000000,15.0,40.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID1002482139,True,60.000000,30.0,120.0,90.000000,37.254464,789.0,2895.0,-0.011015,0.008488,...,0.000005,-4.717781e-05,-0.000788,2.667867e-04,0.000136,-0.000408,0.000296,0.000058,-0.000528,-0.000855
ID1003386406,False,20.000000,15.0,60.0,50.000000,50.000000,3.0,20.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10036289882,False,37.500000,2.0,15.0,0.000000,0.000000,0.0,0.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10036954381,False,37.500000,2.0,15.0,37.500000,37.500000,2.0,15.0,-0.009564,-0.001091,...,0.000241,-3.760716e-04,0.000739,-1.394407e-04,0.000259,-0.000897,-0.000636,0.001880,-0.002117,-0.000102
ID10055739763,True,15.000000,5.0,15.0,30.000000,30.000000,3.0,12.0,-0.008927,-0.001142,...,-0.002255,-1.640854e-03,-0.004180,-1.772533e-03,0.004339,0.000562,0.001597,0.007450,-0.018454,-0.014735
ID10074598346,False,20.000000,5.0,20.0,40.000000,25.842105,452.0,1480.0,-0.009202,-0.001098,...,-0.000421,2.571476e-05,0.000161,-6.360881e-05,-0.000134,0.000181,-0.000145,0.000096,0.000562,0.000121
ID10081072715,False,20.000000,5.0,20.0,0.000000,0.000000,0.0,0.0,0.192232,0.001349,...,0.000014,7.029955e-05,0.000029,4.600920e-05,-0.000044,0.000199,-0.000112,0.000171,0.000020,-0.000116


Adding some principal components adds a tiny bit of accuracy, but not worth it at this point (probably within margin of error). I would like to try to retool and do better, but regradless it seems to be working well. 