In [1]:
import pandas as pd
import numpy as np

## Read in response variable:

In [2]:
response_variable = pd.read_csv('response_variable_v2.csv')
response_variable = response_variable.iloc[:,[0,2]]

## Matthew's features:

In [3]:
matthew_frame = pd.read_csv('features_matthew_v2.csv')

## Chris's features:

In [106]:
chris_frame_pc = pd.read_csv('pc_diagnosis.csv')

# Confirm that Matthew's model is running still:

In [17]:
df = response_variable.merge(matthew_frame, on='id')
df = df.set_index('id')

# Training/test split

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X = df.iloc[:,1:]
X = X[['SUPPLY_CNT_on_day0', 'PAYABLE_QTY_on_day0', 'total_SUPPLY_CNT_prior', 'total_PAYABLE_QTY_prior']]
y = df.iloc[:,0].map(lambda x: 1 if x == True else 0 )

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=33)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

neigh = KNeighborsClassifier(n_neighbors=135)

cv_results = cross_validate(neigh, X_train, y_train, cv=10,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8665545465491604

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, max_depth=7,
                                  random_state=100)

cv_results = cross_validate(rf_model, X_train, y_train, cv=10,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8693006222087076

# Let's see if adding some principal components helps at all:

In [107]:
df = response_variable.merge(matthew_frame, on='id')
df = pd.merge(df, chris_frame_pc, on = 'id', how = 'left')
df = df.set_index('id')
df

Unnamed: 0_level_0,LTOT_v2,MME_on_day0,SUPPLY_CNT_on_day0,PAYABLE_QTY_on_day0,max_MME_prior,avg_MME_prior,total_SUPPLY_CNT_prior,total_PAYABLE_QTY_prior,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9,PC 10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ID10010854159,False,15.000000,5.0,15.0,0.000000,0.000000,0.0,0.0,0.008840,0.006021,0.001393,0.003137,-0.000543,-0.000012,-0.002715,0.000189,-0.002999,0.000369
ID10013863216,True,10.000000,90.0,180.0,0.000000,0.000000,0.0,0.0,0.004783,-0.004110,0.009211,0.006220,-0.000302,-0.001218,-0.004039,0.003508,-0.007012,0.007118
ID10024447278,True,50.000000,3.0,20.0,22.500000,20.000000,15.0,40.0,0.007123,0.018349,-0.002873,0.001940,-0.000999,0.001292,0.000691,-0.000391,-0.000094,-0.002040
ID1002482139,True,60.000000,30.0,120.0,90.000000,37.254464,789.0,2895.0,-0.003584,-0.004210,0.000235,-0.003246,-0.001197,0.002330,0.000364,-0.001483,0.000771,-0.001723
ID1003386406,False,20.000000,15.0,60.0,50.000000,50.000000,3.0,20.0,0.001259,-0.000543,-0.000644,-0.003560,-0.000509,-0.003400,-0.002201,-0.008570,0.005852,0.004095
ID10036289882,False,37.500000,2.0,15.0,0.000000,0.000000,0.0,0.0,0.003902,-0.002276,-0.000546,-0.004279,-0.000953,-0.002704,-0.002725,-0.001655,-0.000988,-0.005930
ID10036954381,False,37.500000,2.0,15.0,37.500000,37.500000,2.0,15.0,0.000976,0.008584,-0.000164,0.000007,-0.000959,-0.001393,-0.002775,-0.001066,-0.001588,0.001001
ID10055739763,True,15.000000,5.0,15.0,30.000000,30.000000,3.0,12.0,0.004713,-0.002113,-0.001403,-0.003611,-0.000780,-0.000573,-0.000172,-0.001857,-0.000591,-0.003302
ID10074598346,False,20.000000,5.0,20.0,40.000000,25.842105,452.0,1480.0,-0.004916,0.011452,0.001970,0.003042,-0.001399,-0.001734,0.001961,0.003397,-0.001111,0.002531
ID10081072715,False,20.000000,5.0,20.0,0.000000,0.000000,0.0,0.0,0.013586,-0.001153,0.001005,-0.001205,-0.001093,0.003262,0.001149,-0.001513,-0.003435,0.000367


In [17]:
#do this to just look at PCs

#df = pd.concat([pd.DataFrame(df.iloc[:,0]), df.iloc[:,8:]], axis = 1)

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate


for col in df.select_dtypes(include=np.number):
    df[col] = df[col].fillna(df[col].median())
    
X = df.iloc[:,1:]
y = df.iloc[:,0].map(lambda x: 1 if x == True else 0 )

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=33)

rf_model = RandomForestClassifier(n_estimators=100, max_depth=10,
                                  random_state=100)

cv_results = cross_validate(rf_model, X_train, y_train, cv=3,
                            scoring= 'roc_auc',
                            return_train_score=True,
                           return_estimator =True)

np.mean(cv_results['test_score'])

0.8887973357319533

In [26]:
df

Unnamed: 0_level_0,LTOT_v2,MME_on_day0,SUPPLY_CNT_on_day0,PAYABLE_QTY_on_day0,max_MME_prior,avg_MME_prior,total_SUPPLY_CNT_prior,total_PAYABLE_QTY_prior,PC 1,PC 2,...,PC 41,PC 42,PC 43,PC 44,PC 45,PC 46,PC 47,PC 48,PC 49,PC 50
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID10010854159,False,15.000000,5.0,15.0,0.000000,0.000000,0.0,0.0,-0.008862,-0.001108,...,-0.004087,-2.172048e-03,-0.003111,6.529389e-03,-0.001689,0.001002,0.004276,0.005178,0.010506,0.000314
ID10013863216,True,10.000000,90.0,180.0,0.000000,0.000000,0.0,0.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10024447278,True,50.000000,3.0,20.0,22.500000,20.000000,15.0,40.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID1002482139,True,60.000000,30.0,120.0,90.000000,37.254464,789.0,2895.0,-0.011015,0.008488,...,0.000005,-4.717781e-05,-0.000788,2.667867e-04,0.000136,-0.000408,0.000296,0.000058,-0.000528,-0.000855
ID1003386406,False,20.000000,15.0,60.0,50.000000,50.000000,3.0,20.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10036289882,False,37.500000,2.0,15.0,0.000000,0.000000,0.0,0.0,-0.009147,-0.001073,...,-0.000134,-2.228937e-05,-0.000156,-3.395841e-07,0.000093,-0.000079,0.000096,0.000100,-0.000168,-0.000116
ID10036954381,False,37.500000,2.0,15.0,37.500000,37.500000,2.0,15.0,-0.009564,-0.001091,...,0.000241,-3.760716e-04,0.000739,-1.394407e-04,0.000259,-0.000897,-0.000636,0.001880,-0.002117,-0.000102
ID10055739763,True,15.000000,5.0,15.0,30.000000,30.000000,3.0,12.0,-0.008927,-0.001142,...,-0.002255,-1.640854e-03,-0.004180,-1.772533e-03,0.004339,0.000562,0.001597,0.007450,-0.018454,-0.014735
ID10074598346,False,20.000000,5.0,20.0,40.000000,25.842105,452.0,1480.0,-0.009202,-0.001098,...,-0.000421,2.571476e-05,0.000161,-6.360881e-05,-0.000134,0.000181,-0.000145,0.000096,0.000562,0.000121
ID10081072715,False,20.000000,5.0,20.0,0.000000,0.000000,0.0,0.0,0.192232,0.001349,...,0.000014,7.029955e-05,0.000029,4.600920e-05,-0.000044,0.000199,-0.000112,0.000171,0.000020,-0.000116


Adding some principal components adds a tiny bit of accuracy, but not worth it at this point (probably within margin of error). I would like to try to retool and do better, but regradless it seems to be working well. 