In [1]:
import sys
import os
sys.path.append(os.getcwd()[:-8])

In [2]:
import pandas as pd
import numpy as np
import time
import warnings
import shap
from ACME.ACME import ACME
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
        
dataframe = pd.DataFrame(X, columns=boston.feature_names)
dataframe['target'] = y

In [5]:
models = {}
models['linear_regression'] = LinearRegression().fit(X,y)
models['random_forest_regressor'] = RandomForestRegressor().fit(X,y)
models['cat_boost_regressor'] = CatBoostRegressor().fit(X,y)
models['svr'] = SVR().fit(X,y) 

Learning rate set to 0.036765
0:	learn: 8.9985824	total: 55.2ms	remaining: 55.1s
1:	learn: 8.8042315	total: 57.1ms	remaining: 28.5s
2:	learn: 8.5941862	total: 58.8ms	remaining: 19.5s
3:	learn: 8.4176957	total: 60.6ms	remaining: 15.1s
4:	learn: 8.2604708	total: 63.1ms	remaining: 12.5s
5:	learn: 8.1151947	total: 64.4ms	remaining: 10.7s
6:	learn: 7.9483809	total: 65.6ms	remaining: 9.31s
7:	learn: 7.7888555	total: 67.2ms	remaining: 8.33s
8:	learn: 7.6364669	total: 68.4ms	remaining: 7.53s
9:	learn: 7.4720103	total: 69.5ms	remaining: 6.88s
10:	learn: 7.3077275	total: 70.4ms	remaining: 6.33s
11:	learn: 7.1638149	total: 71.7ms	remaining: 5.9s
12:	learn: 7.0211266	total: 72.8ms	remaining: 5.53s
13:	learn: 6.8763643	total: 73.9ms	remaining: 5.2s
14:	learn: 6.7567806	total: 75.4ms	remaining: 4.95s
15:	learn: 6.6149514	total: 76.6ms	remaining: 4.71s
16:	learn: 6.4856242	total: 77.7ms	remaining: 4.49s
17:	learn: 6.3663360	total: 78.7ms	remaining: 4.29s
18:	learn: 6.2504182	total: 79.6ms	remaining: 

In [6]:
mse_full = {}
for model in models.keys():
    mse_full[model] = mean_squared_error(models[model].predict(X),y)

In [7]:
acme = {}
for model in models.keys():
    acme[model] = ACME(models[model],'target')
    acme[model] = acme[model].fit(dataframe, robust=True)

In [8]:
k = 5
top_k = {}
last_k = {}
for model in models.keys():
    top_k[model] = acme[model].feature_importance().index.tolist()[0:k]
    last_k[model] = acme[model].feature_importance().index.tolist()[-k:]

In [9]:
top_k

{'linear_regression': ['LSTAT', 'DIS', 'RM', 'RAD', 'PTRATIO'],
 'random_forest_regressor': ['RM', 'LSTAT', 'AGE', 'PTRATIO', 'B'],
 'cat_boost_regressor': ['RM', 'LSTAT', 'AGE', 'DIS', 'PTRATIO'],
 'svr': ['TAX', 'B', 'AGE', 'ZN', 'LSTAT']}

In [10]:
models_top_k = {}
models_top_k['linear_regression'] = LinearRegression().fit(dataframe[top_k['linear_regression']],y)
models_top_k['random_forest_regressor'] = RandomForestRegressor().fit(dataframe[top_k['random_forest_regressor']],y)
models_top_k['cat_boost_regressor'] = CatBoostRegressor().fit(dataframe[top_k['cat_boost_regressor']],y)
models_top_k['svr'] = SVR().fit(dataframe[top_k['svr']],y)

Learning rate set to 0.036765
0:	learn: 8.9970072	total: 987us	remaining: 986ms
1:	learn: 8.7867793	total: 1.85ms	remaining: 923ms
2:	learn: 8.5997204	total: 2.62ms	remaining: 870ms
3:	learn: 8.4168349	total: 3.36ms	remaining: 838ms
4:	learn: 8.2233280	total: 4.13ms	remaining: 823ms
5:	learn: 8.0456962	total: 5.11ms	remaining: 847ms
6:	learn: 7.8689726	total: 5.79ms	remaining: 822ms
7:	learn: 7.6886515	total: 6.49ms	remaining: 805ms
8:	learn: 7.5067089	total: 7.2ms	remaining: 793ms
9:	learn: 7.3314043	total: 7.92ms	remaining: 784ms
10:	learn: 7.1772858	total: 8.63ms	remaining: 776ms
11:	learn: 7.0291649	total: 9.33ms	remaining: 768ms
12:	learn: 6.8859168	total: 9.97ms	remaining: 757ms
13:	learn: 6.7614525	total: 10.8ms	remaining: 761ms
14:	learn: 6.6237307	total: 11.5ms	remaining: 755ms
15:	learn: 6.5049482	total: 12.1ms	remaining: 746ms
16:	learn: 6.3827838	total: 13.2ms	remaining: 761ms
17:	learn: 6.2542276	total: 14ms	remaining: 765ms
18:	learn: 6.1372373	total: 14.8ms	remaining: 76

In [11]:
mse_top_k = {}
for model in models.keys():
    mse_top_k[model] = mean_squared_error(models_top_k[model].predict(dataframe[top_k[model]]),y)

In [12]:
mse_full

{'linear_regression': 21.894831181729202,
 'random_forest_regressor': 1.4955456047430824,
 'cat_boost_regressor': 0.3362474873976234,
 'svr': 66.81823779202165}

In [13]:
mse_top_k

{'linear_regression': 26.099816903706003,
 'random_forest_regressor': 2.0980090197628467,
 'cat_boost_regressor': 0.8634103986654711,
 'svr': 66.50031825228734}

In [26]:
models_last_k = {}
models_last_k['linear_regression'] = LinearRegression().fit(dataframe.drop(columns = top_k['linear_regression'] + ['target']),y)
models_last_k['random_forest_regressor'] = RandomForestRegressor().fit(dataframe.drop(columns = top_k['random_forest_regressor']+ ['target']),y)
models_last_k['cat_boost_regressor'] = CatBoostRegressor().fit(dataframe.drop(columns = top_k['cat_boost_regressor']+ ['target']),y)
models_last_k['svr'] = SVR().fit(dataframe.drop(columns = top_k['svr']+ ['target']),y)

Learning rate set to 0.036765
0:	learn: 9.0462505	total: 884us	remaining: 884ms
1:	learn: 8.9109252	total: 1.52ms	remaining: 760ms
2:	learn: 8.7954552	total: 2.19ms	remaining: 730ms
3:	learn: 8.6857806	total: 2.64ms	remaining: 658ms
4:	learn: 8.5770326	total: 3.25ms	remaining: 647ms
5:	learn: 8.4495776	total: 4.06ms	remaining: 672ms
6:	learn: 8.3587197	total: 4.81ms	remaining: 683ms
7:	learn: 8.2655263	total: 5.3ms	remaining: 657ms
8:	learn: 8.1621531	total: 6.07ms	remaining: 668ms
9:	learn: 8.0681773	total: 6.75ms	remaining: 669ms
10:	learn: 7.9752753	total: 7.48ms	remaining: 673ms
11:	learn: 7.8859894	total: 8.13ms	remaining: 670ms
12:	learn: 7.8143161	total: 9.16ms	remaining: 696ms
13:	learn: 7.7445747	total: 9.98ms	remaining: 703ms
14:	learn: 7.6513356	total: 10.7ms	remaining: 702ms
15:	learn: 7.5752428	total: 11.5ms	remaining: 707ms
16:	learn: 7.5080203	total: 12.3ms	remaining: 710ms
17:	learn: 7.4313967	total: 12.9ms	remaining: 704ms
18:	learn: 7.3537931	total: 13.7ms	remaining: 

In [27]:
mse_last_k = {}
for model in models.keys():
    mse_last_k[model] = mean_squared_error(models_last_k[model].predict(dataframe.drop(columns = top_k[model]+ ['target'])),y)

In [28]:
mse_last_k

{'linear_regression': 55.155023598342325,
 'random_forest_regressor': 4.4194446976284505,
 'cat_boost_regressor': 3.802887869326224,
 'svr': 55.163510150619736}