In [3]:
# example of a super learner model for regression
from math import sqrt
from numpy import hstack
from numpy import vstack
from numpy import asarray
import numpy as np
import pandas as pd
import os
import sys
from tqdm.notebook import tqdm
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic
from geopy.point import Point
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [18]:
with open('/content/final_dict.pkl', 'rb') as f:
    final_dict = pickle.load(f)

In [19]:
# create a list of base-models
def get_models():
	models = list()
	models.append(LinearRegression())
	models.append(SVR(gamma='scale'))
	models.append(KNeighborsRegressor())
	models.append(RandomForestRegressor(n_estimators=10))
	models.append(GradientBoostingRegressor())
	return models

In [20]:
# collect out of fold predictions form k-fold cross validation
def get_out_of_fold_predictions(X, y, models):
	meta_X, meta_y = list(), list()
	# define split of data
	kfold = KFold(n_splits=10, shuffle=True)
	# enumerate splits
	for train_ix, test_ix in kfold.split(X):
		fold_yhats = list()
		# get data
		train_X, test_X = X[train_ix], X[test_ix]
		train_y, test_y = y[train_ix], y[test_ix]
		meta_y.extend(test_y)
		# fit and make predictions with each sub-model
		for model in models:
			model.fit(train_X, train_y)
			yhat = model.predict(test_X)
			# store columns
			fold_yhats.append(yhat.reshape(len(yhat),1))
		# store fold yhats as columns
		meta_X.append(hstack(fold_yhats))
	return vstack(meta_X), asarray(meta_y)


In [21]:
# fit all base models on the training dataset
def fit_base_models(X, y, models):
	for model in models:
		model.fit(X, y)

# fit a meta model
def fit_meta_model(X, y):
	model = LinearRegression()
	model.fit(X, y)
	return model

In [22]:
# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
	for model in models:
		yhat = model.predict(X)
		mse = mean_squared_error(y, yhat)
		print('%s: RMSE %.3f' % (model.__class__.__name__, sqrt(mse)))

# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
	meta_X = list()
	for model in models:
		yhat = model.predict(X)
		meta_X.append(yhat.reshape(len(yhat),1))
	meta_X = hstack(meta_X)
	# predict
	return meta_model.predict(meta_X)

In [23]:
from sklearn import model_selection

In [30]:
pm25_name_dict = {0: 'pm25_1211220109', 1: 'pm25_1212220023', 2: 'pm25_1212220165', 3: 'pm25_1212220169', 4: 'pm25_1201230044', 5: 'pm25_1210220027', 6: 'pm25_1212220159', 7: 'PM25_2CF4328C5AB4', 8: 'PM25.1_2CF4328C5BAD', 9: 'PM25.2_3083988F25D6', 10: 'PM25.3_5002914AB52D', 11: 'PM25.4_8CAAB56B9EAF', 12: 'PM25.5_E0980690FF7B', 13: 'PM25.6_F4CFA2D625EE'}

In [31]:
num_rows, num_columns = final_dict[pm25_name_dict[0]]['2023-01-05 13:00:00']['X'].shape
print(f"num_rows = {num_rows}, num_columns = {num_columns}")
X_train, X_test = np.empty((0, num_columns), dtype=np.float32), np.empty((0, num_columns), dtype=np.float32)
y_train, y_test = np.empty((0, 1), dtype=np.float32), np.empty((0, 1), dtype=np.float32)
for key in final_dict[pm25_name_dict[0]].keys():

    Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(final_dict[pm25_name_dict[0]][key]['X'],
                                                                    final_dict[pm25_name_dict[0]][key]['y'],
                                                                    test_size=1/13,
                                                                    shuffle=True,
                                                                    random_state=482)

    X_train = np.concatenate((X_train, Xtrain), axis=0)
    X_test = np.concatenate((X_test, Xtest), axis=0)
    y_train = np.concatenate((y_train, ytrain), axis=0)
    y_test = np.concatenate((y_test, ytest), axis=0)
print(f"X_train.shape = {X_train.shape}, y_train.shape = {y_train.shape}")
print(f"X_test.shape = {X_test.shape}, y_test.shape = {y_test.shape}")

num_rows = 13, num_columns = 6
X_train.shape = (1104, 6), y_train.shape = (1104, 1)
X_test.shape = (92, 6), y_test.shape = (92, 1)


In [32]:
X, X_val, y, y_val = X_train,X_test,y_train,y_test
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)

Train (1104, 6) (1104, 1) Test (92, 6) (92, 1)


In [33]:
# get models
models = get_models()

In [34]:
# get out of fold predictions
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)

  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  model.fit(train_X, train_y)
  y = column_or_1d(y, warn=True)


Meta  (1104, 5) (1104, 1)


In [35]:
# fit base models
fit_base_models(X, y, models)

  y = column_or_1d(y, warn=True)
  model.fit(X, y)
  y = column_or_1d(y, warn=True)


In [36]:
# fit the meta model
meta_model = fit_meta_model(meta_X, meta_y)

In [37]:
# evaluate base models
evaluate_models(X_val, y_val, models)

LinearRegression: RMSE 34.910
SVR: RMSE 37.598
KNeighborsRegressor: RMSE 36.981
RandomForestRegressor: RMSE 22.096
GradientBoostingRegressor: RMSE 19.925


In [38]:
# evaluate meta model
yhat = super_learner_predictions(X_val, models, meta_model)
print('Super Learner: RMSE %.3f' % (sqrt(mean_squared_error(y_val, yhat))))

Super Learner: RMSE 19.466


In [39]:
from sklearn import metrics

In [40]:
metrics.r2_score(y_val,yhat)

0.9046856373486232