# ML regression template

- metric: RMSE

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#ensure that plots are displayed inside the notebook
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor

## <u>Preprocessing</u>

In [None]:
def preprocess(filename, target, random_s, proportion):
    """Load a CSV file, one-hot encode it and split it into train and test sets.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read with ``pd.read_csv``.
    target : str
        Name of the column to predict.
    random_s : int
        Seed forwarded as ``random_state``; controls how the rows are split.
    proportion : float
        Share of the rows sampled as test set (forwarded as ``test_size``).

    Returns
    -------
    list
        The result of ``train_test_split``; callers unpack it as
        ``X_train, X_test, y_train, y_test``.

    Raises
    ------
    ValueError
        If ``target`` is not a column of the encoded data, i.e. it is missing
        from the file or it was expanded into dummy columns by the encoding.
    """
    # one-hot encode the categorical variables into dummy variables
    # (remove identifier-like features such as name or id from the file first,
    # otherwise they are encoded as well)
    df = pd.get_dummies(pd.read_csv(filename))

    # fail with an explicit message instead of the opaque
    # "list.remove(x): x not in list" raised when the target is absent
    if target not in df.columns:
        raise ValueError(
            f"target column {target!r} not found after one-hot encoding: "
            "it is either missing from the file or non-numeric "
            "(and was therefore expanded into dummy columns)"
        )

    # NOTE: the data is deliberately not standardized here. If scaling is
    # needed, fit the StandardScaler on the training split only so that no
    # information from the test set leaks into training.

    # set the target and explanatory variables
    features = [column for column in df.columns if column != target]
    y = df[target]
    X = df[features]

    # split the data in train and test set
    return train_test_split(X, y, test_size=proportion, random_state=random_s)

In [None]:
file = 'data.csv'
tar = 'target'
ran_state = 1
prop = 0.3

## <u>Prediction models</u>

### 1.1) Linear Regression 

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# creates the Linear Regression model
lm = LinearRegression()

In [None]:
# fit the train data to the model and print the R^2 of train and test data
lm.fit(X_train,y_train)
print('Train score (R^2):', lm.score(X_train, y_train))
print('Test score (R^2):', lm.score(X_test, y_test))

In [None]:
# compute RMSE
y_pred = lm.predict(X_test) 
rmse1 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse1)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Linear Regression");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Linear Regression')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(features, lm.coef_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = lm.predict(d)
print("The predictions are: ", predicted_values)

### 1.2) Linear Regression with Backward Elimination (feature selection)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# select the best features with Backward Elimination:
# repeatedly fit an OLS model on the remaining features and drop the one with
# the highest p-value, until every remaining p-value is at most 0.05
cols = list(X_train.columns)
while cols:
    X_1 = sm.add_constant(X_train[cols])  # add the constant (intercept) column
    model = sm.OLS(y_train, X_1).fit()  # fit the linear model
    # re-index on cols so that the p-value of the constant is left out
    p = pd.Series(model.pvalues, index=cols)
    feature_with_p_max = p.idxmax()
    pmax = p.max()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
# print(selected_features_BE)

In [None]:
# creates the Linear Regression model
lm2 = LinearRegression()

In [None]:
# fit the train data to the model and print the R^2 of train and test data
lm2.fit(X_train[selected_features_BE],y_train)
print('Train score (R^2):', lm2.score(X_train[selected_features_BE], y_train))
print('Test score (R^2):', lm2.score(X_test[selected_features_BE], y_test))

In [None]:
# compute RMSE
y_pred = lm2.predict(X_test[selected_features_BE]) 
rmse2 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse2)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Linear Regression with Backward Elimination");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Linear Regression with Backward Elimination')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(selected_features_BE, lm2.coef_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe restricted to the Backward Elimination selected features;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test[selected_features_BE].tail(3)
# display(d)
predicted_values = lm2.predict(d)
print("The predictions are: ", predicted_values)

### 2.1) Random Forest

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the Random Forest model with 1000 trees in the forest
rf = RandomForestRegressor(n_estimators = 1000) 

In [None]:
# fit the train data to the model and print the R^2 of train and test data
rf.fit(X_train, y_train)
print('Train score (R^2):', rf.score(X_train, y_train))
print('Test score (R^2):', rf.score(X_test, y_test))

In [None]:
# compute RMSE
y_pred = rf.predict(X_test) 
rmse3 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse3)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Random Forest");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Random Forest')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(features, rf.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = rf.predict(d)
print("The predictions are: ", predicted_values)

### 2.2) Random forest with Random Search (hyperparameters tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# tuning hyperparameters by randomly sampling from given parameters

# define values for hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 200)]
# 1.0 = consider all the features at each split. This is what 'auto' meant for
# a regressor, but 'auto' is deprecated since scikit-learn 1.1 and rejected
# from 1.3 onwards, whereas 1.0 is accepted by old and new versions alike
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None = no limit on the depth of the trees
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# create the Random Forest model
rf2 = RandomForestRegressor()
# Random Search of parameters using 5-fold Cross Validation:
# 10 sampled combinations, scored by (negated) RMSE, run on all available cores
rf_random = RandomizedSearchCV(estimator = rf2, param_distributions = random_grid, scoring='neg_root_mean_squared_error', n_iter = 10, cv = 5, random_state = 42, n_jobs = -1)
# fit the train data to the model
rf_random.fit(X_train, y_train)

In [None]:
# rebuild the Random Forest from the hyperparameters selected by the Random Search
# (best_params_ holds exactly the keys of the random grid, so it can be unpacked as-is)
d = rf_random.best_params_
rf2 = RandomForestRegressor(**d)

In [None]:
# fit the train data to the model and print the R^2 of train and test data
rf2.fit(X_train,y_train)
print('Train score (R^2):', rf2.score(X_train, y_train))
print('Test score (R^2):', rf2.score(X_test, y_test))

In [None]:
# compute RMSE
y_pred = rf2.predict(X_test) 
rmse4 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse4)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Random Forest with Random Search");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Random Forest with Random Search')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(features, rf2.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = rf2.predict(d)
print("The predictions are: ", predicted_values)

### 3.1) Gradient Boosting

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# create the Gradient Boosting model with the following hyperparameters 
params = {
    'learning_rate': 0.05,
    "num_leaves": 1000,  
    "n_estimators": 1000
}
gbm = lgb.LGBMRegressor(**params)

In [None]:
# fit the train data to the model and print the R^2 of train and test data
gbm.fit(X_train, y_train);
print('Train score (R^2):', gbm.score(X_train, y_train))
print('Test score (R^2):', gbm.score(X_test, y_test))

In [None]:
# compute RMSE on the test set
# no iteration count is passed: the keyword documented for predict() is
# `num_iteration` (singular), and leaving it unset uses the fitted iterations,
# which matches how the tuned Gradient Boosting model is evaluated further down
y_pred = gbm.predict(X_test)
rmse5 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse5)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Gradient Boosting");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Gradient Boosting')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(features, gbm.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = gbm.predict(d)
print("The predictions are: ", predicted_values)

### 3.2) Gradient Boosting with Random Search (hyperparameters tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# hyperparameter tuning: evaluate randomly sampled combinations of the values below

# candidate values for each hyperparameter
learning_rate = list(np.linspace(0.01, 1, num=100))
num_leaves = list(map(int, np.linspace(start=200, stop=2000, num=200)))
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=200)))
max_depth = list(map(int, np.linspace(10, 110, num=100))) + [None]

# search space handed to the Random Search
random_grid = dict(
    n_estimators=n_estimators,
    num_leaves=num_leaves,
    max_depth=max_depth,
    learning_rate=learning_rate,
)

# base Gradient Boosting model, left at its default settings
gb = lgb.LGBMRegressor()
# Random Search: 10 sampled combinations, 5-fold Cross Validation, scored by (negated) RMSE
gb_random = RandomizedSearchCV(
    estimator=gb,
    param_distributions=random_grid,
    scoring='neg_root_mean_squared_error',
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
# run the search on the train data (trailing ';' hides the cell output)
gb_random.fit(X_train, y_train);

In [None]:
# rebuild the Gradient Boosting model from the hyperparameters selected by the Random Search
# (best_params_ holds exactly the keys of the random grid, so it can be unpacked as-is)
d = gb_random.best_params_
gbm2 = lgb.LGBMRegressor(**d)

In [None]:
# fit the train data to the model and print the R^2 of train and test data
gbm2.fit(X_train,y_train);
print('train score (R^2):', gbm2.score(X_train, y_train))
print('test score (R^2):', gbm2.score(X_test, y_test))

In [None]:
# compute RMSE
y_pred = gbm2.predict(X_test)
rmse6 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse6)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Gradient Boosting with Random Search");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Gradient Boosting with Random Search')
plt.show()

In [None]:
# compute feature importance
for feature, importance in zip(features, gbm2.feature_importances_):
    print('Feature: ', feature, '\t','Importance: ', importance)

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = gbm2.predict(d)
print("The predictions are: ", predicted_values)

### 4.1) Neural Network 

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
from numpy.random import seed
# seed NumPy's global random generator
# NOTE(review): this seeds NumPy only; whether it also makes the Keras weight
# initialisation reproducible depends on the backend - confirm
seed(1)

# create the Neural Network: a stack of fully connected layers
ann = Sequential()
# add the input layer and the first hidden layer
# (input width = number of explanatory features, 32 ReLU units)
ann.add(Dense(32, activation = 'relu', input_dim = len(features)))
# add the second hidden layer (32 ReLU units)
ann.add(Dense(units = 32, activation = 'relu'))
# add the third hidden layer (32 ReLU units)
ann.add(Dense(units = 32, activation = 'relu'))
# add the output layer: a single unit holding the predicted value
ann.add(Dense(units = 1))
# compile the ANN with the Adam optimizer, minimising the mean squared error
ann.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
# fit the train data to the model
ann.fit(X_train, y_train, batch_size = 10, epochs = 100);

In [None]:
# compute RMSE
y_pred = ann.predict(X_test);
rmse7 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse7)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Neural Network");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Neural Network')
plt.show()

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = ann.predict(d)
print("The predictions are: ", predicted_values)

### 4.2) Neural Network with Grid Search (hyperparameters tuning)

In [None]:
X_train, X_test, y_train, y_test = preprocess(file, tar, ran_state, prop)
features = list(X_train.columns)

In [None]:
# tuning hyperparameters by picking all combinations of the given parameters

def create_model():
    """Build and compile the network evaluated by the Grid Search.

    The network has three hidden layers of 32 ReLU units followed by a
    one-unit output layer; its input width is read from the notebook-level
    ``features`` list. It is compiled with the Adam optimizer and the mean
    squared error loss.
    """
    layers = [
        Dense(32, input_dim=len(features), activation='relu'),
        Dense(units=32, activation='relu'),
        Dense(units=32, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# define values for hyperparameters
batch_size = [10, 20, 40, 60]
epochs = [10, 50, 100]
# create the grid
param_grid = dict(batch_size=batch_size, epochs=epochs)

# create the Neural Network
nn = KerasRegressor(build_fn=create_model)
# Grid Search of parameters using 5-fold Cross Validation
nn_grid = GridSearchCV(estimator=nn, param_grid=param_grid, n_jobs=-1, cv=5, scoring='neg_root_mean_squared_error')
# fit the train data to the model
grid_result = nn_grid.fit(X_train, y_train)

In [None]:
# create the Neural Network model with the best hyperparameters after Grid Search
d = grid_result.best_params_
# build the network with the same helper the Grid Search evaluated, so that the
# architecture of the refitted model cannot drift from the one that was tuned
ann2 = create_model()

In [None]:
# fit the train data to the model
ann2.fit(X_train, y_train, batch_size = d['batch_size'], epochs =d['epochs']);

In [None]:
# compute RMSE
y_pred = ann2.predict(X_test);
rmse8 = math.sqrt(mean_squared_error(y_test, y_pred))
print('Root mean squared error:', rmse8)

In [None]:
# plot the real values against the predicted values on the test set in a scatter plot
plt.scatter(y_test, y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Neural Network with Grid Search");

In [None]:
# plot the real values against the predicted values on the test set in a line plot
plt.figure(figsize=(12,8))
plt.plot(np.array(y_test), color='red')
plt.plot(y_pred, color='blue')
plt.xlabel("Observations")
plt.ylabel("Predicted value")
plt.legend(labels = ['real','predicted'])
plt.title('Neural Network with Grid Search')
plt.show()

In [None]:
# use the model to make prediction
# `d` must be a dataframe with the explanatory features the model was fitted on;
# the last three test rows serve as a runnable example - replace them with your own data
d = X_test.tail(3)
# display(d)
predicted_values = ann2.predict(d)
print("The predictions are: ", predicted_values)

## <u>Results</u>

In [None]:
# test-set RMSE of every model, grouped by model family (one blank line between families)
rmse_by_family = [
    [('Linear Regression:                               ', rmse1),
     ('Linear Regression with Backward Elimination:     ', rmse2)],
    [('Random Forest:                                   ', rmse3),
     ('Random Forest with Random Search:                ', rmse4)],
    [('Gradient Boosting:                               ', rmse5),
     ('Gradient Boosting with Random Search:            ', rmse6)],
    [('Neural Network:                                  ', rmse7),
     ('Neural Network with Grid Search:                 ', rmse8)],
]
for position, family in enumerate(rmse_by_family):
    if position:
        print()
    for label, value in family:
        print(label, value)