In [5]:
import numpy as np
import pandas as pd
import random
import math
from data_loader import load_bilibili_data
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit
import matplotlib.pyplot as plt

def fit_linear_regression(Xmat_train, Y_train, Xmat_val, Y_val):
    # ==================================
    # BASELINE MODEL: LINEAR REGRESSION
    # ==================================
    baseline_model = LinearRegression()
    baseline_model.fit(Xmat_train, Y_train)

    # Obtain the coefficient of determination by calling the model with the score() function, then print the coefficient:
    r_squared_train = baseline_model.score(Xmat_train, Y_train)
    print('R-sqaured on training set:', r_squared_train)

    r_squared_val = baseline_model.score(Xmat_val, Y_val)
    print('R-sqaured on validation set:', r_squared_val)


def fit_polynomial_regression(Xmat_train_and_val, Y_train_and_val, split_index):
    # =====================
    # POLYNOMIAL REGRESSION
    # =====================
    steps = [
        ('poly', PolynomialFeatures()),
        ('model', Ridge()) #Lasso(alpha=0.9, max_iter=10000, fit_intercept=True))
    ]

    pipeline = Pipeline(steps)

    # =================== 1st Grid Search ===================
    degrees = [2, 3]
    alphas = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000]

    param_grid = {
        "poly__degree" : degrees,
        "model__alpha" : alphas,
    }

    # Use the list split_index to create PredefinedSplit
    pds = PredefinedSplit(test_fold = split_index)

    search = GridSearchCV(estimator=pipeline,
                          param_grid=param_grid,
                          scoring="r2",
                          cv=pds,
                          verbose=3)

    search.fit(Xmat_train_and_val, Y_train_and_val)

    df_gridsearch = pd.DataFrame(search.cv_results_)

    scores = search.cv_results_['split0_test_score']

    # replace negatives scores with zero, then transform into the desired shape
    scores[scores < 0] = 0
    scores = np.array(scores)
    scores = np.transpose(np.vstack(np.split(scores, len(alphas))))

    # Plot using Matplotlib
    for i, degree in enumerate(degrees):
        plt.plot(np.log(alphas), scores[i], label='degree: ' + str(degree))

    plt.legend()
    plt.xlabel('log (Alpha)')
    plt.ylabel('R Sqaured')
    plt.show()


    # =================== 2nd Grid Search ===================
    degrees = [2]
    alphas = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

    param_grid = {
        "poly__degree" : degrees,
        "model__alpha" : alphas,
    }

    # Use the list split_index to create PredefinedSplit
    pds = PredefinedSplit(test_fold = split_index)

    search = GridSearchCV(estimator=pipeline,
                          param_grid=param_grid,
                          scoring="r2",
                          cv=pds,
                          verbose=3)

    search.fit(Xmat_train_and_val, Y_train_and_val)

    df_gridsearch = pd.DataFrame(search.cv_results_)

    scores = search.cv_results_['split0_test_score']

    # replace negatives scores with zero, then transform into the desired shape
    scores[scores < 0] = 0
    scores = np.array(scores)
    scores = np.transpose(np.vstack(np.split(scores, len(alphas))))

    # Plot using Matplotlib
    for i, degree in enumerate(degrees):
        plt.plot(alphas, scores[i], label='degree: ' + str(degree))

    plt.legend()
    plt.xlabel('Alpha')
    plt.ylabel('R Sqaured')
    plt.show()


def fit_random_forest(Xmat_train_and_val, Y_train_and_val, split_index):
    # =========================
    # 🌲🌲🌲 RANDOM FOREST 🌲🌲🌲
    # =========================

    # [CV 1/1] END bootstrap=True, max_depth=None, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.582 total time=   1.6s

    # Number of trees in random forest
    n_estimators = [10, 100, 500] # [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

    # Number of features to consider at every split
    max_features = ['sqrt']

    # Maximum number of levels in tree
    max_depth = [10, 100] # [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    # Minimum number of samples required to split a node
    min_samples_split = [2]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1]

    # Method of selecting samples for training each tree
    bootstrap = [True] #[True, False]

    # Create the random grid
    param_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    rf = RandomForestRegressor()

    # Use the list split_index to create PredefinedSplit (predefined Validation set)
    pds = PredefinedSplit(test_fold = split_index)


    search = GridSearchCV(estimator=rf,
                          param_grid=param_grid,
                          scoring="r2",
                          cv=pds,
                          verbose=3,
                          n_jobs=-1)

    search.fit(Xmat_train_and_val, Y_train_and_val)
    
    return search.cv_results_

#     df_gridsearch = pd.DataFrame(search.cv_results_)

#     scores = search.cv_results_['split0_test_score']

    # Random search of parameters, using predefined validation set,
    # search across 100 different combinations, and use all available cores.
    # P.S. n_jobs = -1 means use all processors to run them in parallel
    # rf_random = RandomizedSearchCV(estimator=rf,
    #                                param_distributions=param_grid,
    #                                scoring="r2",
    #                                n_iter=100,
    #                                cv=pds,
    #                                verbose=3,
    #                                random_state=2022,
    #                                n_jobs=-1)
    #
    # rf_random.fit(Xmat_train_and_val, Y_train_and_val)

def main():

    # All of those are pandas objects
    Xmat_train_and_val, Y_train_and_val, Xmat_train, Xmat_val, Xmat_test, Y_train, Y_val, Y_test = load_bilibili_data()

    # Create a list where train data indices are -1 and validation data indices are 0
    split_index = [-1 if x in Xmat_train.index else 0 for x in Xmat_train_and_val.index]

    # fit_linear_regression(Xmat_train, Y_train, Xmat_val, Y_val)
    #
    # fit_polynomial_regression(Xmat_train_and_val, Y_train_and_val, split_index)

    result = fit_random_forest(Xmat_train_and_val, Y_train_and_val, split_index)

    #
    # for i in range(29, 40):
    #     print("max_features = ", i)
    #     forest = RandomForestRegressor(n_estimators=100, max_features=i)
    #     forest.fit(Xmat_train, Y_train)
    #     print('Training score: {}'.format(forest.score(Xmat_train, Y_train)))
    #     print('Validation score: {}'.format(forest.score(Xmat_val, Y_val)))
    #     print("\n")


main()


Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV 1/1] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.359 total time=   2.2s
[CV 1/1] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.236 total time=   0.8s
[CV 1/1] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.361 total time=   6.0s
[CV 1/1] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=0.372 total time=  12.4s


In [7]:
result

NameError: name 'result' is not defined