In [1]:
# Gradient Boosting Regression Algorithm V2
# with target values as triangular fuzzy numbers
# using different fuzzy distances and defuzzification methods
# by Resmiye Nasiboglu and Efendi Nasibov
# September, 2022

from sklearn import datasets
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import pandas as pd
import numpy as np
import random as r
import math
from my_datasets import *
from fuzzy_operations import *

from warnings import filterwarnings
filterwarnings('ignore')

"""
fuzzyGBR - the main function with following parameters:
dataset - the dataset to handle ("iris", "cars", "diabetes", "boston", "penguins", "planets",
        "diamonds", "mpg", "tips", "taxis")
defuz_method - the defuzzification method: 
        "MOM" - mean of maxima, 
        "COG" - center of gravity, 
        "WABL" - weighted average based on levels.
max_left_spread - the maximum left side spread of the simulating target fuzzy numbers,
max_right_spread - the maximum right side spread of the simulating target fuzzy numbers,
distance - the fuzzy distance measures between the FNs A = (a[0],a[1],a[2]) and B = (b[0],b[1],b[2]):
        D1(A, B) = max(abs(a[0]-b[0]),abs(a[1]-b[1]),abs(a[2]-b[2])) 
        D2(A, B) = abs(a[0]-b[0])+max(abs(a[1]-b[1]),abs(a[2]-b[2])) 
        D3(A, B) = abs(defuz(A)-defuz(B)) 
        D4(A, B) = abs(defuz(fuzSubtr(A,B))) 
boost_iterations - the number of boosting iterations,
learning_rate - the learning rate of algorithm,
tree_depth - the depth of the stump trees.
"""

def fuzzyGBR(dataset, defuz_method="MOM", max_left_spread=0.2, max_right_spread=0.2,
             distance="D4", boost_iterations=201, learning_rate=0.1, tree_depth=1):
    """Fit and evaluate gradient boosting on simulated triangular fuzzy targets.

    For every optimism value ``c`` (0.0 ... 1 in steps of 0.1 when
    ``defuz_method == "WABL"``, otherwise only 0.5) the function loads the
    dataset, turns each crisp target ``y`` into a fuzzy number
    ``[y, y * max_left_spread * u1, y * max_right_spread * u2]`` with
    ``u1, u2`` drawn from ``random.random()`` (seeded with 0), fits
    ``boost_iterations - 1`` regression trees on the defuzzified residuals
    and prints, for the train and the test set, the iteration with the
    highest R^2 together with the R^2 and RMSE reached at that iteration.

    Args:
        dataset: one of "iris", "cars", "diabetes", "boston", "penguins",
            "planets", "diamonds", "mpg", "tips", "taxis".
        defuz_method: defuzzification method name forwarded to ``defuz``,
            ``fuzR2`` and ``fuzRMSE`` ("MOM", "COG" or "WABL").
        max_left_spread: upper bound of the relative left spread.
        max_right_spread: upper bound of the relative right spread.
        distance: fuzzy distance label. NOTE: it is only echoed in the
            parameter banner; nothing in this function computes with it.
        boost_iterations: number of boosting stages plus one (stage 0 is
            the fuzzy average of the training targets).
        learning_rate: shrinkage applied to every leaf correction.
        tree_depth: ``max_depth`` of each regression tree.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    # The lambdas defer name resolution: only the selected loader is looked up.
    loaders = {
        "iris": lambda: my_load_iris(),
        "cars": lambda: my_load_car_prices(),
        "diabetes": lambda: my_load_diabetes(),
        "boston": lambda: my_load_boston(),
        "penguins": lambda: my_load_penguins(),
        "planets": lambda: my_load_planets(),
        "diamonds": lambda: my_load_diamonds(),
        "mpg": lambda: my_load_mpg(),
        "tips": lambda: my_load_tips(),
        "taxis": lambda: my_load_taxis(),
    }
    # Fail early and clearly instead of hitting an UnboundLocalError later.
    if dataset not in loaders:
        raise ValueError(
            "Unknown dataset %r; expected one of: %s"
            % (dataset, ", ".join(sorted(loaders)))
        )

    print("===== PARAMETERS =====")
    print("Dataset = ", dataset)
    print("Defuzification method = ", defuz_method)
    print("Fuzzy distance = ", distance)
    print("Left spread = %.1f" % (max_left_spread))
    print("Right spread = %.1f" % (max_right_spread))
    print("Learning rate = %.2f" % (learning_rate))
    print("Stump tree depth = %d" % (tree_depth))

    def simulate_fuzzy(y):
        # Fuzzy number layout: [mode, left width, right width].
        # Indexing by position and drawing left before right keeps the
        # sequence of random draws stable for a given seed.
        return [
            [y[i],
             y[i] * (max_left_spread * r.random()),
             y[i] * (max_right_spread * r.random())]
            for i in range(len(y))
        ]

    def best_stage(X_std, fuz_y, start, ave, trees, gamma, c):
        # Replays the boosting stages on X_std starting from `start` and
        # returns (iteration with the highest R^2, that R^2, RMSE there).
        FM = [start for _ in range(len(X_std))]
        maxR2 = -999999
        minRMSE = -999999
        maxM = -1
        for m, (tree, leaf_gamma) in enumerate(zip(trees, gamma), start=1):
            for k, leaf in enumerate(tree.apply(X_std)):
                # Look the correction up by the tree's own leaf id, so it does
                # not depend on which leaves this particular sample set reaches.
                step = fuzMultBy(leaf_gamma[int(leaf)], learning_rate, c)
                FM[k] = fuzAdd(FM[k], step, c)
            R2 = fuzR2(defuz_method, fuz_y, FM, ave, c)
            RMSE = fuzRMSE(defuz_method, fuz_y, FM, c)
            if R2 > maxR2:
                maxR2 = R2
                minRMSE = RMSE
                maxM = m
        return maxM, maxR2, minRMSE

    def report(label, result):
        maxM, maxR2, minRMSE = result
        print("-- %s: the best R^2 and RMSE values with according iteration number --" % label)
        print("maxM =", maxM)
        print("maxR2 = %10.4f" % (maxR2))
        print("minRMSE = %10.4f" % (minRMSE))

    def c_cycle(c):
        # c is the optimism parameter of the WABL method.
        print("============= Optimizm index = %.1f" % (c))
        X_train, X_test, y_train, y_test = loaders[dataset]()

        r.seed(0)  # reproducible spreads for every value of c

        # In the experiments the following spreads are meant to be used:
        #     left 0.2, right 0.2  (symmetrical case)
        #     left 0.2, right 0.0  (left-skewed case)
        #     left 0.0, right 0.2  (right-skewed case)
        y_fuz_train = simulate_fuzzy(y_train)
        y_fuz_test = simulate_fuzzy(y_test)

        # Scale the inputs to [0, 1] using the training-set range.
        sc = MinMaxScaler()
        X_train_std = sc.fit_transform(X_train)
        X_test_std = sc.transform(X_test)

        # Stage 0: every sample is predicted by the fuzzy average of the
        # training targets.
        f_ave = fuzAve(y_fuz_train, c)
        current = [f_ave for _ in range(len(X_train))]

        trees = []
        gamma = []  # gamma[m - 1] maps leaf id -> fuzzy correction of stage m

        for m in range(1, boost_iterations):
            residuals = [fuzSubtr(y_fuz_train[i], current[i], c)
                         for i in range(len(y_fuz_train))]

            # The tree is grown on the defuzzified (crisp) residuals.
            crisp = [defuz(defuz_method, res, c) for res in residuals]

            tree = DecisionTreeRegressor(random_state=0, max_depth=tree_depth)
            tree.fit(X_train_std, crisp)
            trees.append(tree)

            # Group the training samples by the leaf they fall into.
            members = {}
            for j, leaf in enumerate(tree.apply(X_train_std)):
                members.setdefault(int(leaf), []).append(j)

            leaf_gamma = {}
            for leaf, rows in members.items():
                in_leaf = np.reshape([residuals[j] for j in rows], (-1, 3))
                leaf_gamma[leaf] = fuzAve(in_leaf, c)
                for k in rows:
                    current[k] = fuzAdd(
                        current[k],
                        fuzMultBy(leaf_gamma[leaf], learning_rate, c),
                        c,
                    )
            gamma.append(leaf_gamma)

        # Both evaluations use the training-set fuzzy average as the start
        # value and as the reference average passed to fuzR2.
        report("TRAIN", best_stage(X_train_std, y_fuz_train, f_ave, f_ave,
                                   trees, gamma, c))
        report("TEST", best_stage(X_test_std, y_fuz_test, f_ave, f_ave,
                                  trees, gamma, c))

    if defuz_method == "WABL":
        cycles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    else:
        cycles = [0.5]
    for c in cycles:
        c_cycle(c)

# Run the experiment: "mpg" dataset, MOM defuzzification, symmetric spreads.
experiment = {
    "dataset": "mpg",
    "defuz_method": "MOM",
    "max_left_spread": 0.2,
    "max_right_spread": 0.2,
    "distance": "D1",
    "boost_iterations": 201,
    "learning_rate": 0.1,
    "tree_depth": 1,
}
fuzzyGBR(**experiment)




===== PARAMETERS =====
Dataset =  mpg
Defuzification method =  MOM
Fuzzy distance =  D1
Left spread = 0.2
Right spread = 0.2
Learning rate = 0.10
Stump tree depth = 1
-- TRAIN: the best R^2 and RMSE values with according iteration number --
maxM = 200
maxR2 =     0.9096
minRMSE =     2.3478
-- TEST: the best R^2 and RMSE values with according iteration number --
maxM = 54
maxR2 =     0.8901
minRMSE =     2.5458
