In [4]:
# Fuzzy Gradient Boosting Regression Algorithm
# using different fuzzy distances
# by Nasiboglu R.,  Nasibov E.
# February 2022

from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import random as r
import pandas as pd
import math
from sklearn import preprocessing
    

# Center of Area defuzzification for Fuzzy Number
def COA(A):
    """Return the Center of Area defuzzification of fuzzy number A.

    A is indexed as (mode, left width, right width). The result is the
    weighted mean of the mode and the two support end points, with the
    shape convexity parameter fixed to 1.
    """
    shape = 1  # shape convexity parameter
    mode, left_width, right_width = A[0], A[1], A[2]
    lower_end = mode - left_width
    upper_end = mode + right_width
    return (mode + shape * (lower_end + upper_end)) / (2 * shape + 1)

# WABL defuzzification for Fuzzy Number
def WABL(A):
    """Return the WABL defuzzification of fuzzy number A.

    A is indexed as (mode, left width, right width). The optimism, shape
    convexity and level-importance parameters are fixed inside the function.
    """
    optimism = 0.5  # optimism parameter
    shape = 1       # shape convexity parameter
    speed = 0       # increasing speed of level importances
    level_weight = (speed + 1) / (speed + shape + 1)

    mode, left_width, right_width = A[0], A[1], A[2]
    right_part = (mode + right_width) - level_weight * right_width
    left_part = (mode - left_width) + level_weight * left_width
    return optimism * right_part + (1 - optimism) * left_part

# fuzzy average of fuzzy numbers
def fuzAve(a):
    """Return the fuzzy average of the fuzzy numbers in a.

    The numbers are accumulated with fuzAdd starting from [0,0,0]; only the
    accumulated mode is divided by the number of items, the accumulated
    widths are returned as they are.
    """
    total = [0, 0, 0]
    for item in a:
        total = fuzAdd(item, total)
    count = len(a)
    return [total[0] / count, total[1], total[2]]

# distance between fuzzy numbers
def fuzDist(a,b):
    """Return the distance between fuzzy numbers a and b.

    Both arguments are indexed as (mode, left width, right width). The
    active definition is the D1 distance: the largest absolute
    component-wise difference.
    """
    mode_gap = abs(a[0] - b[0])
    left_gap = abs(a[1] - b[1])
    right_gap = abs(a[2] - b[2])

    # Alternative distances (replace the return expression to select one):
    #   D2=D3: mode_gap + max(left_gap, right_gap)
    #   D4:    abs(WABL(fuzSubtr(a,b)))
    return max(mode_gap, left_gap, right_gap)  # D1 distance

# Calculating of RMSE value according to fuzzy numbers
def fuzRMSE(a,b):
    """Return the root mean squared fuzDist between paired items of a and b.

    Items are paired by position over the length of a.
    """
    count = len(a)
    squared_total = sum(fuzDist(a[i], b[i]) ** 2 for i in range(count))
    return math.sqrt(squared_total / count)

# Calculating of MAE value according to fuzzy numbers
def fuzMAE(a,b):
    """Return the mean fuzDist between paired items of a and b.

    Items are paired by position over the length of a.
    """
    count = len(a)
    distance_total = sum(fuzDist(a[i], b[i]) for i in range(count))
    return distance_total / count

# Calculating of R-squared value according to given average of fuzzy numbers
def fuzR2(a,b,ave):
    """Return the R-squared of predictions b against targets a.

    The residual sum uses fuzDist(a[i], b[i]) and the total sum uses
    fuzDist(a[i], ave), where ave is a single fuzzy number supplied by the
    caller; both distances are squared before summing.
    """
    positions = range(len(a))
    residual_sum = sum(fuzDist(a[i], b[i]) ** 2 for i in positions)
    total_sum = sum(fuzDist(a[i], ave) ** 2 for i in positions)
    return 1 - (residual_sum / total_sum)

# fuzzy Subtraction A-B of fuzzy numbers
def fuzSubtr(a,b):
    """Return the fuzzy difference a-b as [mode, left width, right width].

    The modes are subtracted; each width of the result is the larger of the
    two corresponding input widths.
    """
    mode = a[0] - b[0]
    left_width = max(a[1], b[1])
    right_width = max(a[2], b[2])
    # Alternative width rule: left = a[1]+b[2], right = a[2]+b[1]
    return [mode, left_width, right_width]
             
# fuzzy Addition A+B of fuzzy numbers
def fuzAdd(a,b):
    """Return the fuzzy sum a+b as [mode, left width, right width].

    The modes are added; each width of the result is the larger of the two
    corresponding input widths.
    """
    mode = a[0] + b[0]
    left_width = max(a[1], b[1])
    right_width = max(a[2], b[2])
    # Alternative width rule: left = a[1]+b[1], right = a[2]+b[2]
    return [mode, left_width, right_width]

# Multiplication of fuzzy number A by scalar b
def fuzMultBy(a,b):
    """Return fuzzy number a with its mode multiplied by scalar b.

    The left and right widths are passed through unchanged.
    """
    scaled_mode = b * a[0]
    return [scaled_mode, a[1], a[2]]

"""
# Load the Diabetes Dataset
data=datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42, test_size=0.1)
"""

# Load the Car Prices Dataset
# Columns 0-4 are one-hot encoded, columns 5-6 are used as they are and
# column 7 is the regression target.
df = pd.read_csv("cars1.csv",sep=";")
data=df.to_numpy()
X=np.array([row[0:7] for row in data]).tolist()
Y=np.array([row[7] for row in data]).tolist()

# The encoder is left at its default (sparse) output and densified with
# toarray(): the old `sparse=False` keyword was removed from recent
# scikit-learn releases, while this form works on old and new versions.
le = preprocessing.OneHotEncoder()
X0=[row[:5] for row in X]
X1=le.fit_transform(X0).toarray()
X3=[row[5:7] for row in X]
X=np.c_[X1,X3]   # encoded columns followed by the two untouched columns
Y=np.reshape(Y,(-1,1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.1)

print(len(X_train),len(X_test))
# Fixed seed so the randomly drawn fuzzy widths are reproducible
r.seed(0)

# fuzzy number A=(mode,l_width,r_width)
max_width=0.1  # max width of fuzzyness

# Each r.random() call yields a single scalar, so within one set every
# sample gets the same left-width ratio and the same right-width ratio.
# Draw order: train left, train right, test left, test right.
train_left_ratio=max_width*r.random()
train_right_ratio=max_width*r.random()
y_fuz_train = np.c_[y_train,y_train*train_left_ratio,y_train*train_right_ratio]

test_left_ratio=max_width*r.random()
test_right_ratio=max_width*r.random()
y_fuz_test = np.c_[y_test,y_test*test_left_ratio,y_test*test_right_ratio]

# Min-max scale the inputs; the scaler is fitted on the training part only

sc = MinMaxScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# -------- Parameters ----------
M=201                   # number of stages; stage 0 is the constant initial model
learning_rate=0.1       # factor applied to every leaf correction
tree_depth=1            # depth 1 gives decision stumps
max_leaf=2**tree_depth  # upper bound on the leaf count of one tree

n_train=len(X_train)

# Initial model: fuzzy average of the training targets. It is computed once
# and copied per sample instead of being recomputed for every sample.
f0=fuzAve(y_fuz_train)

# F[m][i] is the fuzzy prediction for training sample i after stage m
F=[[[0,0,0] for _ in range(n_train)] for _ in range(M)]
F[0]=[list(f0) for _ in range(n_train)]

# gamma[m] maps a leaf id of the stage-m tree (as returned by tree.apply)
# to the fuzzy correction of that leaf. Keying by leaf id, not by the
# position of the leaf among the leaves present in a particular data set,
# keeps the lookup correct when an evaluation set does not reach every leaf.
gamma=[{} for _ in range(M)]
trees=[]   # trees[m-1] is the tree fitted at stage m


def _samples_by_leaf(leaf_ids):
    """Group sample positions by the leaf id each sample was assigned to."""
    groups={}
    for pos,leaf in enumerate(leaf_ids):
        groups.setdefault(int(leaf),[]).append(pos)
    return groups


def _staged_scores(X_eval,fuzY,print_iteration):
    """Replay all boosting stages on X_eval and report the best R^2.

    Starts every sample at the training average f0, adds the stored leaf
    corrections stage by stage, prints fuzRMSE every 10th stage (prefixed by
    the stage number when print_iteration is true) and returns the pair
    (stage with the highest fuzR2, that fuzR2 value).
    """
    FM=[list(f0) for _ in range(len(X_eval))]
    best_r2=-999999
    best_m=-1
    for m in range(1,M):
        leaf_ids=trees[m-1].apply(X_eval)
        # one scaled correction per leaf, shared by all samples in that leaf
        steps={leaf:fuzMultBy(g,learning_rate) for leaf,g in gamma[m].items()}
        for k,leaf in enumerate(leaf_ids):
            FM[k]=fuzAdd(FM[k],steps[int(leaf)])

        R2=fuzR2(fuzY,FM,f0)   # f0 is the average of the train set
        if R2>best_r2:
            best_r2=R2
            best_m=m
        if m%10==0:
            if print_iteration:
                print(m,fuzRMSE(fuzY,FM))
            else:
                print(fuzRMSE(fuzY,FM))
    return best_m,best_r2


# boosting iterations
for m in range(1,M):
    # fuzzy residuals of the previous stage
    rrr=[fuzSubtr(y_fuz_train[i],F[m-1][i]) for i in range(n_train)]

    # the tree is fitted to the residual centers
    # (alternative: fit to WABL(res) for each res in rrr)
    r1=[res[0] for res in rrr]

    tree = DecisionTreeRegressor(random_state=0,max_depth=tree_depth)
    tree.fit(X_train_std, r1)
    trees.append(tree)

    # leaf id of every training sample, grouped per leaf actually reached
    for leaf,members in _samples_by_leaf(tree.apply(X_train_std)).items():
        ss1=np.reshape([rrr[j] for j in members],(-1,3))
        gamma[m][leaf]=fuzAve(ss1)   # fuzzy correction of this leaf
        step=fuzMultBy(gamma[m][leaf],learning_rate)
        for k in members:
            F[m][k]=fuzAdd(F[m-1][k],step)

# prediction
print("----------- Train set fuzRMSE -------------------")
maxM,maxR2=_staged_scores(X_train_std,y_fuz_train,True)

print("-- the best R^2 value and according iteration number --")
print("maxM =",maxM)
print("maxR2 =",maxR2)

print("----------- Test set fuzRMSE -------------------")
# the test predictions also start from the average of the train set
maxM,maxR2=_staged_scores(X_test_std,y_fuz_test,False)

print("-- the best R^2 value and according iteration number --")
print("maxM =",maxM)
print("maxR2 =",maxR2)


513 58
----------- Train set fuzRMSE -------------------
10 22020.54520921666
20 17099.444806043888
30 15175.687560457658
40 14217.814214817852
50 13683.002309131394
60 13326.68888737387
70 13132.001860919696
80 12969.523329645579
90 12850.472255596767
100 12744.615629841494
110 12655.45342969611
120 12582.708774004617
130 12520.050398384341
140 12465.858039359744
150 12428.275954216162
160 12391.009944084672
170 12355.891799129142
180 12332.845818164866
190 12305.857341610546
200 12286.188724529837
-- the best R^2 value and according iteration number --
maxM = 200
maxR2 = 0.8786725966403968
----------- Test set fuzRMSE -------------------
26738.940464441326
20728.289070724488
17750.91213385651
16797.03529263182
16295.99609454776
16048.22189565059
15942.452954132217
15236.297323144001
15872.792002877719
16272.462559042198
16316.008352002249
17595.43292888127
19121.25771926156
19798.122148970106
21346.3394144354
21399.475067476847
22785.527976590423
22844.8991468387
22850.505344505214
2