## Import Pipeline

In [1]:
import Pipelines

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from colorama import Fore
from sklearn.ensemble import (RandomForestRegressor ,
                                HistGradientBoostingRegressor,
                                ExtraTreesRegressor)
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import r2_score, mean_absolute_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter("ignore")

## Dataset

In [3]:
X_train = pd.read_csv('Datasets/Xtrain.csv',index_col='Unnamed: 0')
y_train = pd.read_csv('Datasets/ytrain.csv',index_col='Unnamed: 0')
X_test = pd.read_csv('Datasets/Xtest.csv',index_col='Unnamed: 0')
y_test = pd.read_csv('Datasets/ytest.csv',index_col='Unnamed: 0')



## Pipeline 2

In [4]:
test_set = pd.concat([X_test,y_test], axis=1)
X_train2, full_pipeline = Pipelines.Full_pipeline2(X_train)
X_test2 = Pipelines.Production_pipeline(test_set,full_pipeline)

X_train = pd.DataFrame(X_train2)
X_test = pd.DataFrame(X_test2)

In [5]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,-0.322817,-0.256632,1.239004,0.009541,0.612859,-0.212647,1.012192,0.452644,1.658105,0.374518,-0.326377,-0.183566,-0.623696,0.677302
1,1.0,0.103723,-0.265419,-0.104193,-0.183638,0.222202,-0.212647,-0.586831,0.238553,-0.683349,0.24959,-0.326377,-0.195855,0.688496,0.263062
2,1.0,1.689162,0.543056,-0.283628,-0.375135,0.13539,0.085843,-0.927688,0.281371,-0.695781,0.166304,1.082164,-0.168581,0.552752,0.226077
3,0.0,-0.837884,-0.265419,0.603292,3.824156,0.395827,-0.21046,0.921965,0.666734,1.38459,0.666017,-0.326377,-0.008125,-0.804688,1.318386
4,1.0,0.876322,-0.230268,-0.211854,-0.375135,0.178796,-0.199266,-0.591843,0.195735,-0.0783,0.207947,0.271771,-0.194763,0.507504,-1.02897


In [6]:
X_train.shape

(2342, 15)

In [7]:
X_test.shape

(586, 15)

# RandomForest

In [8]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

train_score = rf.predict(X_train)
train_r2 = r2_score(train_score,y_train)
train_mae = mean_absolute_error(train_score,y_train)


scores1 = cross_validate(rf, X_train, y_train, cv=5, scoring=['r2'])
scores2 = cross_validate(rf, X_train, y_train, cv=5, scoring=['neg_mean_absolute_error'])
avg_r2 = scores1['test_r2'].mean()
avg_mae = -scores2['test_neg_mean_absolute_error'].mean()

pred = rf.predict(X_test)
test_r2 = r2_score(pred,y_test)
test_mae = mean_absolute_error(pred,y_test)


dic = {'R2 (%)':[train_r2*100, avg_r2*100, test_r2*100],
       
       'MAE':[train_mae, avg_mae, test_mae] }

Evaluation = pd.DataFrame(dic, index=['Train', 'Avg_cv', 'Test'])
Evaluation.round(2)

Unnamed: 0,R2 (%),MAE
Train,99.45,0.44
Avg_cv,95.8,1.27
Test,96.11,1.13


========================================================================================================

## ExtraTreesRegressor

In [9]:
extra = ExtraTreesRegressor()
extra.fit(X_train,y_train)

train_score = extra.predict(X_train)
train_r2 = r2_score(train_score,y_train)
train_mae = mean_absolute_error(train_score,y_train)


scores1 = cross_validate(extra, X_train, y_train, cv=5, scoring=['r2'])
scores2 = cross_validate(extra, X_train, y_train, cv=5, scoring=['neg_mean_absolute_error'])
avg_r2 = scores1['test_r2'].mean()
avg_mae = -scores2['test_neg_mean_absolute_error'].mean()

pred = extra.predict(X_test)
test_r2 = r2_score(pred,y_test)
test_mae = mean_absolute_error(pred,y_test)


dic = {'R2 (%)':[train_r2*100, avg_r2*100, test_r2*100], 
       'MAE':[train_mae, avg_mae, test_mae] }

Evaluation = pd.DataFrame(dic, index=['Train', 'Avg_cv', 'Test'])
Evaluation.round(2)

Unnamed: 0,R2 (%),MAE
Train,100.0,0.0
Avg_cv,96.45,1.08
Test,96.55,0.96


========================================================================================================

## LGBMRegressor

In [10]:
lg = LGBMRegressor()
lg.fit(X_train,y_train)

train_score = lg.predict(X_train)
train_r2 = r2_score(train_score,y_train)
train_mae = mean_absolute_error(train_score,y_train)


scores1 = cross_validate(lg, X_train, y_train, cv=5, scoring=['r2'])
scores2 = cross_validate(lg, X_train, y_train, cv=5, scoring=['neg_mean_absolute_error'])
avg_r2 = scores1['test_r2'].mean()
avg_mae = -scores2['test_neg_mean_absolute_error'].mean()

pred = lg.predict(X_test)
test_r2 = r2_score(pred,y_test)
test_mae = mean_absolute_error(pred,y_test)


dic = {'R2 (%)':[train_r2*100, avg_r2*100, test_r2*100],
       
       'MAE':[train_mae, avg_mae, test_mae] }

Evaluation = pd.DataFrame(dic, index=['Train', 'Avg_cv', 'Test'])
Evaluation.round(2)

Unnamed: 0,R2 (%),MAE
Train,98.96,0.65
Avg_cv,95.73,1.3
Test,96.29,1.19


In [11]:
xg = XGBRegressor()
xg.fit(X_train,y_train)

train_score = xg.predict(X_train)
train_r2 = r2_score(train_score,y_train)
train_mae = mean_absolute_error(train_score,y_train)


scores1 = cross_validate(xg, X_train, y_train, cv=5, scoring=['r2'])
scores2 = cross_validate(xg, X_train, y_train, cv=5, scoring=['neg_mean_absolute_error'])
avg_r2 = scores1['test_r2'].mean()
avg_mae = -scores2['test_neg_mean_absolute_error'].mean()

pred = xg.predict(X_test)
test_r2 = r2_score(pred,y_test)
test_mae = mean_absolute_error(pred,y_test)


dic = {'R2 (%)':[train_r2*100, avg_r2*100, test_r2*100],
       
       'MAE':[train_mae, avg_mae, test_mae] }

Evaluation = pd.DataFrame(dic, index=['Train', 'Avg_cv', 'Test'])
Evaluation.round(2)

Unnamed: 0,R2 (%),MAE
Train,99.88,0.23
Avg_cv,95.34,1.37
Test,95.99,1.23


## There are little overfitting of previous Algorithms but I think the best of them is LGBM Regressor