In [1]:
import os
import time

#data manipulation
import numpy as np
import pandas as pd

#learning methods
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression

#Feature selection
from sklearn.feature_selection import RFE

#Splitting the data
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

#Evaluation methods
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


from sklearn.model_selection import cross_val_score

In [8]:
def feed_data(reg, X, y, test_size=0.2, random_state=None):
    """Fit ``reg`` on a random training split and score it on the held-out rows.

    Parameters
    ----------
    reg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The object returned
        by ``reg.fit`` is the one used for prediction.
    X, y : array-like
        Features and targets; forwarded unchanged to ``train_test_split``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``; the default matches the value that
        used to be hard-coded here.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Leave as ``None`` to keep the
        previous unseeded behaviour; pass an integer to make the split (and
        therefore the reported metrics) repeatable between runs.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` computed on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    reg = reg.fit(X_train, y_train)
    predictions = reg.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    return mse, mae, r2

def experiment_report(X, y):
    """Score a fixed set of regressors on ``(X, y)`` and log the results.

    Each regressor is passed once to ``feed_data``; its class name, MSE, MAE,
    R2 and the wall-clock duration of that ``feed_data`` call are appended as
    one record to ``log_fold.txt`` in the current working directory. Any
    existing file of that name is overwritten.

    Parameters
    ----------
    X, y : array-like
        Features and targets, forwarded unchanged to ``feed_data``.

    Returns
    -------
    None
    """
    file_address = 'log_fold.txt'

    regressors = [
        ExtraTreesRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        AdaBoostRegressor(),
        KNeighborsRegressor(),
        DecisionTreeRegressor(),
        BaggingRegressor(ExtraTreesRegressor()),
        LinearRegression(),
        MLPRegressor(),
    ]

    # Mode "w" already creates or truncates the file, so no exists/remove
    # dance is needed. The context manager closes (and flushes) the handle
    # even if one of the regressors raises part-way through.
    with open(file_address, "w") as f:
        for reg in regressors:
            # perf_counter is monotonic, so the duration cannot be skewed by
            # system clock adjustments during a long fit.
            start = time.perf_counter()
            mse, mae, r2 = feed_data(reg, X, y)
            time_taken = time.perf_counter() - start
            f.write("Regressor Name: {}\n".format(reg.__class__.__name__))
            f.write("MSE: {}\n".format(mse))
            f.write("MAE: {}\n".format(mae))
            f.write("R2: {}\n".format(r2))
            f.write("Time: {}\n".format(time_taken))
            f.write("##############################\n")

In [3]:
# BACKWARD ELIMINATION via recursive feature elimination (RFE)

In [2]:
# Load the full table, then split it into the target column and the features.
data = pd.read_csv("final.csv")
X = data.drop(columns=['Gb2t_avg'])
y = data['Gb2t_avg']

In [9]:
# Base estimator whose feature importances drive the elimination.
model = RandomForestRegressor()
# Keep the 7 best features. n_features_to_select must be passed by keyword:
# current scikit-learn releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=7)
# Run the elimination and reduce X to the selected columns.
X_rfe = rfe.fit_transform(X, y)
# Fit the base estimator itself on the reduced feature set.
model.fit(X_rfe, y)
# Boolean mask of kept columns, then the per-column rank (1 = selected).
print(rfe.support_)
print(rfe.ranking_)



[False False False  True False  True False  True False  True False False
  True  True False False False False False False False False False False
 False False False False  True]
[16  4  3  1 19  1 17  1 11  1  8  5  1  1 12  2  9 21 23  7 20 15 18 22
 13 10 14  6  1]


In [3]:
# Positions flagged True in the recorded rfe.support_ output: 3, 5, 7, 9, 12, 13, 28.
# NOTE(review): the names below are presumably the columns at those positions;
# confirm against X.columns.
filter_list = [
    'Gost_avg',
    'P_avg',
    'Rm_avg',
    'Ds_avg',
    'Cm_avg',
    'DCs_avg',
    'Gb1t_avg',
]

In [4]:
# Narrow X to the hand-picked columns. This rebinds X, so the full feature
# frame is no longer reachable under that name after this cell runs.
X = X.filter(items=filter_list, axis="columns")
X.head()

Unnamed: 0,Gost_avg,P_avg,Rm_avg,Ds_avg,Cm_avg,DCs_avg,Gb1t_avg
0,54.080002,1072.65,5690.2798,1799.91,5723.1201,1800.99,64.529999
1,56.66,1061.4301,5632.7202,1798.88,5650.5298,1799.97,68.589996
2,58.419998,1144.79,6071.6899,1800.09,6083.5298,1801.1899,71.0
3,59.459999,1183.98,6279.4702,1800.0601,6296.46,1801.14,72.410004
4,58.029999,1317.55,6988.5698,1799.9301,7024.8198,1801.01,69.260002


In [9]:
# Benchmark every regressor on the reduced feature set; results are written to log_fold.txt.
experiment_report(X=X, y=y)

