In [None]:
import pandas as pd
import numpy as np
import sklearn
import warnings
import statistics

from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
files = ['/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4444.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy3444.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy2444.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy1444.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy0444.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4434.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4424.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4414.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4404.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4443.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4442.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4441.csv',
         '/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4440.csv']

In [3]:
def resultsPredictionKFold(X_train, y_train, cv, model):
    mae = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv)
    rmse = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv)
    r2 = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv)
    result = [r2, mae, rmse]
    return result

In [4]:
np.random.seed(10)
for filename in files:    
    src = os.path.join(dirname, filename)
    print()
    print(src)
    print()
    
    sita = src.split('.')[0][-4:]
    data = pd.read_csv(src, sep=";", skiprows=[0])
    
    if(sita == '4440'):
        col_names = ['Room', 'Zone', 'Date', 'Time', 'CO2', 'Temperature', 'Humidity', 'Brightness', 'Unnamed1', 'Occupancy', 'Unnamed2']
        data.columns = col_names
        data = data.drop(columns=['Unnamed1', 'Unnamed2'])
    else:
        col_names = ['Room', 'Zone', 'Date', 'Time', 'CO2', 'Temperature', 'Humidity', 'Brightness', 'Occupancy', 'Unnamed1']    
        data.columns = col_names
        data = data.drop(columns=['Unnamed1'])    

    data = data.replace(' deleted', 0)
    data = data.replace('deleted', 0)
    data = data.replace('1st Floor', 1)
    data = data.replace('4th Floor', 4)
    data = data.replace('USB', 0)
    
    data['Room'] = pd.to_numeric(data['Room'], errors='coerce')
    data['Zone'] = pd.to_numeric(data['Zone'], errors='coerce')
    data['Date'] = pd.to_numeric(data['Date'], errors='coerce')
    data['Time'] = pd.to_numeric(data['Time'], errors='coerce')
    data['CO2'] = pd.to_numeric(data['CO2'], errors='coerce')
    data['Temperature'] = pd.to_numeric(data['Temperature'], errors='coerce')
    data['Humidity'] = pd.to_numeric(data['Humidity'], errors='coerce')
    data['Brightness'] = pd.to_numeric(data['Brightness'], errors='coerce')
    data['Occupancy'] = pd.to_numeric(data['Occupancy'], errors='coerce')

    X = data[['Room', 'Zone', 'Date', 'Time', 'Temperature', 'Humidity', 'Brightness', 'Occupancy']]
    y = data['CO2']
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    cv = KFold(n_splits=10, shuffle=True)
    
    modelRF = RandomForestRegressor()
    modelRF.fit(X_train, y_train)
    y_pred = modelRF.predict(X_test)
    resultsRF_KFold = resultsPredictionKFold(X_train, y_train, cv, modelRF)
    
    print("Modelo RF")
    print()

    print("Métrica R2")
    print("Média: %.6f" %(abs(resultsRF_KFold[0].mean())))
    print("Desvio Padrão: %.6f" %abs(resultsRF_KFold[0].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRF_KFold[0])))
    print()

    print("Métrica MAE")
    print("Média: %.6f" %abs(resultsRF_KFold[1].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsRF_KFold[1].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRF_KFold[1])))
    print()

    print("Métrica RMSE")
    print("Média: %.6f" %abs(resultsRF_KFold[2].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsRF_KFold[2].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRF_KFold[2])))
    print()
        
    modelRR = Ridge()
    modelRR.fit(X_train, y_train)
    y_pred = modelRR.predict(X_test)
    resultsRR_KFold = resultsPredictionKFold(X_train, y_train, cv, modelRR)

    print("Modelo RR")
    print()

    print("Métrica R2")
    print("Média: %.6f" %(abs(resultsRR_KFold[0].mean())))
    print("Desvio Padrão: %.6f" %abs(resultsRR_KFold[0].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRR_KFold[0])))
    print()

    print("Métrica MAE")
    print("Média: %.6f" %abs(resultsRR_KFold[1].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsRR_KFold[1].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRR_KFold[1])))
    print()

    print("Métrica RMSE")
    print("Média: %.6f" %abs(resultsRR_KFold[2].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsRR_KFold[2].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsRR_KFold[2])))
    print()
        
    modelLR = LinearRegression()
    modelLR.fit(X_train, y_train)
    y_pred = modelLR.predict(X_test)
    resultsLR_KFold = resultsPredictionKFold(X_train, y_train, cv, modelLR)

    print("Modelo LR")
    print()

    print("Métrica R2")
    print("Média: %.6f" %(abs(resultsLR_KFold[0].mean())))
    print("Desvio Padrão: %.6f" %abs(resultsLR_KFold[0].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsLR_KFold[0])))
    print()

    print("Métrica MAE")
    print("Média: %.6f" %abs(resultsLR_KFold[1].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsLR_KFold[1].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsLR_KFold[1])))
    print()

    print("Métrica RMSE")
    print("Média: %.6f" %abs(resultsLR_KFold[2].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsLR_KFold[2].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsLR_KFold[2])))
    print()
        
    modelDT = DecisionTreeRegressor()
    modelDT.fit(X_train, y_train)
    y_pred = modelDT.predict(X_test)
    resultsDT_KFold = resultsPredictionKFold(X_train, y_train, cv, modelDT)

    print("Modelo DT")
    print()
    
    print("Métrica R2")
    print("Média: %.6f" %(abs(resultsDT_KFold[0].mean())))
    print("Desvio Padrão: %.6f" %abs(resultsDT_KFold[0].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsDT_KFold[0])))
    print()

    print("Métrica MAE")
    print("Média: %.6f" %abs(resultsDT_KFold[1].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsDT_KFold[1].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsDT_KFold[1])))
    print()

    print("Métrica RMSE")
    print("Média: %.6f" %abs(resultsDT_KFold[2].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsDT_KFold[2].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsDT_KFold[2])))
    print()
        
    modelGBR = GradientBoostingRegressor()
    modelGBR.fit(X_train, y_train)
    y_pred = modelGBR.predict(X_test)
    resultsGBR_KFold = resultsPredictionKFold(X_train, y_train, cv, modelGBR)

    print("Modelo GBR")
    print()

    print("Métrica R2")
    print("Média: %.6f" %(abs(resultsGBR_KFold[0].mean())))
    print("Desvio Padrão: %.6f" %abs(resultsGBR_KFold[0].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsGBR_KFold[0])))
    print()
    
    print("Métrica MAE")
    print("Média: %.6f" %abs(resultsGBR_KFold[1].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsGBR_KFold[1].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsGBR_KFold[1])))
    print()

    print("Métrica RMSE")
    print("Média: %.6f" %abs(resultsGBR_KFold[2].mean()))
    print("Desvio Padrão: %.6f" %abs(resultsGBR_KFold[2].std()))
    print("Mediana: %.6f" %abs(statistics.median(resultsGBR_KFold[2])))
    print()


/kaggle/input/historical-data-occupancy-sita/historical_data_occupancy4444.csv

Modelo RF

Métrica R2
Média: 0.731242
Desvio Padrão: 0.006793
Mediana: 0.732633

Métrica MAE
Média: 32.473938
Desvio Padrão: 0.371416
Mediana: 32.403389

Métrica RMSE
Média: 62.993307
Desvio Padrão: 0.906687
Mediana: 63.153823

Modelo RR

Métrica R2
Média: 0.109082
Desvio Padrão: 0.006534
Mediana: 0.109407

Métrica MAE
Média: 87.617685
Desvio Padrão: 0.508531
Mediana: 87.693068

Métrica RMSE
Média: 114.805525
Desvio Padrão: 0.795343
Mediana: 114.757042

Modelo LR

Métrica R2
Média: 0.109183
Desvio Padrão: 0.005188
Mediana: 0.109502

Métrica MAE
Média: 87.617377
Desvio Padrão: 0.577020
Mediana: 87.690497

Métrica RMSE
Média: 114.808220
Desvio Padrão: 0.624546
Mediana: 114.613968

Modelo DT

Métrica R2
Média: 0.540267
Desvio Padrão: 0.014621
Mediana: 0.535306

Métrica MAE
Média: 34.310159
Desvio Padrão: 0.556021
Mediana: 34.413752

Métrica RMSE
Média: 83.447758
Desvio Padrão: 1.040187
Mediana: 83.352951

Mod