In [None]:
# Experiment using all the Palmer images to predict the time of each sample, with 5-fold cross-validation

In [25]:
import cv2
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import math
import scipy

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, KFold

%load_ext autoreload
plt.rcParams['figure.figsize'] = [20, 15]

path = os.getcwd()+'\\..\\imagens\\'
sys.path.append(os.getcwd()+'\\..\\scripts\\')
from MNG import MNG
from MNGFeatures import MNGFeatures
from MNGFeaturesMeans import MNGFeaturesMeans
from MNGFeaturesSize import MNGFeaturesSize
from MNGModel import MNGModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Number of cross-validation folds; every fold-dependent object below derives from it.
cv_num = 5

# Row labels of the result tables: one per fold plus a final row for the mean ('media').
indexes = ['fold' + str(k) for k in range(1, cv_num + 1)] + ['media']
# Column labels: the linear regression (MLR) and random forests with 100..700 trees.
columns = ['MLR'] + ['RF' + str(n) for n in range(100, 701, 100)]

# Alternative splitter kept for reference:
# ss = ShuffleSplit(n_splits=cv_num, test_size=0.2,random_state=0)
# Seeded so that every call to kf.split() yields the same folds, which makes runs
# reproducible and lets different feature files be compared on identical splits.
kf = KFold(n_splits=cv_num, shuffle=True, random_state=0)

# Per-fold train/test row positions, filled in by compute_models.
train = [[] for _ in range(cv_num)]
test = [[] for _ in range(cv_num)]

In [102]:
def compute_models(file_name, random_state=None):
    """Cross-validate a linear regression and several random forests on one feature file.

    The file ``<path>/../features/<file_name>.csv`` is read with ``;`` as the
    separator and its first column as the index. The ``time`` column is the
    regression target; every other column is used as a feature.

    For each fold produced by the module-level ``kf`` splitter, a
    ``LinearRegression`` and ``RandomForestRegressor`` models with
    100, 200, ..., 700 trees are fitted on the training part and scored on the
    held-out part.

    Parameters
    ----------
    file_name : str
        Name of the feature CSV, without directory and without the ``.csv`` suffix.
    random_state : int or None, optional
        Seed forwarded to every ``RandomForestRegressor``. The default ``None``
        keeps the forests unseeded, as before.

    Returns
    -------
    (df_r2, df_rmse) : tuple of pandas.DataFrame
        Both frames use the module-level ``indexes`` as rows and ``columns`` as
        columns. Row ``k`` holds the score of fold ``k``; the last row holds the
        mean over the preceding rows.

    Side effects
    ------------
    The train/test row positions of fold ``k`` are stored in the module-level
    lists ``train[k]`` and ``test[k]``.
    """
    csv_path = os.path.join(path, '..', 'features', file_name + '.csv')
    data = pd.read_csv(csv_path, sep=';', index_col=0)

    df_r2 = pd.DataFrame(index=indexes, columns=columns)
    df_rmse = pd.DataFrame(index=indexes, columns=columns)

    X = data.drop(columns='time')
    Y = data['time']

    tree_counts = range(100, 701, 100)
    model_names = ['MLR'] + ['RF' + str(n) for n in tree_counts]

    for fold, (train_i, test_i) in enumerate(kf.split(X)):
        # Keep the split positions available to the rest of the notebook.
        train[fold] = train_i
        test[fold] = test_i
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        Y_train, Y_test = Y.iloc[train_i], Y.iloc[test_i]

        estimators = [LinearRegression()]
        estimators += [
            RandomForestRegressor(n_estimators=n, random_state=random_state)
            for n in tree_counts
        ]

        fold_label = df_r2.index[fold]
        for name, estimator in zip(model_names, estimators):
            Y_predicted = estimator.fit(X_train, Y_train).predict(X_test)
            # Single .loc assignment: chained `df[col].iloc[i] = v` may write to a
            # temporary copy and is a no-op under pandas Copy-on-Write.
            df_r2.loc[fold_label, name] = r2_score(Y_test, Y_predicted)
            df_rmse.loc[fold_label, name] = math.sqrt(
                mean_squared_error(Y_test, Y_predicted)
            )

    # Mean over the fold rows only; the last row is the destination, not an input.
    mean_label = df_r2.index[-1]
    for name in model_names:
        df_r2.loc[mean_label, name] = df_r2[name].iloc[:-1].mean()
        df_rmse.loc[mean_label, name] = df_rmse[name].iloc[:-1].mean()

    return df_r2, df_rmse

In [104]:
# Evaluate every model on the combined mean + size feature file.
file_name = 'means_size_all_half'
r2, rmse = compute_models(file_name=file_name)

In [107]:
# Evaluate every model on the mean-only feature file.
file_name = 'means_all_half'
r2, rmse = compute_models(file_name=file_name)

In [110]:
# Evaluate every model on the size-only feature file.
file_name = 'size_all_half'
r2, rmse = compute_models(file_name=file_name)

In [112]:
# Show the per-fold RMSE table (last row 'media' is the mean) returned by the most
# recent compute_models call.
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500,RF600,RF700
fold1,28.5811,29.1801,29.7449,29.6125,29.7152,29.6648,29.8086,29.6997
fold2,24.7651,25.3168,25.097,25.1071,25.3384,25.4552,25.0294,25.0295
fold3,27.272,26.2348,25.9584,26.1147,26.1616,25.9559,26.1182,25.8634
fold4,27.2636,28.1155,28.7566,28.4943,28.7538,28.523,28.8078,28.5498
fold5,26.3562,27.6395,27.5142,27.9129,27.5796,27.6041,27.597,27.5943
media,26.8476,27.2973,27.4142,27.4483,27.5097,27.4406,27.4722,27.3473
