In [None]:
# Experiment using all of the Palmer images to predict the time of each sample, with 5-fold cross-validation

In [1]:
import cv2
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import math
import scipy

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, KFold

%load_ext autoreload
plt.rcParams['figure.figsize'] = [20, 15]

path = os.getcwd()+'\\..\\imagens\\'
sys.path.append(os.getcwd()+'\\..\\scripts\\')
from MNG import MNG
from MNGFeatures import MNGFeatures
from MNGFeaturesMeans import MNGFeaturesMeans
from MNGFeaturesSize import MNGFeaturesSize
from MNGModel import MNGModel

In [2]:
# Number of cross-validation folds; everything below is derived from it so the
# fold count is defined in exactly one place.
cv_num = 5

# Row labels of the score tables: one row per fold plus a final row ('media')
# that holds the mean over the folds.
indexes = ['fold' + str(k) for k in range(1, cv_num + 1)] + ['media']
# Column labels of the score tables: the linear model ('MLR') followed by the
# random forests, named after their number of trees (100..700).
columns = ['MLR'] + ['RF' + str(n_trees) for n_trees in range(100, 701, 100)]

# A fixed seed makes the shuffled folds identical on every call to kf.split,
# so every feature set is scored on the same train/test partitions and the
# notebook gives the same folds after a kernel restart.
kf = KFold(n_splits=cv_num, shuffle=True, random_state=0)

# Filled in by compute_models with the row positions used for each fold.
train = [[] for _ in range(cv_num)]
test = [[] for _ in range(cv_num)]

In [3]:
def _fit_and_score(model, X_train, Y_train, X_test, Y_test):
    """Fit `model` on the training split and return (R2, RMSE) on the test split."""
    predicted = model.fit(X_train, Y_train).predict(X_test)
    return (r2_score(Y_test, predicted),
            math.sqrt(mean_squared_error(Y_test, predicted)))


def compute_models(file_name, random_state=None):
    """Cross-validate a linear model and several random forests on one feature table.

    The table ``<path>/../features/<file_name>.csv`` is read with ';' as the
    separator and its first column as the index. The 'time' column is the
    regression target; every other column is used as a predictor.

    For each fold produced by the module-level ``kf`` splitter, a
    ``LinearRegression`` ('MLR') and ``RandomForestRegressor`` models with
    100, 200, ..., 700 trees ('RF100' ... 'RF700') are fitted on the training
    rows and scored on the held-out rows.

    Side effects: the module-level ``train`` and ``test`` lists are updated in
    place with the row positions of each fold.

    Parameters
    ----------
    file_name : str
        Name of the feature file, without the '.csv' extension.
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``RandomForestRegressor``. The default keeps the
        forests unseeded; pass an int for repeatable forests. It does not
        affect how ``kf`` builds the folds.

    Returns
    -------
    (df_r2, df_rmse) : tuple of pandas.DataFrame
        Float tables indexed by ``indexes`` (one row per fold, last row is the
        mean over the folds) with one column per model in ``columns``.
    """
    csv_path = os.path.join(path, '..', 'features', file_name + '.csv')
    data = pd.read_csv(csv_path, sep=';', index_col=0)

    # Float dtype (instead of the default object dtype of an empty frame) keeps
    # the score tables numeric for later aggregation and plotting.
    df_r2 = pd.DataFrame(index=indexes, columns=columns, dtype=float)
    df_rmse = pd.DataFrame(index=indexes, columns=columns, dtype=float)

    X = data.drop(columns='time')
    Y = data['time']

    for i, (train_i, test_i) in enumerate(kf.split(X)):
        # Expose the fold membership through the module-level lists.
        train[i] = train_i
        test[i] = test_i
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        Y_train, Y_test = Y.iloc[train_i], Y.iloc[test_i]

        # Models are fitted in insertion order: MLR first, then the forests by
        # increasing number of trees.
        models = {'MLR': LinearRegression()}
        for n_trees in range(100, 701, 100):
            models['RF' + str(n_trees)] = RandomForestRegressor(
                n_estimators=n_trees, random_state=random_state)

        fold_label = df_r2.index[i]
        for model_name, model in models.items():
            fold_r2, fold_rmse = _fit_and_score(model, X_train, Y_train, X_test, Y_test)
            # Single-step .loc assignment: chained `df[col].iloc[i] = value`
            # does not write back to the frame under pandas Copy-on-Write.
            df_r2.loc[fold_label, model_name] = fold_r2
            df_rmse.loc[fold_label, model_name] = fold_rmse

    # Last row = mean of the per-fold rows only (the mean row itself is excluded).
    df_r2.iloc[-1] = df_r2.iloc[:-1].mean()
    df_rmse.iloc[-1] = df_rmse.iloc[:-1].mean()

    return df_r2, df_rmse

In [104]:
# Feature set: mean + size features (reads 'means_size_all_half.csv').
# The scores are also kept under dedicated names so they stay available when
# the generic r2/rmse variables are reassigned by another experiment cell.
file_name = 'means_size_all_half'
r2_means_size, rmse_means_size = r2, rmse = compute_models(file_name)

In [107]:
# Feature set: mean features only (reads 'means_all_half.csv').
# The scores are also kept under dedicated names so they stay available when
# the generic r2/rmse variables are reassigned by another experiment cell.
file_name = 'means_all_half'
r2_means, rmse_means = r2, rmse = compute_models(file_name)

In [110]:
# Feature set: size features only (reads 'size_all_half.csv').
# The scores are also kept under dedicated names so they stay available when
# the generic r2/rmse variables are reassigned by another experiment cell.
file_name = 'size_all_half'
r2_size, rmse_size = r2, rmse = compute_models(file_name)

In [4]:
# Feature set: dominant HSV features (reads 'dominant_all_half.csv').
# The scores are also kept under dedicated names so they stay available when
# the generic r2/rmse variables are reassigned by another experiment cell.
file_name = 'dominant_all_half'
r2_dominant, rmse_dominant = r2, rmse = compute_models(file_name)

In [8]:
# Feature set: rates features (reads 'rates_all_half.csv').
# The scores are also kept under dedicated names so they stay available when
# the generic r2/rmse variables are reassigned by another experiment cell.
file_name = 'rates_all_half'
r2_rates, rmse_rates = r2, rmse = compute_models(file_name)

In [10]:
# Display the RMSE table returned by the most recent compute_models call
# (one row per fold plus the 'media' mean row; one column per model).
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500,RF600,RF700
fold1,25.9464,19.4411,19.6186,19.3368,19.4382,19.4126,19.3475,19.4919
fold2,24.5288,17.453,18.1291,17.7571,17.927,17.6265,17.7965,17.6796
fold3,24.6114,19.4284,19.4639,19.4173,19.3929,19.4025,19.2762,19.4237
fold4,23.4155,21.7409,21.756,21.4027,21.5235,21.5311,21.4881,21.5456
fold5,23.8803,18.1412,17.9553,18.1732,18.0167,18.1648,18.103,17.9552
media,24.4764,19.2409,19.3846,19.2174,19.2597,19.2275,19.2023,19.2192
