In [None]:
# Experiment using all Palmer images to predict the time of each sample, with 5-fold cross-validation

In [1]:
import cv2
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import os 
import math
import scipy

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, KFold

%load_ext autoreload
plt.rcParams['figure.figsize'] = [20, 15]

path = os.getcwd()+'//..//imagens//'
sys.path.append(os.getcwd()+'//..//scripts//')
from MNG import MNG
from MNGFeatures import MNGFeatures
from MNGFeaturesMeans import MNGFeaturesMeans
from MNGFeaturesSize import MNGFeaturesSize
from MNGModel import MNGModel

In [316]:
# Number of cross-validation folds. It drives the splitter, the row labels of
# the result tables and the per-fold index stores below.
cv_num = 5

# Row labels of the result tables: one per fold plus a last row ('media')
# that compute_models fills with the mean over the folds.
indexes = ['fold' + str(k) for k in range(1, cv_num + 1)] + ['media']
# Column labels of the result tables: linear regression plus one random
# forest column per number of trees.
columns = ['MLR', 'RF100', 'RF200', 'RF300', 'RF400', 'RF500']

data_path = os.getcwd()+'/../resampling/all_data_repeated.csv'
old_data = pd.read_csv(data_path, sep=',', index_col=0)

# Seeded so that the shuffled folds are identical on every run; without a
# random_state the reported per-fold scores cannot be reproduced.
kf = KFold(n_splits=cv_num, shuffle=True, random_state=0)

# Positional train/test indices of each fold, filled in by compute_models.
train = [[] for _ in range(cv_num)]
test = [[] for _ in range(cv_num)]

In [275]:
# # One-off step (run only once): draws the 120 positions that are frozen in `to_be_deleted` in the next cell
# repeated_list_indexes = np.arange(480)
# random.shuffle(repeated_list_indexes)
# to_be_deleted = repeated_list_indexes[:120].tolist()

In [317]:
# Positions of the rows to discard, frozen from the one-off shuffle in the
# commented-out cell above. They are positional offsets into the last 480 rows
# of old_data (applied with .iloc in the next cell), not index labels.
to_be_deleted = [234, 385, 248, 110, 378, 159, 131, 252, 9, 53, 46, 153, 307, 203, 414, 412, 438, 20, 208, 188, \
                 357, 315, 246, 219, 235, 456, 261, 51, 339, 415, 67, 373, 52, 335, 201, 95, 129, 200, 364, 81, \
                 172, 291, 64, 367, 71, 255, 263, 40, 268, 165, 313, 316, 416, 404, 167, 238, 475, 32, 202, 345, \
                 294, 139, 324, 83, 15, 86, 254, 419, 477, 447, 207, 94, 318, 329, 249, 250, 85, 186, 361, 170, \
                 270, 100, 421, 258, 96, 69, 397, 451, 22, 181, 17, 59, 467, 215, 128, 230, 93, 286, 461, 337, \
                 468, 18, 271, 214, 391, 383, 82, 325, 228, 442, 274, 231, 122, 426, 401, 141, 143, 244, 192, 77]

In [318]:
# ATTENTION: the resampled block is located by position (the last 480 rows),
# not by index label.
resampled_data = old_data.iloc[-480:]
rows_to_delete = resampled_data.iloc[to_be_deleted]

# Drop the selected rows by label, then remove the first 1710 columns
# (the "diff" attributes).
data = (
    old_data
    .drop(index=rows_to_delete.index.values)
    .drop(columns=old_data.columns.values[:1710])
)

In [319]:
# Remove the sst/firmeza/acidez/ratio/massa columns (presumably the other
# measured fruit attributes — confirm) so they are not fed to the models.
# Rebinding instead of inplace=True avoids mutating the existing DataFrame
# object; a missing column still raises KeyError, as before.
data = data.drop(columns=['sst','firmeza', 'acidez', 'ratio', 'massa'])

In [324]:
# Optional step: run this cell to evaluate the models without the resampled
# rows, i.e. those whose index label contains 'repeated'.
synthetic_indexes = [label for label in data.index.values if 'repeated' in label]
rows_to_delete = data.loc[synthetic_indexes]
data = data.drop(index=rows_to_delete.index.values)

In [320]:
def _fit_and_score(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model`` on the training split and return ``(r2, rmse)`` on the test split."""
    Y_predicted = model.fit(X_train, Y_train).predict(X_test)
    r2 = r2_score(Y_test, Y_predicted)
    rmse = math.sqrt(mean_squared_error(Y_test, Y_predicted))
    return r2, rmse


def compute_models(data, target, trees=(100, 200, 300, 400, 500, 600, 700), random_state=None):
    """Cross-validate a linear regression and random forests on ``data``.

    For every split produced by the notebook-level ``kf`` splitter, one
    ``LinearRegression`` and one ``RandomForestRegressor`` per entry of
    ``trees`` are fitted on the training part and scored on the test part.

    Side effect: the positional train/test indices of fold ``i`` are stored in
    the notebook-level lists ``train[i]`` and ``test[i]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the column to predict; every other column is used as a feature.
    trees : int or iterable of int, optional
        Number(s) of estimators of the random forests. The scores of each
        value ``n`` are stored in the column ``'RF<n>'``.
    random_state : int or None, optional
        Forwarded to every ``RandomForestRegressor``. The default ``None``
        leaves the forests unseeded, which is the previous behaviour.

    Returns
    -------
    (df_r2, df_rmse) : tuple of pandas.DataFrame
        R2 and RMSE tables indexed by the notebook-level ``indexes``: fold
        ``i`` is written to row ``indexes[i]`` and the last row receives the
        mean over the preceding rows. The columns are the notebook-level
        ``columns`` followed by any requested model column that is missing
        from it; models that were not run stay NaN.
    """
    # Accept a single tree count as well as a collection of counts.
    tree_counts = [trees] if np.isscalar(trees) else list(trees)
    rf_labels = ['RF' + str(tree) for tree in tree_counts]
    model_labels = ['MLR'] + rf_labels

    # Keep the notebook-wide column layout, but make room for requested models
    # it does not list (e.g. RF600/RF700 from the default `trees`), which used
    # to raise KeyError.
    table_columns = list(columns) + [label for label in model_labels if label not in columns]
    df_r2 = pd.DataFrame(index=indexes, columns=table_columns)
    df_rmse = pd.DataFrame(index=indexes, columns=table_columns)

    X = data.drop(columns=target)
    Y = data[target]

    for i, (train_i, test_i) in enumerate(kf.split(X)):
        # Expose the split indices to the rest of the notebook.
        train[i] = train_i
        test[i] = test_i
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        Y_train, Y_test = Y.iloc[train_i], Y.iloc[test_i]
        fold = indexes[i]

        models = [('MLR', LinearRegression())]
        models += [
            (label, RandomForestRegressor(n_estimators=tree, random_state=random_state))
            for label, tree in zip(rf_labels, tree_counts)
        ]

        for label, model in models:
            r2, rmse = _fit_and_score(model, X_train, Y_train, X_test, Y_test)
            # Single-step .loc assignment: the chained form `df[col].iloc[i] = v`
            # writes to a temporary and is lost under pandas Copy-on-Write.
            df_r2.loc[fold, label] = r2
            df_rmse.loc[fold, label] = rmse

    # Last row holds the mean over the fold rows (NaN entries are skipped).
    fold_rows, mean_row = indexes[:-1], indexes[-1]
    for label in model_labels:
        df_r2.loc[mean_row, label] = df_r2.loc[fold_rows, label].astype(float).mean()
        df_rmse.loc[mean_row, label] = df_rmse.loc[fold_rows, label].astype(float).mean()

    return df_r2, df_rmse

In [211]:
# Feature set: full-image channel means (R, G, B, H, S, V, L, a, b) plus size.
cols = (
    ['time']
    + ['mean_' + channel + '_full' for channel in 'RGBHSVLab']
    + ['area', 'diameter', 'height', 'width']
)
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [214]:
# A1 group: full-image channel means (R, G, B, H, S, V, L, a).
# NOTE(review): unlike the 'mean and size' cell, 'mean_b_full' is not part of
# this list — confirm that this is intended.
cols = ['time'] + ['mean_' + channel + '_full' for channel in 'RGBHSVLa']
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [217]:
# A3 group: the dominant HSV value is the only feature.
cols = ['time', 'dominant_HSV']
r2, rmse = compute_models(data.loc[:, cols], target='time', trees=[100])

In [220]:
# A5 group: channel-pair differences, per-region RGB values, differences
# between regions, the longitudinal gradient and the full-image RGB means.
cols = (
    ['time']
    + [pair + '_diff_full' for pair in ('RG', 'RB', 'GB')]
    + [region + '_' + channel
       for region in ('apex', 'equator', 'stalk')
       for channel in 'RGB']
    + [pair + '_' + channel + '_diff'
       for channel in 'RGB'
       for pair in ('apex_equator', 'equator_stalk', 'apex_stalk')]
    + ['long_gradient']
    + ['mean_' + channel + '_full' for channel in 'RGB']
)
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [223]:
# A6 group: area is the only feature.
cols = ['time', 'area']
r2, rmse = compute_models(data.loc[:, cols], target='time', trees=[100])

In [226]:
# n regions: every column whose name contains 'region', plus the target.
regions = list(map(lambda name: 'region' in name, data.columns.values))
cols = np.append(data.columns.values[regions], ['time'])

r2, rmse = compute_models(data[cols], target='time', trees=[500])

In [229]:
# A8 group: full-image RGB means.
cols = ['time'] + ['mean_' + channel + '_full' for channel in 'RGB']

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [232]:
# A9 group: full-image L, a, b means plus the bcd, cd and dd columns.
cols = (
    ['time']
    + ['mean_' + channel + '_full' for channel in 'Lab']
    + ['bcd', 'cd', 'dd']
)

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [235]:
# A11 group: full-image HSV means.
cols = ['time'] + ['mean_' + channel + '_full' for channel in 'HSV']

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [238]:
# A12 group: full-image HSV and Lab means.
cols = ['time'] + ['mean_' + channel + '_full' for channel in 'HSVLab']

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [241]:
# A13 group: full-image HSV and RGB means plus the RG, RB and SH rates.
cols = (
    ['time']
    + ['mean_' + channel + '_full' for channel in 'HSVRGB']
    + [pair + '_rate' for pair in ('RG', 'RB', 'SH')]
)

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [244]:
# A14 group: full-image b mean plus area and diameter.
cols = ['time', 'mean_b_full'] + ['area', 'diameter']

r2, rmse = compute_models(data.loc[:, cols], target='time', trees=[100])

In [247]:
# regions (n=20) plus area, diameter, height and width.
regions = list(map(lambda name: 'region' in name, data.columns.values))
cols = np.append(
    data.columns.values[regions],
    ['time', 'area', 'width', 'height', 'diameter'],
)

r2, rmse = compute_models(data[cols], target='time', trees=[500])

In [326]:
# ALL - new: the first 38 columns plus 'time', with 'height' left out.
cols = np.append(data.columns.values[:38], ['time'])
cols = cols[cols != 'height']

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [321]:
# ALL + new: every column of `data` except the target is used as a feature.
r2, rmse = compute_models(data, target='time', trees=[500])

In [323]:
# Display the RMSE table currently bound to `rmse` (one row per fold, 'media' = mean).
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500
fold1,10.5589,,,,,7.66716
fold2,11.234,,,,,7.98893
fold3,10.0299,,,,,6.79662
fold4,11.3242,,,,,12.1821
fold5,12.7023,,,,,8.60973
media,11.1699,,,,,8.64891
