In [None]:
# experimento com todas as imagens das Palmer para predição do tempo de cada amostra, usando CV 5

In [11]:
import cv2
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import os 
import math
import scipy

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.feature_selection import RFE

%load_ext autoreload
plt.rcParams['figure.figsize'] = [20, 15]

path = os.getcwd()+'//..//imagens//'
sys.path.append(os.getcwd()+'//..//scripts//')
from MNG import MNG
from MNGFeatures import MNGFeatures
from MNGFeaturesMeans import MNGFeaturesMeans
from MNGFeaturesSize import MNGFeaturesSize
from MNGModel import MNGModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
indexes = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'media']
columns = ['MLR', 'RF100', 'RF200', 'RF300', 'RF400', 'RF500']
data_path = os.getcwd()+'/../resampling/all_data_repeated.csv'
old_data = pd.read_csv(data_path, sep=',', index_col=0)

cv_num = 5

# ss = ShuffleSplit(n_splits=cv_num, test_size=0.2,random_state=0)
kf = KFold(n_splits=5, shuffle=True)

train = [[], [], [], [], []]
test = [[], [], [], [], []]

In [44]:
# # run this once
# repeated_list_indexes = np.arange(480)
# random.shuffle(repeated_list_indexes)
# to_be_deleted = repeated_list_indexes[:120].tolist()

In [3]:
# rows_to_delete
to_be_deleted = [234, 385, 248, 110, 378, 159, 131, 252, 9, 53, 46, 153, 307, 203, 414, 412, 438, 20, 208, 188, \
                 357, 315, 246, 219, 235, 456, 261, 51, 339, 415, 67, 373, 52, 335, 201, 95, 129, 200, 364, 81, \
                 172, 291, 64, 367, 71, 255, 263, 40, 268, 165, 313, 316, 416, 404, 167, 238, 475, 32, 202, 345, \
                 294, 139, 324, 83, 15, 86, 254, 419, 477, 447, 207, 94, 318, 329, 249, 250, 85, 186, 361, 170, \
                 270, 100, 421, 258, 96, 69, 397, 451, 22, 181, 17, 59, 467, 215, 128, 230, 93, 286, 461, 337, \
                 468, 18, 271, 214, 391, 383, 82, 325, 228, 442, 274, 231, 122, 426, 401, 141, 143, 244, 192, 77]

In [4]:
# ATTENTION! I'M ASSIGNING IT MANUALLY, NOT BASED IN INDEX
resampled_data = old_data.iloc[-480:]
rows_to_delete = resampled_data.iloc[to_be_deleted]

data1 = old_data.drop(rows_to_delete.index.values)

# remove atributos diff
data1 = data1.drop(columns=data1.columns.values[:1710])

In [5]:
n_var_names = list(filter(lambda x: 'region' in x, data1.columns.values))
data1 = data1.drop(columns=n_var_names)

In [77]:
att = 'sst'

atts = set(['sst','firmeza', 'acidez', 'ratio', 'massa', 'time'])
data = data1.drop(columns=list(atts.difference(set([att]))))

<span style="font-size:30px;color:red">execute this to run models without resampled data</span>

In [10]:
synthetic_indexes = list(filter(lambda x: 'repeated' in x, data1.index.values))
rows_to_delete = data1.loc[synthetic_indexes]
data = data1.drop(rows_to_delete.index.values)

In [8]:
def compute_models(data, target, trees=[100, 200, 300, 400, 500, 600, 700]):
    df_r2 = pd.DataFrame(index=indexes, columns=columns)
    df_rmse = pd.DataFrame(index=indexes, columns=columns)
    
    X = data.drop(columns=target)
    Y = data[target]

    i = 0
    for train_i, test_i in kf.split(X):
        train[i] = train_i
        test[i] = test_i
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        Y_train, Y_test = Y.iloc[train_i], Y.iloc[test_i]
    
        mlr_model = LinearRegression().fit(X_train, Y_train)
        Y_predicted = mlr_model.predict(X_test)
    
        r2 = r2_score(Y_test, Y_predicted)
        rmse= math.sqrt(mean_squared_error(Y_test, Y_predicted))
    
        df_r2['MLR'].iloc[i] = r2
        df_rmse['MLR'].iloc[i] = rmse

        for tree in trees:
            rf_model = RandomForestRegressor(n_estimators=tree).fit(X_train, Y_train)
            Y_predicted = rf_model.predict(X_test)

            r2 = r2_score(Y_test, Y_predicted)
            rmse= math.sqrt(mean_squared_error(Y_test, Y_predicted))

            df_r2['RF'+str(tree)].iloc[i] = r2 
            df_rmse['RF'+str(tree)].iloc[i] = rmse 

        i = i + 1
    
    df_r2['MLR'].iloc[-1] = df_r2['MLR'].mean()
    df_rmse['MLR'].iloc[-1] = df_rmse['MLR'].mean()
    
    for tree in trees:
        df_r2['RF'+str(tree)].iloc[-1] = df_r2['RF'+str(tree)].mean()
        df_rmse['RF'+str(tree)].iloc[-1] = df_rmse['RF'+str(tree)].mean()
        
    return df_r2, df_rmse

# RFE variable selection

In [109]:
# ALL - novo
r2, rmse = compute_models(data.drop(columns=['height', 'width']), att, [100])

In [110]:
estimator = RandomForestRegressor(n_estimators=100)

In [111]:
selector = RFE(estimator)

In [112]:
selector = selector.fit(data.drop(columns=[att, 'height', 'width']), data[att])

In [113]:
cols = data.drop(columns=[att, 'height', 'width']).columns.values

In [114]:
important_variables = selector.support_ * cols
important_variables = important_variables[important_variables != '']

In [115]:
important_variables

array(['RB_diff_full', 'RB_rate', 'RG_diff_full', 'RG_rate', 'SH_rate',
       'apex_R', 'apex_equator_R_diff', 'apex_stalk_B_diff',
       'apex_stalk_R_diff', 'area', 'cd', 'diameter', 'equator_B',
       'equator_stalk_R_diff', 'mean_B_full', 'mean_G_full',
       'mean_H_full', 'mean_a_full', 'stalk_B', 'stalk_R'], dtype=object)

In [106]:
att = 'time'

atts = set(['sst','firmeza', 'acidez', 'ratio', 'massa', 'time'])
data = data1.drop(columns=list(atts.difference(set([att]))))

In [107]:
cols = list(important_variables) + [att]
r2, rmse = compute_models(data[cols], att, [100])

In [108]:
r2.to_csv('../results/rfbestvar_%s_r2.csv' % att)
rmse.to_csv('../results/rfbestvar_%s_rmse.csv' % att)

In [19]:
# mean and size
cols = [att, 'mean_R_full', 'mean_G_full', 'mean_B_full', 'mean_H_full', 'mean_S_full', 'mean_V_full', 'mean_L_full', \
           'mean_a_full', 'mean_b_full', 'area', 'diameter', 'height', 'width']
r2, rmse = compute_models(data[cols], att, [100])

In [23]:
# mean (A1 group)
cols = [att, 'mean_R_full', 'mean_G_full', 'mean_B_full', 'mean_H_full', 'mean_S_full', 'mean_V_full', 'mean_L_full', \
           'mean_a_full']
r2, rmse = compute_models(data[cols], att, [100])

In [26]:
# dominant hsv (A3 group)
cols = [att, 'dominant_HSV']
r2, rmse = compute_models(data[cols], att, [100])

In [30]:
# A5 group
cols = [att,'RG_diff_full','RB_diff_full','GB_diff_full','apex_R','apex_G','apex_B','equator_R','equator_G','equator_B',\
        'stalk_R','stalk_G','stalk_B','apex_equator_R_diff','equator_stalk_R_diff','apex_stalk_R_diff','apex_equator_G_diff',\
        'equator_stalk_G_diff','apex_stalk_G_diff','apex_equator_B_diff','equator_stalk_B_diff','apex_stalk_B_diff', \
        'long_gradient', 'mean_R_full', 'mean_G_full', 'mean_B_full']
r2, rmse = compute_models(data[cols], att, [100])

In [34]:
# A6 group
cols = [att, 'area']
r2, rmse = compute_models(data[cols], att, [100])

In [37]:
# n regions
regions = ['region' in col for col in data.columns.values]
cols = data.columns.values[regions]
cols = np.append(cols, [att])

r2, rmse = compute_models(data[cols], att, [500])

In [19]:
att = 'firmeza'

In [20]:
# A8 group
cols = [att, 'mean_R_full', 'mean_G_full', 'mean_B_full']

r2, rmse = compute_models(data[cols], att, [100])

In [24]:
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500
fold1,37.9909,26.6008,,,,
fold2,38.2299,27.043,,,,
fold3,41.2087,26.772,,,,
fold4,40.6138,26.2011,,,,
fold5,41.2758,27.5923,,,,
media,39.8638,26.8419,,,,


In [115]:
# group A9
cols = [att, 'mean_L_full', 'mean_a_full', 'mean_b_full', 'bcd', 'cd', 'dd']

r2, rmse = compute_models(data[cols], att, [100])

In [117]:
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500
fold1,2.06773,1.1352,,,,
fold2,2.04806,1.04673,,,,
fold3,2.15984,1.09205,,,,
fold4,1.93836,0.94321,,,,
fold5,2.13936,0.874806,,,,
media,2.07067,1.0184,,,,


In [14]:
# group A11
cols = [att, 'mean_H_full']

r2, rmse = compute_models(data[cols], att, [100])

In [16]:
rmse

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500
fold1,4.58421,3.03097,,,,
fold2,4.60134,3.19806,,,,
fold3,4.32845,2.95948,,,,
fold4,4.41438,2.8888,,,,
fold5,4.43694,2.79781,,,,
media,4.47307,2.97502,,,,


In [50]:
# group A12
cols = [att, 'mean_H_full', 'mean_S_full', 'mean_V_full', 'mean_L_full', 'mean_a_full', 'mean_b_full']

r2, rmse = compute_models(data[cols], att, [100])

In [53]:
# group A13
cols = [att, 'mean_H_full', 'mean_S_full', 'mean_V_full', 'mean_R_full', 'mean_G_full', 'mean_B_full', 'RG_rate', 'RB_rate',\
       'SH_rate']

r2, rmse = compute_models(data[cols], att, [100])

In [56]:
# group A14
cols = [att, 'mean_b_full', 'area', 'diameter']

r2, rmse = compute_models(data[cols], att, [100])

In [106]:
cols = [att, 'bcd', 'cd', 'cd', 'mean_L_full', 'mean_a_full', 'mean_b_full']

r2, rmse = compute_models(data[cols], att, [100])

In [107]:
r2

Unnamed: 0,MLR,RF100,RF200,RF300,RF400,RF500
fold1,0.78789,0.958657,,,,
fold2,0.801515,0.941756,,,,
fold3,0.787255,0.922851,,,,
fold4,0.769927,0.955441,,,,
fold5,0.78554,0.950653,,,,
media,0.786426,0.945872,,,,


In [59]:
# regions (n=20), area, diameter, height, width
regions = ['region' in col for col in data.columns.values]
cols = data.columns.values[regions]
cols = np.append(cols, [att, 'area', 'width', 'height', 'diameter'])

r2, rmse = compute_models(data[cols], att, [500])

In [33]:
att = 'firmeza'

In [68]:
# ALL + novo
r2, rmse = compute_models(data, att, [500])