In [None]:
# Experiment using all Palmer images to predict the time of each sample, with 5-fold cross-validation

In [1]:
import cv2
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import os 
import math
import scipy

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit, KFold

%load_ext autoreload
plt.rcParams['figure.figsize'] = [20, 15]

path = os.getcwd()+'//..//imagens//'
sys.path.append(os.getcwd()+'//..//scripts//')
from MNG import MNG
from MNGFeatures import MNGFeatures
from MNGFeaturesMeans import MNGFeaturesMeans
from MNGFeaturesSize import MNGFeaturesSize
from MNGModel import MNGModel

In [118]:
# Number of cross-validation folds. The splitter, the result-table row labels
# and the index buffers below are all derived from it so they cannot drift apart.
cv_num = 5

# Row labels of the result tables: one row per fold plus a final row that
# holds the mean over the folds ('media').
indexes = ['fold' + str(k) for k in range(1, cv_num + 1)] + ['media']
# Column labels of the result tables: the linear regression ('MLR') and the
# random forests with 100..700 trees ('RF<n>').
columns = ['MLR'] + ['RF' + str(n) for n in range(100, 701, 100)]

data_path = os.getcwd()+'/../resampling/all_data_synthetic.csv'
old_data = pd.read_csv(data_path, sep=',', index_col=0)

# ss = ShuffleSplit(n_splits=cv_num, test_size=0.2,random_state=0)
# A fixed seed makes the shuffled folds identical on every run and for every
# feature group, so scores are reproducible and comparable across cells.
kf = KFold(n_splits=cv_num, shuffle=True, random_state=0)

# Per-fold positional indices of the most recent split (filled by compute_models).
train = [[] for _ in range(cv_num)]
test = [[] for _ in range(cv_num)]

In [119]:
# Rows from position 600 onwards are the candidates for removal.
second_half_df = old_data.iloc[600:]

# Draw 120 distinct positions out of the first 719 candidate rows. A dedicated
# seeded generator keeps the selection identical across kernel restarts, so the
# sampled positions do not have to be pasted back in by hand to repeat a run.
to_be_deleted = random.Random(0).sample(range(0, 719), 120)
rows_to_delete = second_half_df.iloc[to_be_deleted]

In [91]:
# repeated
# second_half_df = old_data.iloc[600:]
# to_be_deleted = [616,538,436,590,401,583,239,263,285,317,670,634,467,249,287,210,492,119,607,442,706,
# 343,183,201,394,665,559,345,217,214,455,699,269,39,92,689,279,504,1,272,97,28,277,48,462,137,313,686,575,291,
# 496,673,487,311,589,315,50,171,105,247,181,190,426,178,546,235,540,551,157,26,281,156,206,658,399,257,698,402,
# 290,350,20,533,330,479,111,128,332,107,264,124,574,195,294,639,672,208,186,597,227,611,457,267,324,163,295,653,
# 702,682,130,408,640,268,96,8,386,705,471,429,405,387]
# rows_to_delete = second_half_df.iloc[to_be_deleted]

In [122]:
# synthetic
# second_half_df = old_data.iloc[600:]
# to_be_deleted = [453,564,77,177,110,248,26,589,459,389,681,610,489,676,467,187,184,223,305,494,390,137,302,208,
#                   207,227,577,244,568,472,532,545,16,271,393,30,369,473,340,99,611,477,388,468,365,580,23,64,50,
#                  66,566,344,696,199,517,224,624,232,688,338,349,356,44,439,716,83,641,274,252,622,143,446,179,265,
#                  36,113,277,551,391,701,192,614,637,538,3,669,91,569,586,392,43,4,615,134,124,375,457,310,625,300,
#                  450,562,575,709,594,169,95,226,128,231,87,229,519,81,127,315,194,667,220,294]
# rows_to_delete = second_half_df.iloc[to_be_deleted]

In [123]:
# Working dataset: the loaded table without the rows selected for removal.
data = old_data.drop(index=rows_to_delete.index.values)

In [124]:
# Rebuild `data` from its inputs instead of dropping in place: an in-place drop
# raises KeyError when the cell is run a second time because the columns are
# already gone. This form gives the same result no matter how often it runs.
data = (
    old_data
    .drop(index=rows_to_delete.index.values)
    .drop(columns=['sst', 'firmeza'])
)

In [125]:
def _evaluate_model(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model`` on the training rows and return ``(r2, rmse)`` on the test rows."""
    Y_predicted = model.fit(X_train, Y_train).predict(X_test)
    r2 = r2_score(Y_test, Y_predicted)
    rmse = math.sqrt(mean_squared_error(Y_test, Y_predicted))
    return r2, rmse


def compute_models(data, target, trees=(100, 200, 300, 400, 500, 600, 700), random_state=None):
    """Cross-validate a linear regression and random forests of several sizes.

    For every split produced by the module-level ``kf`` splitter, a
    ``LinearRegression`` and one ``RandomForestRegressor`` per entry of
    ``trees`` are fitted on the training rows and scored on the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    target : str
        Name of the column to predict; every other column is used as a feature.
    trees : int or iterable of int, optional
        Forest size(s) passed as ``n_estimators``. Scores are stored in the
        column ``'RF<n>'``; a size without a matching entry in the module-level
        ``columns`` gets a new column instead of raising ``KeyError``.
    random_state : optional
        Forwarded to every ``RandomForestRegressor``. The default ``None``
        leaves the forests unseeded.

    Returns
    -------
    (df_r2, df_rmse) : tuple of pandas.DataFrame
        R2 and RMSE tables indexed by the module-level ``indexes``: one row per
        fold (in split order) and the last row holding the mean over the folds.
        Columns of models that were not evaluated stay NaN.

    Notes
    -----
    The positional train/test indices of each fold are written to the
    module-level ``train`` and ``test`` lists, which therefore need at least as
    many slots as ``kf`` yields splits.
    """
    # Accept a single forest size as well as a collection of sizes.
    if isinstance(trees, (int, np.integer)):
        trees = [trees]
    else:
        trees = list(trees)
    model_names = ['MLR'] + ['RF' + str(n) for n in trees]

    df_r2 = pd.DataFrame(index=indexes, columns=columns)
    df_rmse = pd.DataFrame(index=indexes, columns=columns)

    X = data.drop(columns=target)
    Y = data[target]

    fold_labels = []
    for i, (train_i, test_i) in enumerate(kf.split(X)):
        train[i] = train_i
        test[i] = test_i
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        Y_train, Y_test = Y.iloc[train_i], Y.iloc[test_i]

        fold = df_r2.index[i]
        fold_labels.append(fold)

        models = [LinearRegression()]
        models += [RandomForestRegressor(n_estimators=n, random_state=random_state) for n in trees]

        for name, model in zip(model_names, models):
            r2, rmse = _evaluate_model(model, X_train, Y_train, X_test, Y_test)
            # Single-step .loc assignment: the chained form df[col].iloc[i] = v
            # writes to a temporary object and is a no-op under Copy-on-Write.
            df_r2.loc[fold, name] = r2
            df_rmse.loc[fold, name] = rmse

    # The last row of each table holds the mean over the fold rows only.
    mean_row = df_r2.index[-1]
    for name in model_names:
        df_r2.loc[mean_row, name] = df_r2.loc[fold_labels, name].astype(float).mean()
        df_rmse.loc[mean_row, name] = df_rmse.loc[fold_labels, name].astype(float).mean()

    return df_r2, df_rmse

In [126]:
# Mean-colour and size features.
cols = [
    'time',
    'mean_R_full', 'mean_G_full', 'mean_B_full',
    'mean_H_full', 'mean_S_full', 'mean_V_full',
    'mean_L_full', 'mean_a_full', 'mean_b_full',
    'area', 'diameter', 'height', 'width',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [129]:
# A1 group: mean-colour features.
# NOTE(review): 'mean_b_full' is absent here although the "mean and size" cell
# includes it - confirm this is intentional.
cols = [
    'time',
    'mean_R_full', 'mean_G_full', 'mean_B_full',
    'mean_H_full', 'mean_S_full', 'mean_V_full',
    'mean_L_full', 'mean_a_full',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [132]:
# A2 group: size features.
cols = [
    'time',
    'area', 'diameter', 'height', 'width',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [135]:
# A3 group: dominant HSV feature.
cols = [
    'time',
    'dominant_HSV',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [138]:
# A4 group: rate features.
cols = [
    'time',
    'RG_rate', 'RB_rate', 'SH_rate',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [141]:
# A5 group: channel differences, per-part (apex / equator / stalk) channel
# values and their pairwise differences, the 'long_gradient' column and the
# full-image RGB means.
cols = [
    'time',
    'RG_diff_full', 'RB_diff_full', 'GB_diff_full',
    'apex_R', 'apex_G', 'apex_B',
    'equator_R', 'equator_G', 'equator_B',
    'stalk_R', 'stalk_G', 'stalk_B',
    'apex_equator_R_diff', 'equator_stalk_R_diff', 'apex_stalk_R_diff',
    'apex_equator_G_diff', 'equator_stalk_G_diff', 'apex_stalk_G_diff',
    'apex_equator_B_diff', 'equator_stalk_B_diff', 'apex_stalk_B_diff',
    'long_gradient',
    'mean_R_full', 'mean_G_full', 'mean_B_full',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [144]:
# A6 group: area only.
cols = [
    'time',
    'area',
]
r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [147]:
# A7 group: every column whose name contains 'region', plus the target.
regions = ['region' in name for name in data.columns.values]
cols = np.append(data.columns.values[regions], ['time'])

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [150]:
# A9 group: L/a/b means together with the 'bcd', 'cd' and 'dd' columns.
cols = [
    'time',
    'mean_L_full', 'mean_a_full', 'mean_b_full',
    'bcd', 'cd', 'dd',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [154]:
# A10 group: the 'bcd', 'cd' and 'dd' columns only.
cols = [
    'time',
    'bcd', 'cd', 'dd',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [157]:
# A11 group: H/S/V means.
cols = [
    'time',
    'mean_H_full', 'mean_S_full', 'mean_V_full',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [160]:
# A12 group: H/S/V means plus L/a/b means.
cols = [
    'time',
    'mean_H_full', 'mean_S_full', 'mean_V_full',
    'mean_L_full', 'mean_a_full', 'mean_b_full',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [163]:
# A13 group: H/S/V means, R/G/B means and the rate columns.
cols = [
    'time',
    'mean_H_full', 'mean_S_full', 'mean_V_full',
    'mean_R_full', 'mean_G_full', 'mean_B_full',
    'RG_rate', 'RB_rate', 'SH_rate',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [166]:
# A14 group: 'mean_b_full' with area and diameter.
cols = [
    'time',
    'mean_b_full',
    'area', 'diameter',
]

r2, rmse = compute_models(data[cols], target='time', trees=[100])

In [172]:
# Region columns (n=20 per the original note) together with the target and the
# size columns; evaluated with a 500-tree forest.
regions = ['region' in name for name in data.columns.values]
cols = np.append(
    data.columns.values[regions],
    ['time', 'area', 'width', 'height', 'diameter'],
)

r2, rmse = compute_models(data[cols], target='time', trees=[500])

In [169]:
# "ALL - novo" (presumably: all features minus the new ones - confirm):
# the columns at positions 1710..1747 plus the target.
cols = np.append(data.columns.values[1710:1748], ['time'])

r2, rmse = compute_models(data[cols], target='time', trees=[500])

In [175]:
# "ALL + novo" (presumably: all features plus the new ones - confirm):
# everything that remains after dropping the first 1710 columns by label.
r2, rmse = compute_models(
    data.drop(columns=data.columns.values[:1710]),
    target='time',
    trees=[500],
)