In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import pandas as pd
from datetime import datetime

In [2]:
def prepara_dataset(dataset, verbose=True):
    ids = dataset['id'].copy()
    dataset = dataset.drop(['id'], axis=1)
    dataset['date'] = dataset['date'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d").day)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        if num_na > 0:
            placeholder = dataset[column].median()
            if np.isnan(placeholder):
                if verbose:
                    print(
                        'Empty column {:12s}... Filling with zero'.format(column))
                placeholder = 0
            dataset[column] = dataset[column].fillna(placeholder)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        assert num_na == 0
    return dataset, ids


In [3]:
def separa_dataset(dataset, com_date=True):
    cols_input = []
    cols_output = []
    for col in dataset.columns:
        if col.startswith("input") or (col.startswith('date') and com_date):
            cols_input.append(col)
        elif col.startswith("output"):
            cols_output.append(col)
        else:
            print("Unexpected column name:", col)
            continue

    inputs = dataset[cols_input]
    outputs = dataset[cols_output]
    return inputs, outputs

In [4]:
preprocess_folder = 'KaggleDatasets/PRE/'
result_folder = 'KaggleDatasets/result/'
filename = '2019result'

drive_path = '/content/drive/My Drive/'
filename2018 = 'KaggleDatasets/PRE/preprocessado2018.csv'
filename2019 = 'KaggleDatasets/PRE/preprocessado2019.csv'

dataset2018 = pd.read_csv(filename2018)
dataset2019raw = pd.read_csv(filename2019)

In [5]:
output_columns = dataset2018.columns[dataset2018.columns.str.contains("output")]

In [6]:
# Preprocess 2018 output dataset
inputs, outputs = separa_dataset(dataset2018)

# Parâmetros
x = 55 # x%
y = 35 # y%
z = 20 # z%

total = len(outputs)
median_list = []

variance = outputs.var()
mean = outputs.mean()

for column in outputs:
    num_na = outputs[column].isnull().sum()
    if num_na > 0:
        median_list.append(column)
        
median = outputs.median()
for column in median_list:
    outputs[column] = outputs[column].fillna(median[column])

Unexpected column name: id


In [9]:
# Separate data
faction_of_2018data_to_use = 0.5

inputs18 = inputs.sample(frac=faction_of_2018data_to_use)
outputs18 = outputs.sample(frac=faction_of_2018data_to_use)

X_train, X_val, Y_train, Y_val = train_test_split(
        inputs18, outputs18, test_size=0.30, random_state=57)

dataset2019, outputs19 = separa_dataset(dataset2019raw)

print("Input:", inputs18)
print("Output:", outputs18)

Unexpected column name: id
Input:        date  input_0  input_1  input_2  input_3  input_4_1  input_5_1  \
281       3      110        1      122      114   0.185172   1.702358   
6120      6      169        0       25       37   0.578330  -0.141271   
26154     4      231        2       91       91  -0.688511  -0.213180   
11793     0       89        1       30       39   0.017425  -0.211437   
25887     1      158        3      197       11  -0.767143  -0.352420   
...     ...      ...      ...      ...      ...        ...        ...   
3918      6      282        2       48       54  -0.471838  -0.021371   
11845     1       20        1      180      140  -0.326806  -0.158749   
5986      4      201        2      145      133  -0.782869  -0.021371   
18780     3      219        0       68       73   1.013425  -0.015110   
1678      6      297        2      116      109  -0.421164  -0.213180   

       input_6_1  input_7_1  input_8_1  ...  input_547_14  input_548_14  \
281    -0.7242

In [7]:
# define model
model = RandomForestRegressor(n_estimators=10, n_jobs=4)

In [10]:
# fit model
model.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [11]:
predict2018 = model.predict(X_val)
model.score(X_val, Y_val)
#predict2018.shape

-0.1388681451527942

In [15]:
# Predict 2019 results and save them
predict2019 = model.predict(dataset2019)
predict2019
#predict2019.to_csv(result_folder + 'RF_predict2019.csv', index=False)

array([[-0.26043934,  0.33311974,  0.53841073, ...,  0.04683439,
         0.72145411,  0.70676774],
       [-0.12010732, -0.10946814, -0.17165332, ..., -0.13362011,
         0.25587856,  0.38767853],
       [ 0.17320764,  0.46305256,  0.69531852, ...,  0.43644576,
         0.80717935, -0.30035036],
       ...,
       [-0.1813579 ,  0.30747917,  0.65930696, ...,  0.08659273,
         0.53209002,  0.3846695 ],
       [-0.10015622,  0.30296389,  0.20936658, ..., -0.21260475,
        -0.10770408, -0.29503315],
       [-0.19101714,  0.05167626,  0.02075691, ..., -0.06039921,
         0.12652669,  0.02754792]])

In [16]:
# Prepare Submission
df_pred_sub = pd.DataFrame(predict2019)
df_pred_sub

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,-0.260439,0.333120,0.538411,0.473605,0.289199,0.193781,0.522744,0.637084,-0.162097,0.285323,...,0.724050,0.016658,0.248102,0.409952,0.214199,0.689965,-0.138431,0.046834,0.721454,0.706768
1,-0.120107,-0.109468,-0.171653,0.057761,-0.063709,-0.328687,-0.327435,-0.429576,-0.213457,-0.093378,...,0.422193,-0.011888,0.287477,-0.026305,0.071464,0.187786,-0.144245,-0.133620,0.255879,0.387679
2,0.173208,0.463053,0.695319,0.276558,0.039522,0.652164,0.338811,0.003624,0.330115,0.515108,...,0.680934,0.390526,-0.024801,0.602218,0.758163,0.308021,-0.135471,0.436446,0.807179,-0.300350
3,0.058329,0.252301,0.556719,0.221581,-0.057954,0.653910,0.327469,0.949982,0.927877,0.365948,...,0.769278,0.295272,-0.228161,0.463388,0.597859,0.522505,0.426436,0.102488,0.529745,0.638336
4,0.019049,0.279683,0.368260,0.008982,-0.065971,0.108110,0.195401,0.541348,-0.067683,0.379882,...,0.168087,0.084257,-0.080668,0.204948,0.296501,0.239310,-0.099085,0.329299,0.355689,-0.158616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13755,0.397339,-0.188434,-0.199843,0.052423,-0.126861,-0.198821,-0.248082,0.141014,-0.069561,-0.076821,...,0.275540,0.152044,-0.099605,0.063135,0.169853,-0.094014,-0.113149,0.032179,0.126445,-0.211655
13756,0.135542,0.867967,0.872893,1.063816,-0.014657,0.027832,0.341597,0.254839,0.127410,0.839016,...,0.226579,0.752256,0.005213,0.889549,0.630684,0.459498,-0.150111,0.812660,0.414226,-0.145001
13757,-0.181358,0.307479,0.659307,0.256488,-0.050855,0.293227,-0.023348,-0.016889,-0.115727,0.512763,...,0.281408,-0.241682,-0.190725,0.074815,0.377281,0.391405,-0.166385,0.086593,0.532090,0.384669
13758,-0.100156,0.302964,0.209367,0.254602,0.214935,0.289533,0.283830,0.637319,0.038190,0.251040,...,-0.165787,0.083103,-0.235114,0.192168,0.331576,0.138626,0.442205,-0.212605,-0.107704,-0.295033


In [17]:
df_pred_sub.columns = output_columns
df_pred_sub

Unnamed: 0,output_1_0,output_2_0,output_3_0,output_4_0,output_5_0,output_6_0,output_7_0,output_8_0,output_9_0,output_10_0,...,output_7_6,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6
0,-0.260439,0.333120,0.538411,0.473605,0.289199,0.193781,0.522744,0.637084,-0.162097,0.285323,...,0.724050,0.016658,0.248102,0.409952,0.214199,0.689965,-0.138431,0.046834,0.721454,0.706768
1,-0.120107,-0.109468,-0.171653,0.057761,-0.063709,-0.328687,-0.327435,-0.429576,-0.213457,-0.093378,...,0.422193,-0.011888,0.287477,-0.026305,0.071464,0.187786,-0.144245,-0.133620,0.255879,0.387679
2,0.173208,0.463053,0.695319,0.276558,0.039522,0.652164,0.338811,0.003624,0.330115,0.515108,...,0.680934,0.390526,-0.024801,0.602218,0.758163,0.308021,-0.135471,0.436446,0.807179,-0.300350
3,0.058329,0.252301,0.556719,0.221581,-0.057954,0.653910,0.327469,0.949982,0.927877,0.365948,...,0.769278,0.295272,-0.228161,0.463388,0.597859,0.522505,0.426436,0.102488,0.529745,0.638336
4,0.019049,0.279683,0.368260,0.008982,-0.065971,0.108110,0.195401,0.541348,-0.067683,0.379882,...,0.168087,0.084257,-0.080668,0.204948,0.296501,0.239310,-0.099085,0.329299,0.355689,-0.158616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13755,0.397339,-0.188434,-0.199843,0.052423,-0.126861,-0.198821,-0.248082,0.141014,-0.069561,-0.076821,...,0.275540,0.152044,-0.099605,0.063135,0.169853,-0.094014,-0.113149,0.032179,0.126445,-0.211655
13756,0.135542,0.867967,0.872893,1.063816,-0.014657,0.027832,0.341597,0.254839,0.127410,0.839016,...,0.226579,0.752256,0.005213,0.889549,0.630684,0.459498,-0.150111,0.812660,0.414226,-0.145001
13757,-0.181358,0.307479,0.659307,0.256488,-0.050855,0.293227,-0.023348,-0.016889,-0.115727,0.512763,...,0.281408,-0.241682,-0.190725,0.074815,0.377281,0.391405,-0.166385,0.086593,0.532090,0.384669
13758,-0.100156,0.302964,0.209367,0.254602,0.214935,0.289533,0.283830,0.637319,0.038190,0.251040,...,-0.165787,0.083103,-0.235114,0.192168,0.331576,0.138626,0.442205,-0.212605,-0.107704,-0.295033


In [18]:
df_pred_sub['id']   = dataset2019raw['id']
df_pred_sub.head()

Unnamed: 0,output_1_0,output_2_0,output_3_0,output_4_0,output_5_0,output_6_0,output_7_0,output_8_0,output_9_0,output_10_0,...,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6,id
0,-0.260439,0.33312,0.538411,0.473605,0.289199,0.193781,0.522744,0.637084,-0.162097,0.285323,...,0.016658,0.248102,0.409952,0.214199,0.689965,-0.138431,0.046834,0.721454,0.706768,45958
1,-0.120107,-0.109468,-0.171653,0.057761,-0.063709,-0.328687,-0.327435,-0.429576,-0.213457,-0.093378,...,-0.011888,0.287477,-0.026305,0.071464,0.187786,-0.144245,-0.13362,0.255879,0.387679,46012
2,0.173208,0.463053,0.695319,0.276558,0.039522,0.652164,0.338811,0.003624,0.330115,0.515108,...,0.390526,-0.024801,0.602218,0.758163,0.308021,-0.135471,0.436446,0.807179,-0.30035,46066
3,0.058329,0.252301,0.556719,0.221581,-0.057954,0.65391,0.327469,0.949982,0.927877,0.365948,...,0.295272,-0.228161,0.463388,0.597859,0.522505,0.426436,0.102488,0.529745,0.638336,46120
4,0.019049,0.279683,0.36826,0.008982,-0.065971,0.10811,0.195401,0.541348,-0.067683,0.379882,...,0.084257,-0.080668,0.204948,0.296501,0.23931,-0.099085,0.329299,0.355689,-0.158616,53812


In [21]:
## submission
df_sub = []
for i, row in df_pred_sub.iterrows():
    for column, value in zip(output_columns, row.values):
        id = "{}_{}".format(int(row.id), column)
        df_sub.append([id, value])

df_sub = pd.DataFrame(df_sub)
df_sub.columns = ['id', 'value']
df_sub.to_csv(result_folder + 'RF50_predict2019_submission.csv', index=False)
df_sub

Unnamed: 0,id,value
0,45958_output_1_0,-0.260439
1,45958_output_2_0,0.333120
2,45958_output_3_0,0.538411
3,45958_output_4_0,0.473605
4,45958_output_5_0,0.289199
...,...,...
1541115,66567_output_12_6,-0.012578
1541116,66567_output_13_6,-0.121220
1541117,66567_output_14_6,-0.060399
1541118,66567_output_15_6,0.126527


In [57]:
output_len = 112
weights_dict = {
    0: 1.00,
    1: 0.75,
    2: 0.60,
    3: 0.50,
    4: 0.43,
    5: 0.38,
    6: 0.33
}
weights = [weights_dict[i // 16] for i in range(output_len)]
sum_weights = np.sum(weights)

def compute_loss(Y_true, Y_pred):
    squares = np.square(Y_true - Y_pred)
    weighted = squares * weights
    mean = np.sum(weighted) / sum_weights
    return mean

In [66]:
# Prediction 2018 error
error = 0
for i in range(112):
    error += compute_loss(predict2018[i], Y_val.iloc[i])
root_error = np.sqrt(np.sum(error)/112)
root_error

0.9728841135358685