In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## Importando dependencias

In [1]:
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR 
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import pandas as pd
from datetime import datetime

### Funções auxiliares

In [2]:
def prepara_dataset(dataset, verbose=True):
    ids = dataset['id'].copy()
    dataset = dataset.drop(['id'], axis=1)
    dataset['date'] = dataset['date'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d").day)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        if num_na > 0:
            placeholder = dataset[column].median()
            if np.isnan(placeholder):
                if verbose:
                    print(
                        'Empty column {:12s}... Filling with zero'.format(column))
                placeholder = 0
            dataset[column] = dataset[column].fillna(placeholder)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        assert num_na == 0
    return dataset, ids


In [3]:
def separa_dataset(dataset, com_date=True):
    cols_input = []
    cols_output = []
    for col in dataset.columns:
        if col.startswith("input") or (col.startswith('date') and com_date):
            cols_input.append(col)
        elif col.startswith("output"):
            cols_output.append(col)
        else:
            print("Unexpected column name:", col)
            continue

    inputs = dataset[cols_input]
    outputs = dataset[cols_output]
    return inputs, outputs

## Carregando os dados

In [7]:
preprocess_folder = 'KaggleDatasets/PRE/'
result_folder = 'KaggleDatasets/result/'
filename = '2019result'

#drive_path = '/content/drive/My Drive/'
filename2018 = 'KaggleDatasets/PRE/preprocessado2018.csv'
filename2019 = 'KaggleDatasets/PRE/preprocessado2019.csv'

dataset2018 = pd.read_csv(filename2018)
dataset2019 = pd.read_csv(filename2019)

In [8]:
output_columns = dataset2018.columns[dataset2018.columns.str.contains("output")]

### Pre processando os dados de saída

In [9]:
# Preprocess 2018 output dataset
inputs, outputs = separa_dataset(dataset2018)

# Parâmetros
x = 55 # x%
y = 35 # y%
z = 20 # z%

total = len(outputs)
median_list = []

variance = outputs.var()
mean = outputs.mean()

for column in outputs:
    num_na = outputs[column].isnull().sum()
    if num_na > 0:
        median_list.append(column)
        
median = outputs.median()
for column in median_list:
    outputs[column] = outputs[column].fillna(median[column])

Unexpected column name: id


### Separando dados de treino e de validação

In [20]:
# Separate data
faction_of_2018data_to_use = 0.5

inputs18 = inputs.sample(frac=faction_of_2018data_to_use)
outputs18 = outputs.sample(frac=faction_of_2018data_to_use)

X_train, X_val, Y_train, Y_val = train_test_split(
        inputs18, outputs18, test_size=0.30, random_state=57)

dataset19, outputs19 = separa_dataset(dataset2019)

print("Input:", inputs18)
print("Output:", outputs18)

Unexpected column name: id
Input:        date  input_0  input_1  input_2  input_3  input_4_1  input_5_1  \
25472     3      153        0      196       11   1.368140   1.667997   
4216      3      177        0       15       28   1.509677  -0.143717   
17119     3      293        3      181      140  -0.847522  -0.033802   
23882     4      175        2       27       37  -0.305838  -0.141271   
24656     0       70        0      171      143   0.810730   1.446824   
...     ...      ...      ...      ...      ...        ...        ...   
27567     1      142        3      198       12  -0.739185  -0.429822   
24680     0      178        0       23       35   1.355909  -0.141271   
14170     2      101        2       37       45  -0.774132  -0.211437   
26122     4      124        2       83       85  -0.520764  -0.368771   
18648     2       64        0      114      107   1.490456  -0.235802   

       input_6_1  input_7_1  input_8_1  ...  input_547_14  input_548_14  \
25472   1.1969

## Definindo modelo e treinando

In [22]:
# define model
model = LinearSVR(max_iter=500)
wrapper = MultiOutputRegressor(model)

In [23]:
# fit model
wrapper.fit(X_train, Y_train)

MultiOutputRegressor(estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0,
                                         fit_intercept=True,
                                         intercept_scaling=1.0,
                                         loss='epsilon_insensitive',
                                         max_iter=500, random_state=None,
                                         tol=0.0001, verbose=0),
                     n_jobs=None)

Prevendo resultados para o conjunto de validação

In [24]:
predict2018 = wrapper.predict(X_val)
wrapper.score(X_val, Y_val)

-1.9557364467634493

### Prevendo resultados para a submissao no Kaggle

In [25]:
# Predict 2019 results and save them
predict2019 = wrapper.predict(dataset19)
#predict2019.to_csv(result_folder + 'RF_predict2019.csv', index=False)

## Preparando os dados para a submissao

In [26]:
# Prepare Submission
df_pred_sub = pd.DataFrame(predict2019)
df_pred_sub.columns = output_columns
df_pred_sub['id']   = dataset2019['id']
df_pred_sub.head()

Unnamed: 0,output_1_0,output_2_0,output_3_0,output_4_0,output_5_0,output_6_0,output_7_0,output_8_0,output_9_0,output_10_0,...,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6,id
0,0.05841,-1.941978,0.620054,-0.218404,-2.614197,-1.885596,-0.717506,-0.809196,-0.73071,-0.51231,...,1.737176,-1.480167,0.505715,-1.000641,1.981015,-1.069503,-0.148851,-0.628339,1.595796,45958
1,-0.345637,-1.763555,-1.006448,-1.238677,1.393404,-0.083126,-1.286953,-2.146786,-2.564988,-0.155299,...,0.099839,-0.677861,-2.0798,-2.277678,-0.961105,0.736262,-1.714036,-2.528708,0.504663,46012
2,0.403467,-2.240753,-1.272434,0.619603,0.428623,0.043805,-0.53023,-0.901375,-0.409321,0.213852,...,0.944439,0.54993,0.018488,-0.917282,0.249352,-1.422968,0.966263,0.265388,0.680158,46066
3,0.12211,-0.425124,0.230432,-0.38018,-0.036276,0.090393,0.753262,-0.366111,0.27533,0.82073,...,1.186905,-0.501492,0.52401,-0.20751,0.495354,0.465149,0.313253,0.644344,0.356713,46120
4,-4.107586,-4.688413,-2.835401,-2.259517,-2.351695,-1.711791,-1.905513,-3.344113,-2.460111,-2.080236,...,0.232781,-0.877815,1.038558,-0.834462,0.060524,1.133598,0.213004,2.008469,0.596834,53812


In [27]:
## submission
df_sub = []
for i, row in df_pred_sub.iterrows():
    for column, value in zip(output_columns, row.values):
        id = "{}_{}".format(int(row.id), column)
        df_sub.append([id, value])

df_sub = pd.DataFrame(df_sub)
df_sub.columns = ['id', 'value']
df_sub.to_csv(result_folder + 'SVR_predict2019_submission.csv', index=False)
df_sub

Unnamed: 0,id,value
0,45958_output_1_0,0.058410
1,45958_output_2_0,-1.941978
2,45958_output_3_0,0.620054
3,45958_output_4_0,-0.218404
4,45958_output_5_0,-2.614197
...,...,...
1541115,66567_output_12_6,0.083802
1541116,66567_output_13_6,-0.729396
1541117,66567_output_14_6,1.918636
1541118,66567_output_15_6,-0.614658


## Calculando o erro para o conjunto de validação

In [28]:
output_len = 112
weights_dict = {
    0: 1.00,
    1: 0.75,
    2: 0.60,
    3: 0.50,
    4: 0.43,
    5: 0.38,
    6: 0.33
}
weights = [weights_dict[i // 16] for i in range(output_len)]
sum_weights = np.sum(weights)

def compute_loss(Y_true, Y_pred):
    squares = np.square(Y_true - Y_pred)
    weighted = squares * weights
    mean = np.sum(weighted) / sum_weights
    return mean

In [29]:
# Prediction 2018 error
error = 0
for i in range(112):
    error += compute_loss(predict2018[i], Y_val.iloc[i])
root_error = np.sqrt(np.sum(error)/112)
root_error

1.616066125630532