In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR 
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import pandas as pd
from datetime import datetime

In [2]:
def prepara_dataset(dataset, verbose=True):
    ids = dataset['id'].copy()
    dataset = dataset.drop(['id'], axis=1)
    dataset['date'] = dataset['date'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d").day)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        if num_na > 0:
            placeholder = dataset[column].median()
            if np.isnan(placeholder):
                if verbose:
                    print(
                        'Empty column {:12s}... Filling with zero'.format(column))
                placeholder = 0
            dataset[column] = dataset[column].fillna(placeholder)

    for column in dataset:
        num_na = dataset[column].isnull().sum()
        assert num_na == 0
    return dataset, ids


In [3]:
def separa_dataset(dataset, com_date=True):
    cols_input = []
    cols_output = []
    for col in dataset.columns:
        if col.startswith("input") or (col.startswith('date') and com_date):
            cols_input.append(col)
        elif col.startswith("output"):
            cols_output.append(col)
        else:
            print("Unexpected column name:", col)
            continue

    inputs = dataset[cols_input]
    outputs = dataset[cols_output]
    return inputs, outputs

In [7]:
preprocess_folder = 'KaggleDatasets/PRE/'
result_folder = 'KaggleDatasets/result/'
filename = '2019result'

#drive_path = '/content/drive/My Drive/'
filename2018 = 'KaggleDatasets/PRE/preprocessado2018.csv'
filename2019 = 'KaggleDatasets/PRE/preprocessado2019.csv'

dataset2018 = pd.read_csv(filename2018)
dataset2019 = pd.read_csv(filename2019)

In [8]:
output_columns = dataset2018.columns[dataset2018.columns.str.contains("output")]

In [9]:
# Preprocess 2018 output dataset
inputs, outputs = separa_dataset(dataset2018)

# Parâmetros
x = 55 # x%
y = 35 # y%
z = 20 # z%

total = len(outputs)
median_list = []

variance = outputs.var()
mean = outputs.mean()

for column in outputs:
    num_na = outputs[column].isnull().sum()
    if num_na > 0:
        median_list.append(column)
        
median = outputs.median()
for column in median_list:
    outputs[column] = outputs[column].fillna(median[column])

Unexpected column name: id


In [20]:
# Separate data
faction_of_2018data_to_use = 0.5

inputs18 = inputs.sample(frac=faction_of_2018data_to_use)
outputs18 = outputs.sample(frac=faction_of_2018data_to_use)

X_train, X_val, Y_train, Y_val = train_test_split(
        inputs18, outputs18, test_size=0.30, random_state=57)

dataset19, outputs19 = separa_dataset(dataset2019)

print("Input:", inputs18)
print("Output:", outputs18)

Unexpected column name: id
Input:        date  input_0  input_1  input_2  input_3  input_4_1  input_5_1  \
25472     3      153        0      196       11   1.368140   1.667997   
4216      3      177        0       15       28   1.509677  -0.143717   
17119     3      293        3      181      140  -0.847522  -0.033802   
23882     4      175        2       27       37  -0.305838  -0.141271   
24656     0       70        0      171      143   0.810730   1.446824   
...     ...      ...      ...      ...      ...        ...        ...   
27567     1      142        3      198       12  -0.739185  -0.429822   
24680     0      178        0       23       35   1.355909  -0.141271   
14170     2      101        2       37       45  -0.774132  -0.211437   
26122     4      124        2       83       85  -0.520764  -0.368771   
18648     2       64        0      114      107   1.490456  -0.235802   

       input_6_1  input_7_1  input_8_1  ...  input_547_14  input_548_14  \
25472   1.1969

In [22]:
# define model
model = LinearSVR(max_iter=500)
wrapper = MultiOutputRegressor(model)

In [12]:
# fit model
wrapper.fit(X_train, Y_train)

MultiOutputRegressor(estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0,
                                         fit_intercept=True,
                                         intercept_scaling=1.0,
                                         loss='epsilon_insensitive',
                                         max_iter=1000, random_state=None,
                                         tol=0.0001, verbose=0),
                     n_jobs=None)

In [13]:
predict2018 = wrapper.predict(X_val)
wrapper.score(X_val, Y_val)

-2.209758147136553

In [None]:
## Save what I predicted for 2018 and its true value
#predict2018.to_csv(result_folder + 'RF_predict2018.csv', index=False)
#Y_val.to_csv(result_folder + 'RF_predict2018.csv', index=False)

In [15]:
# Predict 2019 results and save them
predict2019 = wrapper.predict(dataset19)
#predict2019.to_csv(result_folder + 'RF_predict2019.csv', index=False)

In [16]:
# Prepare Submission
df_pred_sub = pd.DataFrame(predict2019)
df_pred_sub.columns = output_columns
df_pred_sub['id']   = dataset2019['id']
df_pred_sub.head()

Unnamed: 0,output_1_0,output_2_0,output_3_0,output_4_0,output_5_0,output_6_0,output_7_0,output_8_0,output_9_0,output_10_0,...,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6,id
0,1.43153,-0.896022,0.224942,-1.690461,0.57442,0.995788,0.121546,0.205928,4.661618,0.356827,...,-0.908765,0.279112,-2.342654,0.947144,1.726808,-1.766269,0.79101,1.680125,1.866281,45958
1,-2.623554,-3.177371,-0.397279,1.441993,-1.794626,-2.077025,-0.894806,0.128864,1.208914,-2.933636,...,-1.136683,-0.231887,-1.960348,0.054931,-0.373214,-4.671983,-0.399054,-2.041173,-1.256665,46012
2,0.440938,2.69718,1.790295,2.462359,-1.030692,2.690288,2.282275,0.328724,-0.710685,3.826643,...,1.773921,0.532959,1.777215,4.299488,3.65877,5.88081,1.627374,2.521046,1.915341,46066
3,0.159714,0.280266,0.317067,0.1272,0.604384,0.272936,1.597654,-1.178281,0.779476,0.63877,...,-0.708929,0.290225,0.806899,2.05389,0.740565,0.564951,1.500128,1.899072,0.014482,46120
4,1.318494,0.360361,-0.784186,0.00126,0.460315,-1.840311,-0.45062,-2.290571,-5.27287,0.328521,...,-1.606328,0.011257,-2.357395,-1.359317,-0.566364,-1.556152,-0.666196,-1.156865,-1.122683,53812


In [17]:
## submission
df_sub = []
for i, row in df_pred_sub.iterrows():
    for column, value in zip(output_columns, row.values):
        id = "{}_{}".format(int(row.id), column)
        df_sub.append([id, value])

df_sub = pd.DataFrame(df_sub)
df_sub.columns = ['id', 'value']
df_sub.to_csv(result_folder + 'SVR_predict2019_submission.csv', index=False)
df_sub

Unnamed: 0,id,value
0,45958_output_1_0,1.431530
1,45958_output_2_0,-0.896022
2,45958_output_3_0,0.224942
3,45958_output_4_0,-1.690461
4,45958_output_5_0,0.574420
...,...,...
1541115,66567_output_12_6,-0.272249
1541116,66567_output_13_6,-0.079057
1541117,66567_output_14_6,0.209247
1541118,66567_output_15_6,0.158398


In [18]:
output_len = 112
weights_dict = {
    0: 1.00,
    1: 0.75,
    2: 0.60,
    3: 0.50,
    4: 0.43,
    5: 0.38,
    6: 0.33
}
weights = [weights_dict[i // 16] for i in range(output_len)]
sum_weights = np.sum(weights)

def compute_loss(Y_true, Y_pred):
    squares = np.square(Y_true - Y_pred)
    weighted = squares * weights
    mean = np.sum(weighted) / sum_weights
    return mean

In [19]:
# Prediction 2018 error
error = 0
for i in range(112):
    error += compute_loss(predict2018[i], Y_val.iloc[i])
root_error = np.sqrt(np.sum(error)/112)
root_error

1.51981809128518