In [508]:
import pandas as pd
from sklearn.svm import SVR
import numpy as np

def mape(y_pred, y_true):
    """Return the mean absolute percentage error, expressed in percent.

    Note the argument order: predictions come first, observations second.

    Args:
        y_pred: Predicted values.
        y_true: Observed values; each one is the denominator of its own
            relative error.

    Returns:
        The mean of ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100

In [518]:
# Load the preprocessed series (20-minute bins, per the file name). The
# 'date' column is removed right away and is never used as a feature.
dataset = pd.read_csv("../data/preprocessed_input_interpolate_20min_phase1and2_train.csv")
del dataset['date']

# Baseline feature columns the model starts from.
selected_col = ["hour", "precipitation", "dayofweek", "is_holiday"]

# Column to forecast, and the name of its time-shifted copy used as the target.
predict_col = "('C', 3)"
predict_col_shifted = "predict_shift_6"

# Future values of predict_col pulled back by 1, 2 and 6 rows. Only the target
# column is shifted (not the whole frame). The trailing NaNs left by the shift
# are forward-filled with Series.ffill(), which replaces the deprecated
# fillna(method="ffill") spelling.
dataset_shift_1 = dataset[predict_col].shift(periods=-1).ffill()
dataset_shift_2 = dataset[predict_col].shift(periods=-2).ffill()
dataset_shift_6 = dataset[predict_col].shift(periods=-6).ffill()

# Store the 6-row-ahead series under the name held in predict_col_shifted, so
# the column name and the variable cannot drift apart.
dataset = dataset.assign(**{predict_col_shifted: dataset_shift_6.values})

# Optional extra lag features, currently disabled:
#dataset = dataset.assign(predict_shift_1=dataset_shift_1.values)
#dataset = dataset.assign(predict_shift_2=dataset_shift_2.values)
#selected_col.append(predict_col)
#selected_col.append("predict_shift_1")
#selected_col.append("predict_shift_2")
#selected_col.append("predict_shift_6")

# Hold out the final 24 * 3 * 7 rows (one week of 20-minute bins).
holdout_rows = 24 * 3 * 7
training_set = dataset[:-holdout_rows]
holdout_set = dataset[-holdout_rows:]
# Score only the held-out rows whose hour falls in [6, 8) or [15, 17).
in_eval_window = (
    ((holdout_set.hour >= 6) & (holdout_set.hour < 8))
    | ((holdout_set.hour >= 15) & (holdout_set.hour < 17))
)
predict_set = holdout_set[in_eval_window]

# When the target is one of the known "(a, b, 'tot')" columns, every sibling
# column "(a, b, <kind>)" is excluded from the candidate features as well.
breakdown_kinds = ["cargocar", "etc", "motorcycle", "privatecar", "tot", "unknowncar"]
breakdown_groups = {
    "({}, {}, 'tot')".format(a, b): ["({}, {}, '{}')".format(a, b, kind) for kind in breakdown_kinds]
    for a, b in [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
}
unused_features = list(breakdown_groups.get(predict_col, [predict_col]))
# The shifted target must never be offered as a feature. Previously it was only
# excluded for targets outside the table above, which leaked the label into
# potential_features for the "'tot'" targets.
unused_features.append(predict_col_shifted)
unused_features.extend(selected_col)

# Candidate features: every remaining column, in the frame's column order.
potential_features = [x for x in training_set.columns if x not in unused_features]

In [519]:
# Baseline: fit an SVR on the current feature list, then score its predictions
# on predict_set with MAPE (in percent).
train_features = training_set[selected_col]
train_target = training_set[predict_col_shifted]
clf = SVR(C=1.0, epsilon=0.1)
clf.fit(train_features, train_target)

baseline_predictions = clf.predict(predict_set[selected_col])
base_score = mape(baseline_predictions, predict_set[predict_col_shifted])
print(base_score)

21.19308805321833


In [520]:
# Greedy forward feature selection.
# Each pass tries every remaining candidate column on top of the columns chosen
# so far, refits the SVR, and keeps the single candidate that lowers the MAPE on
# predict_set the most. The search stops after a pass in which no candidate
# beats the current best score.
# NOTE(review): predict_set is used both to choose features and to report the
# score, so the printed MAPE is optimistic for unseen data.
train_target = training_set[predict_col_shifted]
eval_target = predict_set[predict_col_shifted]

while True:
    next_feature = None
    for col in potential_features:
        # Build the trial column list without mutating selected_col, so an
        # exception during fit/predict cannot leave a stray column behind.
        candidate_cols = selected_col + [col]
        clf.fit(training_set[candidate_cols], train_target)
        score = mape(clf.predict(predict_set[candidate_cols]), eval_target)
        # Strict comparison: a tie never replaces the current best.
        if base_score > score:
            base_score = score
            next_feature = col
    if next_feature is None:
        break
    selected_col.append(next_feature)
    print(next_feature, base_score)
    potential_features.remove(next_feature)

# The loop leaves clf fitted on the last candidate it tried, not on the winning
# feature set. Refit so clf matches selected_col for any later cell.
clf.fit(training_set[selected_col], train_target)

print(selected_col)

(1, 1, 'cargocar') 21.053472572837297
(1, 0, 'cargocar') 20.96563677026983
(1, 0, 'motorcycle') 20.90883613344979
(1, 0, 'privatecar') 20.854791055185157
(1, 1, 'unknowncar') 20.821040130334627
(2, 0, 'cargocar') 20.78948158714734
(2, 0, 'motorcycle') 20.772437843286358
(2, 0, 'privatecar') 20.749241581597968
(3, 0, 'cargocar') 20.726049485565113
(3, 0, 'motorcycle') 20.704368415814667
(3, 0, 'privatecar') 20.690271605376648
(3, 1, 'unknowncar') 20.68647974947036
['hour', 'precipitation', 'dayofweek', 'is_holiday', "(1, 1, 'cargocar')", "(1, 0, 'cargocar')", "(1, 0, 'motorcycle')", "(1, 0, 'privatecar')", "(1, 1, 'unknowncar')", "(2, 0, 'cargocar')", "(2, 0, 'motorcycle')", "(2, 0, 'privatecar')", "(3, 0, 'cargocar')", "(3, 0, 'motorcycle')", "(3, 0, 'privatecar')", "(3, 1, 'unknowncar')"]
