In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw series from a comma-separated file in the working directory.
ts_df = pd.read_csv("Nuclear Capacity.csv", sep=",")

In [3]:
# Sanity check: (rows, columns) of the frame as loaded.
ts_df.shape

(5225, 2)

In [4]:
def extract_date(x):
    """Reformat a raw "<year> <month><day>" string as "year-month-day".

    The input is split on whitespace. The first token is used as the year;
    the first two characters of the second token are the month and the rest
    of that token is the day (e.g. "2007 0101" -> "2007-01-01"). Any further
    tokens are ignored.

    Raises:
        IndexError: if the string has fewer than two whitespace-separated tokens.
    """
    tokens = x.split()
    year = tokens[0]
    month_day = tokens[1]
    return "-".join((year, month_day[:2], month_day[2:]))

In [5]:
# Rewrite the raw date strings held in "Unnamed: 0" as "year-month-day" text.
ts_df["date"] = ts_df["Unnamed: 0"].apply(extract_date)

In [6]:
# The raw date column has been superseded by "date". Rebind the result instead
# of mutating in place, so the frame loaded earlier is not silently altered.
ts_df = ts_df.drop(columns="Unnamed: 0")

In [7]:
# Positional rename: after the drop, the frame holds the original value column
# followed by the derived "date" column, in that order.
ts_df.columns = ["value","date"]

In [8]:
# Preview the first rows of the reshaped frame.
ts_df.head()

Unnamed: 0,value,date
0,101764.9,2007-01-01
1,101764.9,2007-01-02
2,101764.9,2007-01-03
3,101764.9,2007-01-04
4,101764.9,2007-01-05


# TODO: drop the leading rows whose lagged value is missing after the `shift_by` shift.

In [9]:
class TSFeatureCalculator():
    """Build lagged and rolling-window features for a single time series.

    On construction the input frame is copied and extended with:

    * ``x``            - ``value_col`` shifted by ``shift_by`` rows,
    * ``day_of_week``  - 0 (Monday) .. 6 (Sunday),
    * ``month``        - calendar month,
    * ``day_of_month`` - day within the month,

    and the parsed ``date_col`` becomes the (datetime) index. The result is
    available as ``self.df``.

    Args:
        df: Source DataFrame. It is copied, never modified.
        date_col: Name of the column holding values parseable by ``pd.to_datetime``.
        value_col: Name of the column holding the series values.
        shift_by: Number of rows to lag ``value_col`` by when building ``x``.
            For a positive shift, the first ``shift_by`` rows of ``x`` are NaN.
        weekday_only: If true, rows with ``day_of_week >= 5`` are dropped.
            The lag is computed before this filter is applied.
    """

    def __init__(self, df, date_col, value_col, shift_by=1, weekday_only=False):
        # Work on a private copy so that the helper columns added during
        # preprocessing never leak into the caller's DataFrame.
        self.df = df.copy()
        self.date_col = date_col
        self.value_col = value_col
        self.weekday_only = weekday_only
        self.shift_by = shift_by
        self._preprocess()

    def _preprocess(self):
        """Add the lag/calendar columns, index by date and optionally drop weekends."""
        timestamps = pd.to_datetime(self.df[self.date_col])
        self.df["dt"] = timestamps
        self.df["x"] = self.df[self.value_col].shift(self.shift_by)
        # Vectorised datetime accessors instead of row-wise lambdas.
        self.df["day_of_week"] = timestamps.dt.dayofweek
        self.df["month"] = timestamps.dt.month
        self.df["day_of_month"] = timestamps.dt.day
        self.df = self.df.set_index("dt")
        print(self.df.shape)
        if self.weekday_only:
            print("Excluding weekends!")
            # .copy() detaches the filtered rows so that later column
            # assignments do not raise SettingWithCopyWarning.
            self.df = self.df[self.df["day_of_week"] < 5].copy()
            print(self.df.shape)

    def generate_window_features(self, window_size=(3, 7), window_stats=("mean",), min_periods=None, window_type=None):
        """Append rolling statistics of ``x`` to ``self.df``.

        One column is added per (window, stat) pair, named ``"<window>_<stat>"``
        or, when ``window_type`` is given, ``"<window><window_type>_<stat>"``.

        Args:
            window_size: Iterable of integer window lengths.
            window_stats: Iterable of aggregation names accepted by
                ``Rolling.agg`` (e.g. ``"mean"``, ``"std"``).
            min_periods: Forwarded unchanged to ``Series.rolling``.
            window_type: Optional offset suffix (e.g. ``"D"``). When set, the
                window passed to ``rolling`` is the string
                ``"<window><window_type>"`` rather than the integer.
        """
        for window in window_size:
            if window_type is not None:
                prefix = "%i%s" % (window, window_type)
                window_setup = prefix
            else:
                prefix = "%i" % window
                window_setup = window
            # The rolling object only depends on the window, so build it once
            # and reuse it for every requested statistic.
            roller = self.df["x"].rolling(window_setup, min_periods=min_periods)
            for stat in window_stats:
                self.df["%s_%s" % (prefix, stat)] = roller.agg(stat)
        print("done")
        
class TSClassifier():
    """Prepare a feature frame for supervised learning with a time-ordered split.

    Despite the name, nothing here is classification-specific: the class only
    separates features from the target and splits them chronologically.

    After construction:

    * ``self.df`` - the input with ``exclude`` columns removed and missing
      values replaced by ``missing_value`` (this includes the target column),
    * ``self.X``  - ``self.df`` without ``target_col``,
    * ``self.y``  - the target values as a plain list.

    Args:
        df: Feature DataFrame, assumed to be in chronological row order.
        target_col: Name of the column to predict.
        missing_value: Replacement for NaN values.
        exclude: Optional column label(s) to drop before building ``X``.
    """

    def __init__(self, df, target_col, missing_value=0.0, exclude=None):
        self.df = df
        self.target_col = target_col
        self.exclude = exclude
        self.missing_value = missing_value
        self._preprocess()

    def _preprocess(self):
        """Drop excluded columns, fill missing values and split off the target."""
        # Identity check: `!= None` is evaluated element-wise for array-like
        # `exclude` values and is not a reliable "was it provided" test.
        if self.exclude is not None:
            self.df = self.df.drop(self.exclude, axis=1)
        self.df = self.df.fillna(self.missing_value)
        self.X = self.df.drop(self.target_col, axis=1)
        self.y = list(self.df[self.target_col])

    def temporal_train_test_split(self, test_ratio=0.3):
        """Split into a leading train part and a trailing test part.

        Args:
            test_ratio: Fraction of rows (rounded down) placed in the test set;
                must lie in ``[0, 1]``.

        Returns:
            ``(X_tr, y_tr, X_te, y_te)`` where the X parts are DataFrames and
            the y parts are lists of matching length.

        Raises:
            ValueError: if ``test_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= test_ratio <= 1:
            raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))
        n_rows = len(self.df)
        train_size = n_rows - int(n_rows * test_ratio)
        # Slice both X and y from the same boundary. The previous
        # `y[-test_size:]` returned the whole list when test_size was 0,
        # leaving y_te out of sync with the (empty) X_te.
        X_tr = self.X.iloc[:train_size]
        X_te = self.X.iloc[train_size:]
        y_tr = self.y[:train_size]
        y_te = self.y[train_size:]
        return X_tr, y_tr, X_te, y_te

    def get_feature_importance(self, importances):
        """Pair feature names with scores and return them sorted ascending.

        Args:
            importances: One score per column of ``self.X``, in column order.

        Returns:
            DataFrame with columns ``name`` and ``value``, sorted by ``value``.
        """
        pairs = list(zip(self.X.columns, importances))
        return pd.DataFrame(pairs, columns=["name","value"]).sort_values("value")
    
# The fourth positional parameter is `shift_by`, so the `True` previously
# passed here acted as a lag of 1 row. Spell that out by keyword.
# NOTE(review): if the intent was to drop weekends, pass weekday_only=True instead.
tsf = TSFeatureCalculator(ts_df, "date", "value", shift_by=1)
# Row-count windows; adding window_type="D" would use calendar-day offsets instead.
tsf.generate_window_features(window_size=[3,7,14,30,60,90], window_stats=["mean","std"], min_periods=1)

(5225, 6)
done


In [10]:
# Keep only the lag and rolling-window columns as predictors: the raw date
# string and the calendar helper columns are dropped before modelling.
excluded_cols = ["date", "day_of_week", "month", "day_of_month"]
tsc = TSClassifier(tsf.df, "value", exclude=excluded_cols)

# Chronological split with the default test ratio; print train and test row counts.
X_tr, y_tr, X_te, y_te = tsc.temporal_train_test_split()
print(len(X_tr), len(X_te))

3658 1567


In [11]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Pin the seed of the tree-based estimators so that Restart & Run All
# reproduces the same scores.
RANDOM_STATE = 0

# Order matters: later cells index this list (models[0], models[-1]).
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    ElasticNet(),
    DecisionTreeRegressor(max_depth=3, random_state=RANDOM_STATE),
    GradientBoostingRegressor(n_estimators=30, max_depth=3, random_state=RANDOM_STATE),
]
for model in models:
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    mape = mean_absolute_percentage_error(y_te, y_pred)
    # One line per model, in the order of `models`: RMSE, then MAPE.
    print(rmse, mape)

3870.2195128701114 0.015329814485910188
3888.187783691275 0.015251206961550346
3870.219491656252 0.01532981443283717
3888.24801954974 0.015251336037053208
4073.2335985355585 0.012211169606815515


  positive)
  positive)


4128.169631842453 0.012132645424092859


In [12]:
# models[0] is the LinearRegression fit; list its coefficients sorted ascending.
# These are raw coefficients on unscaled features, not normalised importances.
tsc.get_feature_importance(models[0].coef_)

Unnamed: 0,name,value
11,90_mean,-1.142737
4,7_std,-0.682136
2,3_std,-0.429984
1,3_mean,-0.345907
7,30_mean,-0.221623
10,60_std,-0.126108
8,30_std,-0.098764
6,14_std,-0.020011
5,14_mean,-0.000224
12,90_std,0.009099


In [13]:
# models[-1] is the GradientBoostingRegressor; list its feature_importances_ sorted ascending.
tsc.get_feature_importance(models[-1].feature_importances_)

Unnamed: 0,name,value
2,3_std,0.0
4,7_std,0.0
6,14_std,0.0
8,30_std,4e-05
9,60_mean,6.8e-05
10,60_std,0.000154
5,14_mean,0.000247
7,30_mean,0.000285
12,90_std,0.001007
11,90_mean,0.001992


# TODO: add SHAP-based explanations of the fitted models.