In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, boxcox, pearsonr, ks_2samp

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

import json
import os
os.chdir("..")

def metric_train(output, truth):
    return spearmanr(output, truth).correlation

In [2]:
EXOTIC_PATH = "data/processed/exotic"
NON_EXOTIC_PATH = "data/processed/non_exotic"
FULL_PATH = "data/processed/full"

In [3]:
X_train = pd.read_csv("data/raw/X_train_NHkHMNU.csv").set_index("ID")
y_train = pd.read_csv("data/raw/y_train_ZAN5mwg.csv").set_index("ID")
X_test = pd.read_csv("data/raw/X_test_final.csv").set_index("ID")

feature_cols = X_train.columns.drop(["DAY_ID", "COUNTRY"]).to_list()

X_train.shape, y_train.shape, X_test.shape

((1494, 34), (1494, 1), (654, 34))

In [4]:
country_code = {"FR": 0, "DE": 1}

full_train = y_train.join(X_train)
full_train = full_train.replace(country_code)

full_train["RANK"] = full_train["TARGET"].rank()

X_test = X_test.replace(country_code)

In [5]:
def split_exotic_idx(df: pd.DataFrame):
    day_count = df.groupby("DAY_ID")["COUNTRY"].count()
    exotic_day = day_count[day_count == 1].index.to_list()
    non_exotic_day = day_count[day_count == 2].index.to_list()
    return exotic_day, non_exotic_day

In [6]:
exotic_train_idx, non_exotic_train_idx = split_exotic_idx(full_train)
exotic_train_df = full_train.loc[full_train["DAY_ID"].isin(exotic_train_idx)]
non_exotic_train_df = full_train.loc[full_train["DAY_ID"].isin(non_exotic_train_idx)]

In [7]:
exotic_test, non_exotic_test = split_exotic_idx(X_test)
exotic_test_X = X_test.loc[X_test["DAY_ID"].isin(exotic_test)]
non_exotic_test_X = X_test.loc[X_test["DAY_ID"].isin(non_exotic_test)]

In [8]:
print(exotic_train_df.columns[exotic_train_df.isna().any(axis=0) != 0])
print(non_exotic_train_df.columns[non_exotic_train_df.isna().any(axis=0) != 0])
print(exotic_test_X.columns[exotic_test_X.isna().any(axis=0) != 0])
print(non_exotic_test_X.columns[non_exotic_test_X.isna().any(axis=0) != 0])

Index(['DE_FR_EXCHANGE', 'FR_DE_EXCHANGE', 'DE_NET_EXPORT', 'FR_NET_EXPORT',
       'DE_NET_IMPORT', 'FR_NET_IMPORT'],
      dtype='object')
Index(['DE_RAIN', 'FR_RAIN', 'DE_WIND', 'FR_WIND', 'DE_TEMP', 'FR_TEMP'], dtype='object')
Index(['DE_FR_EXCHANGE', 'FR_DE_EXCHANGE', 'DE_NET_EXPORT', 'FR_NET_EXPORT',
       'DE_NET_IMPORT', 'FR_NET_IMPORT'],
      dtype='object')
Index(['DE_RAIN', 'FR_RAIN', 'DE_WIND', 'FR_WIND', 'DE_TEMP', 'FR_TEMP'], dtype='object')


In [9]:
# Median fillna
exotic_train_df = exotic_train_df.fillna(exotic_train_df.median())
exotic_test_X = exotic_test_X.fillna(exotic_test_X.median())

def fill_median_by_country(df: pd.DataFrame, df_fit: pd.DataFrame):
    df = df.copy()
    median_filler = df_fit.loc[df_fit.COUNTRY == 0].median()
    df.loc[df.COUNTRY == 0] = df.loc[df.COUNTRY == 0].fillna(median_filler)
    median_filler = df_fit.loc[df_fit.COUNTRY == 1].median()
    df.loc[df.COUNTRY == 1] = df.loc[df.COUNTRY == 1].fillna(median_filler)
    return df

non_exotic_train_df = fill_median_by_country(non_exotic_train_df, non_exotic_train_df)
non_exotic_test_X = fill_median_by_country(non_exotic_test_X, non_exotic_train_df)

full_train = fill_median_by_country(full_train, full_train)
X_test = fill_median_by_country(X_test, full_train)

In [10]:
# STAGE = "median_imputed"
# exotic_train_df.to_csv(f"{EXOTIC_PATH}/{STAGE}_train.csv")
# exotic_test_X.to_csv(f"{EXOTIC_PATH}/{STAGE}_test.csv")
# non_exotic_train_df.to_csv(f"{NON_EXOTIC_PATH}/{STAGE}_train.csv")
# non_exotic_test_X.to_csv(f"{NON_EXOTIC_PATH}/{STAGE}_test.csv")
# full_train.to_csv(f"{FULL_PATH}/{STAGE}_train.csv")
# X_test.to_csv(f"{FULL_PATH}/{STAGE}_test.csv")

def save_by_region(df, path):
    df.loc[df.COUNTRY == 0].to_csv(f"{path}_fr.csv")
    df.loc[df.COUNTRY == 1].to_csv(f"{path}_de.csv")
    return

# save_by_region(non_exotic_train_df, f"{NON_EXOTIC_PATH}/{STAGE}_train")
# save_by_region(non_exotic_test_X, f"{NON_EXOTIC_PATH}/{STAGE}_test")

In [11]:
def add_interactions(df):
    pf = PolynomialFeatures(2, interaction_only=True, include_bias=False)
    val = pf.fit_transform(df)
    cols = pf.get_feature_names_out()
    return pd.DataFrame(data=val, columns=cols).set_index(df.index)

non_exotic_train_d2_feat = add_interactions(non_exotic_train_df[feature_cols])
exotic_train_d2_feat = add_interactions(exotic_train_df[feature_cols])

non_exotic_test_d2_feat = add_interactions(non_exotic_test_X[feature_cols])
exotic_test_d2_feat = add_interactions(exotic_test_X[feature_cols])

feature_cols_d2 = non_exotic_train_d2_feat.columns.to_list()
len(feature_cols)

32

In [44]:
non_feat_cols = ["DAY_ID", "COUNTRY", "TARGET", "RANK"]
non_exotic_train_df = non_exotic_train_df[non_feat_cols].join(non_exotic_train_d2_feat)
exotic_train_df = exotic_train_df[non_feat_cols].join(exotic_train_d2_feat)

non_feat_cols = ["DAY_ID", "COUNTRY"]
non_exotic_test_X = non_exotic_test_X[non_feat_cols].join(non_exotic_test_d2_feat)
exotic_test_X = exotic_test_X[non_feat_cols].join(exotic_test_d2_feat)

# STAGE = "median_imputed_d2"
# exotic_train_df.to_csv(f"{EXOTIC_PATH}/{STAGE}_train.csv")
# exotic_test_X.to_csv(f"{EXOTIC_PATH}/{STAGE}_test.csv")
# non_exotic_train_df.to_csv(f"{NON_EXOTIC_PATH}/{STAGE}_train.csv")
# non_exotic_test_X.to_csv(f"{NON_EXOTIC_PATH}/{STAGE}_test.csv")

# save_by_region(non_exotic_train_df, f"{NON_EXOTIC_PATH}/{STAGE}_train")
# save_by_region(non_exotic_test_X, f"{NON_EXOTIC_PATH}/{STAGE}_test")

len(non_exotic_train_df.columns)

532

In [13]:
non_exotic_train_df_fr = non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 0]
non_exotic_train_df_de = non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 1]

non_exotic_test_X_fr = non_exotic_test_X.loc[non_exotic_test_X.COUNTRY == 0]
non_exotic_test_X_de = non_exotic_test_X.loc[non_exotic_test_X.COUNTRY == 1]

In [21]:
def different_dist(df1: pd.DataFrame, df2: pd.DataFrame, pvalue=0.05, features=feature_cols_d2):
    return [col for col in features if ks_2samp(df1[col], df2[col]).pvalue < pvalue]

A very lazy KS test that shows the dataset compose of "three" sets
- France ("non-exotic")
- Germany ("non-exotic")
- France ("exotic", Germany does not have these DAY_IDs)

Summary:
- Features are the same for non-exotic France and non-exotic Germany
- Features distributions are different for non-exotic and exotic
- Targets distribution is different for non-exotic France and non-exotic Germany
- Targets distribution is the same for non-exotic and exotic France

In [22]:
# Non-exotic X's are same dist
ks_non_exotic = different_dist(non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 0], non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 1])
print(
    "Number of features having different distributions for FR non-exotic and DE non-exotic:", 
    len(ks_non_exotic)
)
# Exotic X's are different dist from non-exotic X's
ks_exotic = different_dist(non_exotic_train_df, exotic_train_df)
ks_fr_exotic = different_dist(non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 0], exotic_train_df)
ks_de_exotic = different_dist(non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 1], exotic_train_df)
print(
    "Number of features having different distributions for non-exotic and exotic:", 
    len(ks_exotic)
)
print(
    "Number of features having different distributions for FR non-exotic and exotic:", 
    len(ks_fr_exotic)
)
print(
    "Number of features having different distributions for DE non-exotic and exotic:", 
    len(ks_de_exotic)
)
# Non-exotic targets are different distributions by country
print(
    "FR non-exotic and DE non-exotic target sharing different distribution?", 
    ks_2samp(
        non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 0, "TARGET"], 
        non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 1, "TARGET"],
    ).pvalue < 0.05
)
print(
    "FR non-exotic and exotic target sharing different distribution?",
    ks_2samp(
        non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 0, "TARGET"], 
        exotic_train_df["TARGET"],
    ).pvalue < 0.05
)
print(
    "DE non-exotic and exotic target sharing different distribution?",
    ks_2samp(
        non_exotic_train_df.loc[non_exotic_train_df.COUNTRY == 1, "TARGET"], 
        exotic_train_df["TARGET"],
    ).pvalue < 0.05
)

Number of features having different distributions for FR non-exotic and DE non-exotic: 0
Number of features having different distributions for non-exotic and exotic: 403
Number of features having different distributions for FR non-exotic and exotic: 386
Number of features having different distributions for DE non-exotic and exotic: 386
FR non-exotic and DE non-exotic target sharing different distribution? True
FR non-exotic and exotic target sharing different distribution? False
DE non-exotic and exotic target sharing different distribution? True


Finding features distributions drift between train set and test set

In [27]:
# common_feature_cols = list(set(feature_cols).difference(ks_exotic))

exotic_drift_cols = different_dist(exotic_train_df, exotic_test_X)
non_exotic_drift_cols = different_dist(non_exotic_train_df, non_exotic_test_X)
len(exotic_drift_cols), len(non_exotic_drift_cols)

(26, 195)

In [33]:
class feat_selector():
    def __init__(self, corr, target="TARGET"):
        self.corr = corr
        self.target = target

    def multi_colinear_feat(self, thres):
        # upper triangle masking for correlation matrix
        tri_mask = np.triu(np.ones(self.corr.shape, dtype=np.bool_))
        # finding multicollinear variable pairs
        multi_collin = self.corr[self.corr.mask(tri_mask).abs() > thres]
        # extracting the pairs
        mc_idxs = np.stack(multi_collin.notnull().values.nonzero()).T.tolist()
        mc_cols = [[self.corr.columns[i1], self.corr.columns[i2]] for (i1, i2) in mc_idxs]

        return [
            c2 if abs(self.corr[self.target][c1]) < abs(self.corr[self.target][c2]) else c1 for c1, c2 in mc_cols
            ]
    
    def weak_feat(self, thres):
        weak_flag = self.corr[self.target].abs() < thres
        return weak_flag.index[weak_flag].to_list()

    def run(self, multi_collinear_thres=0.8, weak_thres=0.05):
        return list(set(self.multi_colinear_feat(multi_collinear_thres) + self.weak_feat(weak_thres)))

In [37]:
# Removing drifted features
non_exotic_remainder = list(set(feature_cols_d2).difference(non_exotic_drift_cols))
# feature selection by correlation
tmp_corr = non_exotic_train_df_fr[["TARGET"] + non_exotic_remainder].corr("spearman")
# Finding out the multi-collinear features, and weakly correlated features with targets
non_exotic_fr_drop_cols = feat_selector(tmp_corr).run(multi_collinear_thres=0.7)
# Removing them, for linear models
non_exotic_fr_feature_cols = list(set(non_exotic_remainder).difference(non_exotic_fr_drop_cols))

tmp_corr["TARGET"][non_exotic_fr_feature_cols]

DE_COAL FR_WINDPOW             0.056917
DE_NUCLEAR GAS_RET            -0.072557
DE_NET_EXPORT                 -0.052830
DE_NUCLEAR DE_RESIDUAL_LOAD   -0.054144
DE_NET_IMPORT CARBON_RET       0.072136
                                 ...   
DE_COAL DE_NUCLEAR             0.060560
GAS_RET                        0.151401
DE_NET_EXPORT FR_GAS           0.055869
FR_GAS FR_HYDRO                0.064794
FR_COAL DE_TEMP                0.053265
Name: TARGET, Length: 75, dtype: float64

In [38]:
tmp_corr = non_exotic_train_df_de[["TARGET"]+non_exotic_remainder].corr("spearman")
non_exotic_de_drop_cols = feat_selector(tmp_corr).run(multi_collinear_thres=0.7)
non_exotic_de_feature_cols = list(set(non_exotic_remainder).difference(non_exotic_de_drop_cols))

tmp_corr["TARGET"][non_exotic_de_feature_cols]

DE_COAL FR_WINDPOW             0.166102
FR_COAL DE_LIGNITE            -0.053679
DE_WIND                       -0.147871
DE_WINDPOW FR_WIND            -0.086121
FR_HYDRO FR_WINDPOW            0.065840
                                 ...   
FR_NET_EXPORT FR_NET_IMPORT   -0.082838
FR_WINDPOW DE_LIGNITE          0.154410
FR_COAL DE_RESIDUAL_LOAD      -0.188544
DE_GAS FR_RAIN                -0.155213
DE_WINDPOW                    -0.300933
Name: TARGET, Length: 63, dtype: float64

In [41]:
exotic_remainder = list(set(feature_cols_d2).difference(exotic_drift_cols))

tmp_corr = exotic_train_df[["TARGET"]+exotic_remainder].corr("spearman")
exotic_drop_cols = feat_selector(tmp_corr).run(multi_collinear_thres=0.7)
exotic_feature_cols = list(set(exotic_remainder).difference(exotic_drop_cols))

tmp_corr["TARGET"][exotic_feature_cols]

FR_COAL DE_WINDPOW           0.185677
FR_COAL FR_SOLAR            -0.051922
FR_NET_EXPORT DE_COAL        0.085119
DE_HYDRO DE_RAIN            -0.055944
DE_RESIDUAL_LOAD COAL_RET    0.124131
                               ...   
FR_SOLAR GAS_RET            -0.064669
DE_RAIN GAS_RET             -0.102447
GAS_RET CARBON_RET          -0.077973
FR_RAIN FR_TEMP              0.070030
FR_NET_IMPORT CARBON_RET    -0.155249
Name: TARGET, Length: 62, dtype: float64

In [42]:
feature_selection = {
    "de": non_exotic_de_feature_cols,
    "fr": non_exotic_fr_feature_cols,
    "exotic": exotic_feature_cols,
}

In [43]:
# Serialize selected features for later use
# json.dump(feature_selection, open("features/feature_d2_selection.json", "w"))