In [None]:
import pandas as pd
from sklearn.metrics import r2_score
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

import tqdm

from classes.intrafeature import *

In [None]:
# Load the training and test splits from the local data directory.
train_df, test_df = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

### Idea

Submissions are limited, so we need a quick way to screen whether a prediction is usable as an anomaly detector.

**Given:** an assumed ratio of anomalies $R$

* For each feature in the group of features:
    * Predict the feature from the remaining features
    * Classify everything in the $R$ quantile and the $1-R$ quantile as anomalies
    * Evaluate the plausibility of the resulting anomaly detector (e.g., how coherent are the anomalies?)

### Plausibility
I assume that plausibility is reflected in the distribution of block lengths of connected anomalies.
For example, a detector that produces mainly isolated one-point anomalies is implausible.

## Model configuration and group of features to investigate:

In [None]:
from itertools import combinations

# Columns that must never be used as a prediction target or predictor.
exclude_features = [
    "id",
    "Datum_Uhrzeit_CET",
    "controlArea",
    "participationCMO",
    "participationIN",
]
# Every remaining training column is a candidate feature (column order preserved).
all_features = [column for column in train_df.columns if column not in exclude_features]

In [None]:
# Regressor used to predict one feature of a group from the others,
# together with the keyword arguments it is configured with.
regression_class = DecisionTreeRegressor
hyperparameters = dict(max_depth=5)

# Feature group under investigation.
# Other candidates that were tried: "correction", "correctedDemand", "Demand"
group = ["aFRRactivation", "aFRRrequest"]
groups = [group]

In [None]:
# Boolean mask selecting the test rows that belong to control area 1;
# anomaly hypotheses are only generated for these rows.
selector_control_area = test_df.controlArea == 1

# Minimum validation score a model must reach before its predictions are
# used to hypothesize anomalies.
min_val_score = 0.8
# Third argument of hypothesize_anomalies — presumably the assumed anomaly
# ratio R from the "Idea" section; confirm against classes.intrafeature.
anomaly_ratio = 0.15

# Reset so that a stale value from an earlier kernel run cannot leak through
# when no feature pair passes the validation threshold.
anomalies = None

# Screen every pair of candidate features. `anomalies` ends up holding the
# hypotheses of the LAST pair whose model passed the validation threshold.
for group in tqdm.tqdm(combinations(all_features, 2)):
    print(f"Group {group}")
    model_dict = train_intrafeature(train_df,
                                    test_df,
                                    group=group,
                                    regression_class=regression_class,
                                    hyperparameters=hyperparameters,
                                    metric=r2_score)

    # Each entry unpacks as (model, remaining, train_score, val_score).
    # NOTE(review): index 2 ranks targets by the *train* score although the
    # threshold below is applied to the validation score — confirm intent.
    best_target_name = max(model_dict, key=lambda x: model_dict[x][2])
    model, remaining, train_score, val_score = model_dict[best_target_name]
    if val_score < min_val_score:
        print(f"Val score {val_score}. Skipping")
        # Actually skip: previously the message was printed but the rejected
        # model was still used to produce (and overwrite) `anomalies`.
        continue

    print(f"Target {best_target_name} with scores ({train_score} | {val_score})")
    target, pred = run_intrafeature_model(test_df, model, best_target_name, remaining)
    anomalies = hypothesize_anomalies(target[selector_control_area], pred[selector_control_area], anomaly_ratio)
    anomalies = anomalies.rename("anomaly")

if anomalies is None:
    # Fail loudly here instead of with an obscure error in a later cell.
    raise RuntimeError(
        f"No feature pair reached a validation score of {min_val_score}; "
        "no anomalies were hypothesized."
    )

In [None]:
# Display the target name assigned in the most recent iteration of the group loop.
best_target_name

In [None]:
def fill_anomalies(df, window_size=4, threshold=2, loops=2):
    """Fill gaps (0s) that are surrounded by flagged points in ``df["anomaly"]``.

    A 0 at position ``i`` is switched to 1 when all of the following hold:

    * at least one 1 lies within ``window_size`` positions before ``i``,
    * at least one 1 lies within ``window_size`` positions after ``i``,
    * the window ``[i - window_size, i + window_size]`` (clipped to the
      frame) sums to at least ``threshold``.

    The scan runs front to back and sees the fills it has already made, so
    a fill can enable further fills later in the same pass. The whole scan
    is repeated ``loops`` times.

    All addressing is positional, so the function also works on frames whose
    index is not a default ``RangeIndex`` (e.g. after boolean filtering).
    Writing through label-based ``df.at`` with a positional counter would
    touch the wrong rows, or append new ones, on such frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``"anomaly"`` column of 0/1 flags.
    window_size : int
        Number of positions inspected on each side of a candidate gap.
    threshold : int
        Minimum window sum required to fill the gap.
    loops : int
        Number of full passes over the column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``"anomaly"`` column is updated in place.
        The number of filled positions is printed.
    """
    # Work on a private positional copy of the flags and write it back once.
    flags = df["anomaly"].to_numpy().copy()
    n_rows = len(flags)
    count = 0
    for _ in range(loops):
        for pos in range(n_rows):
            if flags[pos] != 0:
                continue
            start = max(pos - window_size, 0)
            end = min(pos + window_size + 1, n_rows)
            # Require at least one flagged point on each side of the gap.
            flagged_before = (flags[start:pos] == 1).any()
            flagged_after = (flags[pos + 1:end] == 1).any()
            if flagged_before and flagged_after and flags[start:end].sum() >= threshold:
                flags[pos] = 1
                count += 1
    df["anomaly"] = flags
    print("Gefüllt:", count)

    return df


def remove_anomalies(df, window_size=5, threshold=1):
    """Clear flagged points (1s) that are sparse within their neighbourhood.

    A 1 at position ``i`` is switched to 0 when all of the following hold:

    * at least one 0 lies within ``window_size`` positions before ``i``,
    * at least one 0 lies within ``window_size`` positions after ``i``,
    * the window ``[i - window_size, i + window_size]`` (clipped to the
      frame) sums to at most ``threshold``.

    The scan runs front to back in a single pass and sees the removals it
    has already made.

    All addressing is positional, so the function also works on frames whose
    index is not a default ``RangeIndex`` (e.g. after boolean filtering).
    Writing through label-based ``df.at`` with a positional counter would
    touch the wrong rows, or append new ones, on such frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``"anomaly"`` column of 0/1 flags.
    window_size : int
        Number of positions inspected on each side of a flagged point.
    threshold : int
        Maximum window sum at which the flagged point is still removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``"anomaly"`` column is updated in place.
        The number of removed flags is printed.
    """
    # Work on a private positional copy of the flags and write it back once.
    flags = df["anomaly"].to_numpy().copy()
    n_rows = len(flags)
    count = 0
    for pos in range(n_rows):
        if flags[pos] != 1:
            continue
        start = max(pos - window_size, 0)
        end = min(pos + window_size + 1, n_rows)
        # Require at least one unflagged point on each side.
        clear_before = (flags[start:pos] == 0).any()
        clear_after = (flags[pos + 1:end] == 0).any()
        if clear_before and clear_after and flags[start:end].sum() <= threshold:
            flags[pos] = 0
            count += 1
    df["anomaly"] = flags
    print("Entfernt:", count)

    return df


# Post-process the hypothesized anomaly flags in two stages. Each stage is
# handed its own copy, so `anomalies` and `df_filled` are left untouched.

# Stage 1: close gaps between flagged points.
df_filled = fill_anomalies(
    anomalies.to_frame().copy(),
    window_size=10,
    threshold=4,
    loops=2,
)

# Stage 2: drop flags that remain sparse in their neighbourhood.
submission_df = remove_anomalies(
    df_filled.copy(),
    window_size=5,
    threshold=4,
)


In [None]:
# Only values strictly below this cutoff are included in the histogram.
cutoff = 500
# count_anomagrams comes from classes.intrafeature; presumably it returns the
# block lengths of connected anomalies described in the "Plausibility"
# section — confirm against its definition.
counts = count_anomagrams(submission_df["anomaly"])
counts = counts[counts < cutoff]
# Histogram on a natural-log scale.
hist = plt.hist(np.log(counts), bins=80)
plt.xlabel("log(count)")
plt.ylabel("frequency")
plt.title(f"count_anomagrams output below {cutoff} (log scale)");

In [None]:
# Sum of the `anomalies` series left behind by the group loop; the fill/remove
# post-processing works on copies and does not alter this series.
anomalies.sum()

In [None]:
# NOTE(review): `d` is not referenced anywhere in the visible notebook — looks
# like leftover scratch data (name -> 2-tuple); confirm and delete if unused.
d = {"a": (2, 4), "b": (3, 2)}