<a href="https://colab.research.google.com/github/ekingit/hackathon/blob/main/evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Colab-only setup: mount Google Drive so the hackathon files under
# /content/drive/MyDrive become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Copy the helper module out of Drive into /content so it can be imported below.
# NOTE(review): assumes /content is the notebook's working directory / on sys.path — confirm.
!cp /content/drive/MyDrive/hackathon/plot.py /content/plot.py

# create_plots is defined in the copied plot.py (its source is not part of this notebook).
from plot import create_plots

Mounted at /content/drive


In [7]:
!pip install tsfresh
from tsfresh import extract_features
import pandas as pd


# Feature calculators requested from tsfresh when the caller does not pass its
# own selection. Keys are tsfresh calculator names; a value of None means "no
# parameters", a list holds one parameter dict per requested variant.
DEFAULT_FC_PARAMETERS = {
    "absolute_sum_of_changes": None,
    "count_above_mean": None,
    "has_duplicate": None,
    "has_duplicate_max": None,
    "has_duplicate_min": None,
    "longest_strike_above_mean": None,
    "mean_abs_change": None,
    "percentage_of_reoccurring_values_to_all_values": None,
    "root_mean_square": None,
    "mean": None,
    "median": None,
    "skewness": None,
    "sum_values": None,
    "variance": None,
    "benford_correlation": None,
    "cid_ce": [{"normalize": True}],
    "fft_aggregated": [{"aggtype": "centroid"}],
}


def get_features(df, label, fc_parameters=None):
    """Run tsfresh feature extraction on every column of ``df`` and attach ``label``.

    The wide frame is cast to float32, its index is moved back into a column and
    the result is melted into the long ``timestamp`` / ``id`` / ``value`` layout
    that is handed to ``extract_features``; each original column name becomes
    one ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one series per column. ``reset_index()`` must yield a
        column called ``timestamp`` (i.e. the index is named ``timestamp``),
        otherwise the melt step raises ``KeyError``.
    label : scalar
        Value written into the ``label`` column of every returned row.
    fc_parameters : dict, optional
        tsfresh feature-calculator settings to use instead of
        ``DEFAULT_FC_PARAMETERS``. ``None`` (the default) keeps the built-in
        selection, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``extract_features`` with an added ``label`` column.
    """
    # Hand tsfresh a fresh dict per call when falling back to the defaults, so
    # the module-level constant can never be altered through the call.
    if fc_parameters is None:
        fc_parameters = dict(DEFAULT_FC_PARAMETERS)

    long_df = (
        df.astype("float32")
        .reset_index()
        .melt(id_vars="timestamp", var_name="id", value_name="value")
    )

    df_features = extract_features(
        long_df,
        column_id="id",
        column_sort="timestamp",
        column_value="value",
        n_jobs=1,  # Use single process to avoid progress bar issues
        disable_progressbar=False,
        default_fc_parameters=fc_parameters,
    )
    df_features["label"] = label
    return df_features


def create_df_features(df_real, df_synth):
    """Build a single labelled feature table from the real and synthetic series.

    Features are extracted for both inputs with ``get_features`` (real rows get
    label 1, synthetic rows label 0), the two tables are stacked with the real
    rows first, and every column that contains at least one missing value is
    removed.

    Parameters
    ----------
    df_real, df_synth : pandas.DataFrame
        Inputs passed unchanged to ``get_features``.

    Returns
    -------
    pandas.DataFrame
        Stacked feature rows restricted to columns without any NaN.
    """
    labelled_parts = [
        get_features(df_real, 1),   # real series -> label 1
        get_features(df_synth, 0),  # synthetic series -> label 0
    ]
    combined = pd.concat(labelled_parts)

    # A column survives only if it is populated for every stacked row.
    fully_populated = combined.notna().all(axis=0)
    return combined.loc[:, fully_populated]

Collecting tsfresh
  Downloading tsfresh-0.21.0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting pywavelets (from tsfresh)
  Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting stumpy>=1.7.2 (from tsfresh)
  Downloading stumpy-1.13.0-py3-none-any.whl.metadata (28 kB)
Collecting scipy>=1.14.0 (from tsfresh)
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Downloading tsfresh-0.21.0-py2.py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.0/96.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDown

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
# Locations of the two CSV files on the mounted Drive.
real_path = '/content/drive/MyDrive/hackathon/smart_meters_london_2013.csv'
synth_path = '/content/drive/MyDrive/hackathon/synthetic_data.csv'

# Both files are read the same way: parse the 'timestamp' column as dates and
# use it as the index, which is the layout get_features melts from.
df_real, df_synth = (
    pd.read_csv(csv_path, parse_dates=['timestamp']).set_index('timestamp')
    for csv_path in (real_path, synth_path)
)

In [4]:
# create_plots is imported from plot.py, whose source is not part of this notebook.
# It returns two objects. The scoring cell that follows reads rmse_dict as a mapping
# whose values are tables with 'statistic' and 'value' columns.
# NOTE(review): fig_dict presumably holds the generated figures — confirm against plot.py.
fig_dict, rmse_dict = create_plots(df_real, df_synth)

In [5]:
# Aggregate score: for every table in rmse_dict, add up the 'value' entries of
# all rows whose statistic is not 'median', then total those sums.
score = sum(
    table.loc[table['statistic'] != 'median', 'value'].sum()
    for table in rmse_dict.values()
)

print(score)

17.309769203290607


In [8]:
# Feature extraction over both datasets (the slow step of the notebook).
df_features = create_df_features(df_real, df_synth)

# Separate the 1 = real / 0 = synthetic target from the numeric feature columns.
y = df_features['label'].astype(int)
X = df_features.drop(columns='label').astype(float)

Feature Extraction: 100%|██████████| 4125/4125 [02:08<00:00, 32.10it/s]
Feature Extraction: 100%|██████████| 4126/4126 [01:52<00:00, 36.59it/s]


In [9]:
# Hold out 20% of the feature rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Plain NumPy targets for the classifier and the metric.
y_train, y_test = np.array(y_train), np.array(y_test)

# Gradient-boosted classifier that tries to tell real (1) from synthetic (0) rows.
model = xgb.XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Share of held-out rows whose origin the classifier identifies correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.4f}')

Test accuracy: 0.9994


In [10]:
# Importance the fitted model assigns to each feature column, largest first.
featureImportance = model.feature_importances_
df_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': featureImportance})
    .sort_values(by='Importance', ascending=False)
)
df_importance

Unnamed: 0,Feature,Importance
7,value__percentage_of_reoccurring_values_to_all...,0.993071
1,value__count_above_mean,0.003387
13,value__variance,0.002063
14,value__cid_ce__normalize_True,0.00087
0,value__absolute_sum_of_changes,0.000219
4,value__has_duplicate_min,0.000166
10,value__median,0.000126
8,value__root_mean_square,3.9e-05
11,value__skewness,3.2e-05
5,value__longest_strike_above_mean,2.6e-05
