# Кейс 2. Поиск аномалий в данных с дорожной инфраструктуры

Дорожно-транспортная инфраструктура включает в себя огромное количество станций, которые фиксируют большое количество показателей, таких как температура дорожного покрытия, скорость ветра, наличие тумана и другие условия на определённых дорожных участках. Зачастую оборудование (датчики) этих станций может передавать некорректные данные на сервер в связи с выходом этих компонентов из строя. Для бригад, которые обслуживают данные дорожно-транспортные объекты, очень важно своевременно определять выход из строя компонентов (датчиков) дорожных станций и проводить их ремонт и диагностику.

Вам предстоит написать алгоритм машинного обучения, который сможет проанализировать пакеты данных станции за определенный промежуток и найти в этих данных аномалии. 

📌 Важно: в некоторых пакетах может содержаться значение null для определённого датчика. Такие значения также считаются аномальными, так как сигнализируют об ошибке оборудования.

🏡 Данный кейс ведётся в Kaggle: там вы сможете скачать датасет (https://www.kaggle.com/competitions/misis-hack/overview) и отправить решение на тестовых данных для выставления баллов.

### Описание данных: 
- configuration_item_id — ID дорожной станции
- ts — время отправки пакета данных станцией
- __insert_ts — время добавления пакета в бд
- keys — массив с названиями компонентов
- values — массив с их значениями

In [1]:
import os
import ast
import pandas as pd
import numpy as np


curr_dir = "case_2"
# Enter the case directory unless the notebook already runs inside it.
# os.path.basename is used instead of splitting on "/" so the check also
# works where the path separator is not "/" (e.g. Windows); otherwise a
# second run of this cell would try to chdir into a nested "case_2".
if os.path.basename(os.getcwd()) != curr_dir:
    os.chdir(curr_dir)

In [2]:
# Load both splits, then print the distinct raw `keys` strings of each split
# (train first, test second) to see which sensor layouts occur.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

for split_df in (train_df, test_df):
    print(split_df["keys"].unique())

def convert_to_float(values):
    """
    Convert a sequence of raw readings to floats.

    Parameters:
    - values (iterable): Raw readings; the string 'null' marks a missing value.

    Returns:
    - list: float(x) for every reading, with None in place of each 'null'.
    """
    converted = []
    for raw in values:
        converted.append(None if raw == 'null' else float(raw))
    return converted

def prepare_df(df):
    """
    Expand the packed `keys` / `values` columns into one column per sensor.

    Each `keys` / `values` cell is a string holding a nested list literal; the
    first inner list is taken, 'null' readings become missing values, and every
    key becomes its own column holding the matching value.

    Parameters:
    - df (DataFrame): Raw packets with columns 'keys', 'values', 'ts',
      '__insert_ts' and 'configuration_item_id'. The frame is not modified.

    Returns:
    - DataFrame: A new frame without 'keys', 'values' and '__insert_ts', with
      'ts' parsed to datetime, one column per sensor key, sorted by
      ('configuration_item_id', 'ts') and re-indexed from 0.
    """
    # Work on a re-indexed copy: the caller's frame stays untouched, and the
    # column-wise concat below (which aligns on the index) lines up row by row
    # even when the input index is not 0..n-1 (e.g. after concatenating splits).
    df = df.reset_index(drop=True)

    for k in ["keys", "values"]:
        df[k] = df[k].apply(lambda i: ast.literal_eval(i)[0])

    df['values'] = df['values'].apply(convert_to_float)
    df['ts'] = pd.to_datetime(df['ts'])

    # One single-row frame per packet so packets with different key sets still
    # land in the right columns; zip avoids building a Series per row.
    expanded_df = pd.concat(
        [
            pd.DataFrame([values], columns=keys)
            for keys, values in zip(df['keys'], df['values'])
        ],
        ignore_index=True,
    )

    df = pd.concat([df, expanded_df], axis=1)
    df = df.drop(columns=['keys', 'values', '__insert_ts'])
    df_sorted = df.sort_values(by=['configuration_item_id', 'ts']).reset_index(drop=True)

    return df_sorted

["[['meteo_layer_type', 'meteo_cloudiness', 'meteo_wind_velocity', 'meteo_humidity', 'meteo_t_underroad', 'meteo_freezing_point', 'meteo_wind_direction', 'meteo_dew_point', 'meteo_t_road', 'meteo_wind_gusts', 'meteo_t_air', 'meteo_air_pressure']]"]
["[['meteo_layer_type', 'meteo_cloudiness', 'meteo_wind_velocity', 'meteo_humidity', 'meteo_t_underroad', 'meteo_freezing_point', 'meteo_wind_direction', 'meteo_dew_point', 'meteo_t_road', 'meteo_wind_gusts', 'meteo_t_air', 'meteo_air_pressure']]"]


In [3]:
# Stack train and test into one frame with a fresh 0..n-1 index, expand the
# sensor columns, then collect the names of all sensor ("meteo*") columns.
combined_raw = pd.concat([train_df, test_df], ignore_index=True)
all_df = prepare_df(combined_raw)

all_features = [column for column in all_df.columns if column.startswith("meteo")]

In [4]:
# Show rows that repeat an earlier row in every column (the recorded output is empty).
all_df[all_df.duplicated()]

Unnamed: 0,id,configuration_item_id,ts,meteo_layer_type,meteo_cloudiness,meteo_wind_velocity,meteo_humidity,meteo_t_underroad,meteo_freezing_point,meteo_wind_direction,meteo_dew_point,meteo_t_road,meteo_wind_gusts,meteo_t_air,meteo_air_pressure


In [5]:
# Distinct station IDs present in the combined train + test frame.
all_df.configuration_item_id.unique()

array([24445, 30928])

In [6]:
# Summary statistics for the rows of the first station returned by unique().
first_station_id = all_df.configuration_item_id.unique()[0]
config_df = all_df.loc[all_df.configuration_item_id == first_station_id]
config_df.describe()

Unnamed: 0,id,configuration_item_id,meteo_layer_type,meteo_cloudiness,meteo_wind_velocity,meteo_humidity,meteo_t_underroad,meteo_freezing_point,meteo_wind_direction,meteo_dew_point,meteo_t_road,meteo_wind_gusts,meteo_t_air,meteo_air_pressure
count,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0
mean,2261.0,24445.0,1.095984,2.841332,2.720372,58.265426,28.426445,-0.001675,165.469148,-8.106268,28.667581,5.04907,0.0,748.401567
std,294.881615,0.0,0.485671,1.375662,1.907266,21.786618,7.693211,0.030867,122.87846,5.204474,9.624884,2.957044,0.0,4.301545
min,1751.0,24445.0,1.0,2.0,0.0,18.2,8.8,-0.57,0.0,-21.4,6.2,0.0,0.0,739.0
25%,2006.0,24445.0,1.0,2.0,1.2,39.7,23.1,0.0,36.0,-12.1,21.9,2.6,0.0,745.0
50%,2261.0,24445.0,1.0,3.0,2.4,56.9,27.9,0.0,118.0,-7.5,26.8,4.7,0.0,748.0
75%,2516.0,24445.0,1.0,3.0,3.9,77.0,34.1,0.0,282.0,-3.5,36.2,7.0,0.0,751.0
max,2771.0,24445.0,5.0,10.0,9.1,100.0,45.5,0.0,359.0,0.0,51.6,17.4,0.0,762.0


In [7]:
# Pairwise correlation matrix over the numeric columns of the selected station's rows.
config_df.corr(numeric_only=True)

Unnamed: 0,id,configuration_item_id,meteo_layer_type,meteo_cloudiness,meteo_wind_velocity,meteo_humidity,meteo_t_underroad,meteo_freezing_point,meteo_wind_direction,meteo_dew_point,meteo_t_road,meteo_wind_gusts,meteo_t_air,meteo_air_pressure
id,1.0,,0.119709,0.138673,0.054226,0.368372,0.335213,-0.011297,0.258273,0.370233,0.273399,0.070746,,-0.664825
configuration_item_id,,,,,,,,,,,,,,
meteo_layer_type,0.119709,,1.0,0.159285,0.079807,0.204748,-0.038386,-0.250202,0.005734,0.18494,-0.099166,0.178644,,-0.215097
meteo_cloudiness,0.138673,,0.159285,1.0,0.09562,0.182104,-0.031507,-0.006264,-0.028454,0.143313,-0.03904,0.070121,,-0.159041
meteo_wind_velocity,0.054226,,0.079807,0.09562,1.0,-0.381727,0.250906,0.036651,-0.112713,-0.384024,0.332008,0.876701,,-0.033956
meteo_humidity,0.368372,,0.204748,0.182104,-0.381727,1.0,-0.490148,-0.061745,0.103306,0.985655,-0.580395,-0.397088,,-0.461103
meteo_t_underroad,0.335213,,-0.038386,-0.031507,0.250906,-0.490148,1.0,0.016189,0.272474,-0.488956,0.954348,0.315393,,-0.247346
meteo_freezing_point,-0.011297,,-0.250202,-0.006264,0.036651,-0.061745,0.016189,1.0,0.030558,-0.057462,0.028972,0.037023,,0.026114
meteo_wind_direction,0.258273,,0.005734,-0.028454,-0.112713,0.103306,0.272474,0.030558,1.0,0.101776,0.220679,-0.067279,,-0.161325
meteo_dew_point,0.370233,,0.18494,0.143313,-0.384024,0.985655,-0.488956,-0.057462,0.101776,1.0,-0.582871,-0.401648,,-0.471409


In [8]:
from itertools import combinations

# Correlate every unordered pair of sensor columns for the selected station
# and report only the pairs whose absolute correlation exceeds 0.5.
feature_combinations = list(combinations(all_features, 2))

correlation_results = [
    (first, second, config_df[first].corr(config_df[second]))
    for first, second in feature_combinations
]

correlation_df = pd.DataFrame(
    correlation_results,
    columns=['Feature1', 'Feature2', 'Correlation'],
).sort_values(by='Correlation', ascending=False)

strong_pairs = correlation_df[correlation_df.Correlation.abs() > 0.5]
print("Correlation of each feature with every other two features:")
print(strong_pairs)

Correlation of each feature with every other two features:
               Feature1          Feature2  Correlation
33       meteo_humidity   meteo_dew_point     0.985655
41    meteo_t_underroad      meteo_t_road     0.954348
27  meteo_wind_velocity  meteo_wind_gusts     0.876701
34       meteo_humidity      meteo_t_road    -0.580395
56      meteo_dew_point      meteo_t_road    -0.582871


In [9]:
def calculate_ema(data, alpha):
    """
    Calculate Exponential Moving Average (EMA) of a given data series.

    Missing samples (NaN / None) do not update the average: the previous EMA
    value is carried forward, so a single gap no longer turns every later
    EMA value into NaN.

    Parameters:
    - data (array-like): The input time series data; may contain NaN / None.
    - alpha (float): Smoothing factor, between 0 and 1.

    Returns:
    - ema (ndarray): Float EMA values, same length as the input. The series is
      seeded with the first non-missing sample. Empty input yields an empty
      array; input with no valid sample yields all NaN.
    """
    # Always compute in float: integer input would otherwise truncate the
    # average, and object arrays holding None become NaN here.
    values = np.asarray(data, dtype=float)
    ema = np.full(values.shape, np.nan)

    valid = ~np.isnan(values)
    if not valid.any():
        # Nothing to seed the average with (empty or all-missing input).
        return ema

    ema[0] = values[valid][0]
    for i in range(1, len(values)):
        sample = values[i]
        if np.isnan(sample):
            # Gap in the data: hold the last estimate instead of poisoning it.
            ema[i] = ema[i - 1]
        else:
            ema[i] = alpha * sample + (1 - alpha) * ema[i - 1]
    return ema

def detect_anomalies_ema(time_series_data, alpha, n_std):
    """
    Flag anomalous samples of a time series with an EMA baseline.

    A sample is anomalous when it is NaN, or when its residual (sample minus
    EMA) is larger in absolute value than `n_std` standard deviations of all
    residuals (NaN residuals are ignored when computing the deviation).

    Parameters:
    - time_series_data (array-like): The input time series data.
    - alpha (float): Smoothing factor for EMA, between 0 and 1.
    - n_std (float): Number of standard deviations for anomaly detection.

    Returns:
    - anomalies (ndarray of int): 1 where the sample is anomalous, else 0.
    """
    baseline = calculate_ema(time_series_data, alpha)
    residuals = time_series_data - baseline
    threshold = n_std * np.nanstd(residuals)

    is_missing = np.isnan(time_series_data)
    # A comparison against a NaN residual is False, so missing samples are
    # flagged only through `is_missing`.
    is_outlier = np.abs(residuals) > threshold

    return np.asarray(is_missing | is_outlier).astype(int)

# Detection settings shared by every station and sensor.
alpha = 0.003  # smoothing factor
n_std = 3.44  # number of standard deviations for anomaly detection

# all_anomalies[station id][sensor column] -> 0/1 flags, one per row of that
# station in all_df order.
config_ids = all_df.configuration_item_id.unique()
all_anomalies = {}

for config_id in config_ids:
    config_df = all_df[all_df.configuration_item_id == config_id]
    all_anomalies[config_id] = {
        feature: detect_anomalies_ema(config_df[feature].values, alpha, n_std)
        for feature in all_features
    }

In [10]:
# Build the submission: for every test packet, the list of per-sensor anomaly
# flags (in all_anomalies feature order) computed for its row in all_df.
test_df = pd.read_csv("data/test.csv")

# Per-station lookup tables, built once instead of re-filtering all_df and
# re-transposing the flag arrays for every test row.
station_rows = {}       # station id -> list of per-row flag tuples
station_positions = {}  # station id -> {packet id: row position within the station}
for config_id, feature_flags in all_anomalies.items():
    station_rows[config_id] = list(zip(*feature_flags.values()))
    positions = {}
    station_ids = all_df.loc[all_df.configuration_item_id == config_id, "id"]
    for position, packet_id in enumerate(station_ids.tolist()):
        # Keep the first occurrence if a packet id repeats within a station.
        positions.setdefault(packet_id, position)
    station_positions[config_id] = positions

answers = []
for config_id, packet_id in zip(test_df.configuration_item_id, test_df.id):
    position = station_positions[config_id][packet_id]
    # Plain Python ints: NumPy scalars inside a list are written by to_csv via
    # their repr, which under NumPy 2 is "np.int64(0)" and cannot be parsed
    # back with ast.literal_eval.
    answers.append([int(flag) for flag in station_rows[config_id][position]])

result_df = pd.DataFrame({"id": test_df.id.values.tolist(),
                          "target": answers})
result_df.to_csv("case2.csv", index=False)

In [11]:
# Compare two submission files row by row and print every differing flag as:
# packet id (from the first file), flag position, first-file value, second-file value.
files = []
for name in ["case_f.csv", "case2.csv"]:
    submission = pd.read_csv(name)
    # `target` is stored as a list literal; parse it back into a list.
    submission["target"] = submission["target"].apply(ast.literal_eval)
    files.append(submission)

for packet_id, t1, t2 in zip(files[0].id, files[0].target, files[1].target):
    for f, first_flag in enumerate(t1):
        if first_flag != t2[f]:
            print(packet_id, f, first_flag, t2[f])