# Preprocessing


## Imports

In [None]:
import pandas as pd
import posixpath
from data_mining_project import data, preprocessing, DATA_PATH, OUTPUT_PATH
import numpy as np
import matplotlib as plt
import plotly.express as px

## Load Data

In [None]:
# Locate the raw challenge CSV inside the project's data directory and load it
# through the project's CSV loader.
file_name = "sncb_data_challenge.csv"
file_path = posixpath.join(DATA_PATH, file_name)
data_df = data.load_data_csv(file_path)

# Preview the first rows (head() defaults to 5) to sanity-check the load.
data_df.head()

In [None]:
# Columns handed to data.reformat_str_to_list, grouped by the element type
# requested for them (presumably the cells are string-encoded sequences —
# confirm against the helper).
int_sequence_cols = [
    "vehicles_sequence",
    "events_sequence",
    "seconds_to_incident_sequence",
    "dj_ac_state_sequence",
    "dj_dc_state_sequence",
]
float_sequence_cols = ["train_kph_sequence"]

data_df = data.reformat_str_to_list(data_df, cols=int_sequence_cols, col_type=int)
data_df = data.reformat_str_to_list(data_df, cols=float_sequence_cols, col_type=float)

# Columns not used by the rest of this notebook.
# NOTE(review): "vehicles_sequence" is reformatted above and dropped right here;
# it could probably be left out of int_sequence_cols — confirm before changing.
unused_cols = ["incident_id", "Column1", "vehicles_sequence", "approx_lat", "approx_lon"]
data_df.drop(columns=unused_cols, inplace=True)

# Preview the first rows (head() defaults to 5).
data_df.head()

## Remove events more than 4h before or more than 10min after the incident

In [None]:
# Window [lower, upper] handed to preprocessing.filter_events_out_of_interval.
# Presumably expressed in seconds relative to the incident, negative meaning
# "before" — confirm against the helper.
# The upper bound is 10 minutes, as stated in the section heading; it was
# previously 360 s, i.e. only 6 minutes.
interval = [-4 * 3600, 10 * 60]
data_df = preprocessing.filter_events_out_of_interval(data_df, interval)
data_df.head(5)

## Remove rows shorter than 2 elements

In [None]:
# Evaluate preprocessing.remove_short_rows (with x=2) on every event sequence;
# the resulting Series is used as the row selector below.
keep_mask = data_df["events_sequence"].apply(
    lambda events: preprocessing.remove_short_rows(events, x=2)
)

# Apply the selection and renumber the remaining rows from 0.
data_df = data_df[keep_mask].reset_index(drop=True)
data_df.shape

## AC_DC Failure Feature

In [None]:
def indicate_ac_dc_prob_timestamp(row):
    """Flag, position by position, where the two sequences in *row* coincide.

    Args:
        row: Object supporting positional ``.iloc`` access (e.g. a DataFrame
            row) whose first and second entries are equally long sequences.
            In this notebook they are the ``dj_ac_state_sequence`` and
            ``dj_dc_state_sequence`` cells.

    Returns:
        numpy.ndarray of ints: 1 where both sequences hold the same value at
        that position, 0 elsewhere.
    """
    # np.asarray keeps the comparison elementwise even when the cells are
    # plain Python lists; on lists ``==`` would collapse to a single bool and
    # the following ``.astype`` would raise AttributeError.
    ac_states = np.asarray(row.iloc[0])
    dc_states = np.asarray(row.iloc[1])
    return (ac_states == dc_states).astype(int)

def indicate_ac_dc_prob(row):
    """Tell whether the two sequences in *row* coincide at any position.

    Args:
        row: Object supporting positional ``.iloc`` access (e.g. a DataFrame
            row) whose first and second entries are equally long sequences.
            In this notebook they are the ``dj_ac_state_sequence`` and
            ``dj_dc_state_sequence`` cells.

    Returns:
        NumPy integer: 1 if at least one position holds the same value in both
        sequences, otherwise 0 (including for empty sequences).
    """
    # np.asarray keeps the comparison elementwise even for plain Python lists,
    # for which ``==`` would return a bool that has no ``.any()``/``.astype``.
    ac_states = np.asarray(row.iloc[0])
    dc_states = np.asarray(row.iloc[1])
    return (ac_states == dc_states).any().astype(int)

def indicate_ac_dc_num(row):
    """Count the positions at which the two sequences in *row* coincide.

    Args:
        row: Object supporting positional ``.iloc`` access (e.g. a DataFrame
            row) whose first and second entries are equally long sequences.
            In this notebook they are the ``dj_ac_state_sequence`` and
            ``dj_dc_state_sequence`` cells.

    Returns:
        NumPy integer: number of positions where both sequences hold the same
        value.
    """
    # np.asarray keeps the comparison elementwise even for plain Python lists,
    # for which ``==`` would return a single bool that has no ``.sum()``.
    ac_states = np.asarray(row.iloc[0])
    dc_states = np.asarray(row.iloc[1])
    return (ac_states == dc_states).sum()

#def indicate_ac_dc_prob_events(row):
#    return row.iloc[2][row.iloc[0] == row.iloc[1]]

# The two state-sequence columns every AC/DC feature is computed from, row by row.
state_pairs = data_df[["dj_ac_state_sequence", "dj_dc_state_sequence"]]

# Each feature is inserted at column position 1, so a later insertion lands in
# front of the earlier ones (final order: num, prob, timestamp).
for feature_name, feature_fn in (
    ("ac_dc_prob_timestamp", indicate_ac_dc_prob_timestamp),
    ("ac_dc_prob", indicate_ac_dc_prob),
    ("ac_dc_prob_num", indicate_ac_dc_num),
):
    data_df.insert(1, feature_name, state_pairs.apply(feature_fn, axis=1))

# Disabled per-event variant, kept for reference:
#data_df.insert(1, "ac_dc_prob_events", data_df[["dj_ac_state_sequence", "dj_dc_state_sequence", "events_sequence"]].apply(indicate_ac_dc_prob_events, axis=1))

data_df

## Add hard braking feature

In [None]:
# Run the project's braking detector on every row, then store its two outputs
# in the "hard_braking" and "description" columns.
braking_features = data_df.apply(preprocessing.detect_braking, axis=1)
data_df[["hard_braking", "description"]] = braking_features

# Preview the first rows (head() defaults to 5).
data_df.head()

## Save data to outputs

In [None]:
# Hand the preprocessed frame to the project's save helper, targeting a file
# inside the project's output directory.
output_name = "preprocessed_data.csv"
filepath = posixpath.join(OUTPUT_PATH, output_name)
data.save_data(data_df, filepath)