# Preprocessing


## Imports

In [None]:
import pandas as pd
import posixpath
import data
import eda
import numpy as np
import matplotlib as plt
import plotly.express as px

# Directory holding the input CSV files. This is an absolute, machine-specific
# Windows path; adjust it when running the notebook on another machine.
DATA_PATH = "C:\\Users\\derar\\Documents\\Data Mining Project\\Code\\data"

## Load Data

In [None]:
# Build the path of the challenge CSV and load it through the project's
# `data` helper module.
file_name = "sncb_data_challenge.csv"  
# posixpath.join always inserts "/" as the separator, whatever the platform.
file_path = posixpath.join(DATA_PATH, file_name)
data_df = data.load_data_csv(file_path)

In [None]:
# Notebook display cell: show the loaded data.
data_df

## Event filtering

In [None]:
# Notebook display cell: list the column names of the loaded data.
data_df.columns.to_list()

In [None]:
def compute_events_freq(events_sequence):
    """Return the relative frequency of every distinct event id.

    Args:
        events_sequence: Flat array-like of event ids.

    Returns:
        dict mapping ``int(event_id)`` to the fraction of entries in
        ``events_sequence`` equal to that id. An empty input yields ``{}``.
    """
    unique_events, events_count = np.unique(events_sequence, return_counts=True)
    events_freq = events_count / events_count.sum()
    # Pair ids and frequencies directly instead of stacking them into one
    # array: stacking would force both columns to a common dtype and silently
    # round large integer ids through float64.
    return {
        int(event_id): freq
        for event_id, freq in zip(unique_events, events_freq)
    }

def compute_events_to_remove(data_df, events_freq_all_classes, t):
    """Select, per incident type, the event ids to drop.

    For every incident type the events of all its rows are pooled and their
    relative frequencies computed. An event id is selected when its frequency
    within the incident type divided by its overall frequency is below ``t``.

    Args:
        data_df: Frame with ``incident_type`` and ``events_sequence`` columns.
        events_freq_all_classes: dict of event id -> frequency over all rows.
        t: Threshold on the per-type / overall frequency ratio.

    Returns:
        dict mapping each incident type to the list of event ids to remove.
    """
    events_to_remove = {}
    sequences_by_type = data_df.groupby("incident_type")["events_sequence"]
    for incident_type, sequences in sequences_by_type:
        pooled_events = np.concatenate(sequences.to_list())
        type_freq = compute_events_freq(pooled_events)
        events_to_remove[incident_type] = [
            event_id
            for event_id, freq in type_freq.items()
            if freq / events_freq_all_classes[event_id] < t
        ]
    return events_to_remove

def remove_event_from_incidents(data_df, events_to_remove_per_incident, allowed_event=None):
    """Drop the selected events from every per-row sequence column.

    For each row, the positions whose ``events_sequence`` value is listed for
    the row's ``incident_type`` are removed from all sequence columns, so the
    columns stay aligned with each other.

    Args:
        data_df: Frame with ``incident_type`` plus the sequence columns listed
            in ``sequence_columns`` below. It is not modified.
        events_to_remove_per_incident: dict mapping incident type to an
            iterable of event ids to remove. Incident types without an entry
            are left untouched.
        allowed_event: Currently unused; kept so existing callers keep working.

    Returns:
        A copy of ``data_df`` whose sequence columns hold the filtered arrays.
    """
    sequence_columns = [
        "vehicles_sequence",
        "events_sequence",
        "seconds_to_incident_sequence",
        "train_kph_sequence",
        "dj_ac_state_sequence",
        "dj_dc_state_sequence",
    ]

    # One boolean "keep" mask per row, computed once from the untouched events
    # column and then reused for every sequence column of that row.
    keep_masks = [
        ~np.isin(
            np.asarray(events),
            list(events_to_remove_per_incident.get(incident_type, ())),
        )
        for events, incident_type in zip(
            data_df["events_sequence"], data_df["incident_type"]
        )
    ]

    filtered_data_df = data_df.copy()
    for column in sequence_columns:
        # An explicit object array keeps each cell a standalone sequence even
        # when all filtered sequences happen to share the same length.
        cells = np.empty(len(data_df), dtype=object)
        for position, (sequence, keep) in enumerate(zip(data_df[column], keep_masks)):
            cells[position] = np.asarray(sequence)[keep]
        filtered_data_df[column] = cells
    return filtered_data_df
    

def filter_events(data_df, t, allowed_event=None):
    """Run the full event-filtering pipeline on ``data_df``.

    Computes the overall event frequencies, derives the events to remove for
    each incident type using threshold ``t``, and returns the frame with those
    events removed. ``allowed_event`` is forwarded unchanged to
    ``remove_event_from_incidents``.
    """
    pooled_events = np.concatenate(data_df["events_sequence"].to_list())
    overall_freq = compute_events_freq(pooled_events)
    removal_plan = compute_events_to_remove(data_df, overall_freq, t)
    return remove_event_from_incidents(data_df, removal_plan, allowed_event)
    
# Filter the loaded data with threshold t = 2: an event is dropped from an
# incident type when its per-type / overall frequency ratio is below 2.
filtered_data_df = filter_events(data_df, 2)

In [None]:
# Notebook check cell: compare the events column before and after filtering.
# NOTE(review): the cells hold whole sequences, so an element-wise `==` may
# not produce one plain boolean per row -- confirm this does what is intended.
filtered_data_df["events_sequence"] == data_df["events_sequence"]

In [None]:
def _contiguous_windows(sequence, length):
    """Return the set of all contiguous slices of ``sequence`` with ``length`` items."""
    return {
        tuple(sequence[start:start + length])
        for start in range(len(sequence) - length + 1)
    }


def lcss(data_df, time_interval, incident_type):
    """Find the longest run of consecutive events shared by all rows of a type.

    Only rows whose ``incident_type`` equals the given one are considered. In
    each row, events whose timestamp lies outside ``time_interval`` (bounds
    inclusive) are discarded and consecutive repeats of the same event are
    collapsed into one. The result is the longest contiguous slice, of at
    least two events, that occurs in every such row. Among slices of equal
    length, the one starting earliest in the first row wins.

    Args:
        data_df: Frame with ``incident_type``, ``events_sequence`` and
            ``seconds_to_incident_sequence`` columns.
        time_interval: ``(start_time, end_time)`` pair.
        incident_type: Incident type whose rows are compared.

    Returns:
        The common slice as a list of events, or ``None`` when there are no
        matching rows or no common slice of length >= 2 (including the case
        where some row has no event inside the interval).
    """
    sub_data = data_df[data_df['incident_type'] == incident_type]
    start_time, end_time = time_interval

    collapsed_rows = []
    for events, times in zip(
        sub_data['events_sequence'], sub_data['seconds_to_incident_sequence']
    ):
        collapsed = []
        for event, time in zip(events, times):
            if not start_time <= time <= end_time:
                continue
            # Skip an event equal to the previously kept one (run collapsing).
            if not collapsed or event != collapsed[-1]:
                collapsed.append(event)
        collapsed_rows.append(collapsed)

    if not collapsed_rows:
        return None

    first_row, other_rows = collapsed_rows[0], collapsed_rows[1:]
    # A common slice cannot be longer than the shortest row.
    longest_possible = min(len(row) for row in collapsed_rows)

    # Try lengths from longest to shortest so the first hit is the answer;
    # this avoids materialising every slice of every row up front.
    for length in range(longest_possible, 1, -1):
        windows_per_row = [_contiguous_windows(row, length) for row in other_rows]
        for start in range(len(first_row) - length + 1):
            candidate = first_row[start:start + length]
            key = tuple(candidate)
            if all(key in windows for windows in windows_per_row):
                return candidate
    return None

