## Loading the data


In [None]:
import pandas as pd
import numpy as np
import ast

# Read the CSV file (semicolon-separated)
df = pd.read_csv('sncb_data_challenge.csv', sep=';')


def _split_sequence(text):
    """Split a stringified list such as "[a, b, c]" into its raw string items.

    An empty list ("[]") yields no items. Without this guard,
    ``"".split(", ")`` returns ``['']``, which would create a phantom
    one-element sequence for the str/bool columns and make the float
    conversion raise ``ValueError``.
    """
    inner = text.strip("[]")
    return inner.split(", ") if inner else []


# Convert the arrays columns to NumPy arrays (for str arrays)
for column in ["vehicles_sequence", "events_sequence"]:
    df[column] = df[column].apply(lambda x: np.array(_split_sequence(x), dtype=str))

# Convert the arrays columns to NumPy arrays (for float arrays)
for column in ["seconds_to_incident_sequence", "train_kph_sequence"]:
    df[column] = df[column].apply(lambda x: np.array(_split_sequence(x), dtype=float))

# Convert the arrays columns to NumPy arrays (for bool arrays):
# an item is True only when its text is exactly "True".
for column in ["dj_ac_state_sequence", "dj_dc_state_sequence"]:
    df[column] = df[column].apply(lambda x: (np.array(_split_sequence(x), dtype=object) == "True"))


### Removing Events with only one occurrence

In [62]:
# Flatten every row's event sequence into one array with a single concatenate
# (growing the array row by row re-copies everything on each iteration).
event_sequences = df['events_sequence'].tolist()
data = np.concatenate(event_sequences) if event_sequences else np.array([])

# One row per event type with its total number of occurrences, most frequent first.
typeAll_events_count = (
    pd.Series(data, name='event_type')
    .value_counts()
    .rename('count')
    .rename_axis('event_type')
    .reset_index()
)

# event_freq = fraction of rows whose sequence contains the event type at least once.
# Each row contributes its distinct event types once, so a single value_counts
# replaces a full scan of the frame per event type.
rows_per_event = pd.Series(
    np.concatenate([np.unique(seq) for seq in event_sequences]) if event_sequences else np.array([])
).value_counts()
typeAll_events_count['event_freq'] = typeAll_events_count['event_type'].map(rows_per_event) / len(df)

# Event types that occur exactly once in the whole data set.
event_type_df = typeAll_events_count.loc[typeAll_events_count['count'] == 1, ['event_type', 'count']]
event_type_array = event_type_df['event_type'].values

In [68]:
def remove_specific_events_from_sequences(df, event_type_array):
    """Remove the given event types from every row's aligned sequences.

    For each row, every position whose ``events_sequence`` entry is one of
    ``event_type_array`` is removed from all six per-event sequence columns,
    so the columns stay aligned position by position. Rows whose event
    sequence ends up empty are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six sequence columns listed in ``sequence_columns``.
    event_type_array : iterable
        Event types to remove.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df``; the input frame is left untouched. The
        filtered sequences are stored as Python lists.

    Raises
    ------
    ValueError
        If the sequences of a row have different lengths after filtering.
    """
    sequence_columns = [
        'events_sequence',
        'vehicles_sequence',
        'seconds_to_incident_sequence',
        'train_kph_sequence',
        'dj_ac_state_sequence',
        'dj_dc_state_sequence',
    ]

    # Build the lookup once; set membership keeps the per-event test O(1).
    events_to_remove = set(event_type_array)

    result_df = df.copy()

    for idx, row in result_df.iterrows():
        # Positions to drop are computed here rather than through an external
        # helper, so the cell runs on a fresh kernel.
        positions_to_remove = {
            i for i, event in enumerate(row['events_sequence']) if event in events_to_remove
        }

        # Apply the same positional filter to every aligned sequence.
        filtered = {
            column: [item for i, item in enumerate(row[column]) if i not in positions_to_remove]
            for column in sequence_columns
        }

        # Check if all sequences have the same length after filtering
        if len({len(sequence) for sequence in filtered.values()}) > 1:
            raise ValueError(f"Row {idx}: Sequences have different lengths after filtering.")

        # Update the DataFrame with filtered sequences
        for column, sequence in filtered.items():
            result_df.at[idx, column] = sequence

    # Drop rows whose event sequence became empty
    result_df = result_df[result_df['events_sequence'].str.len() > 0]

    return result_df

# Strip the event types that occur only once (event_type_array) from every row's sequences.
# NOTE: this rebinds `df`; the unfiltered frame is no longer available after this cell.
df = remove_specific_events_from_sequences(df, event_type_array)


### Dividing by time

In [117]:
def filter_seconds_to_incident(df, lower_bound, upper_bound):
    """Keep only the events whose time to incident lies in [lower_bound, upper_bound].

    For each row, positions of ``seconds_to_incident_sequence`` inside the
    inclusive window are kept and the same positions are selected from all
    six aligned sequence columns. Rows with no event inside the window are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six sequence columns listed in ``sequence_columns``.
    lower_bound, upper_bound : number
        Inclusive bounds applied to ``seconds_to_incident_sequence``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df``; the input frame is left untouched. The
        filtered sequences are stored as NumPy arrays.

    Raises
    ------
    ValueError
        If the sequences of a row do not all have the same length.
    """
    sequence_columns = [
        'events_sequence',
        'vehicles_sequence',
        'seconds_to_incident_sequence',
        'train_kph_sequence',
        'dj_ac_state_sequence',
        'dj_dc_state_sequence',
    ]

    result_df = df.copy()

    rows_to_drop = []
    for idx, row in result_df.iterrows():
        sequences = {column: np.asarray(row[column]) for column in sequence_columns}

        # Validate alignment BEFORE filtering: once every sequence has been
        # indexed with the same positions the results always have equal
        # length, so a misaligned row could no longer be detected.
        if len({len(sequence) for sequence in sequences.values()}) > 1:
            raise ValueError(f"Row {idx}: Sequences have different lengths.")

        times = sequences['seconds_to_incident_sequence']
        in_window = (times >= lower_bound) & (times <= upper_bound)

        if not in_window.any():
            rows_to_drop.append(idx)
            continue

        for column, sequence in sequences.items():
            result_df.at[idx, column] = sequence[in_window]

    result_df = result_df.drop(rows_to_drop)

    return result_df

# Split every row's sequences into a window before and a window after the incident,
# based on seconds_to_incident_sequence. Both bounds are inclusive, so events at
# exactly 0 end up in both frames.
# (-14400 = 4 h before, 600 = 10 min after, assuming the column is in seconds.)
df_before_incident = filter_seconds_to_incident(df, -14400, 0)
df_after_incident = filter_seconds_to_incident(df, 0, 600)

In [75]:
# Preview the first two rows of the pre-incident frame.
df_before_incident.head(2)

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,0,4432881,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[2744, 4004, 2852, 4110, 2854, 4396, 1132, 414...","[-5510.0, -5510.0, -5507.0, -5507.0, -5506.0, ...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",4
1,1,4432943,"[526, 526, 526, 526, 526, 526, 526, 526, 526, ...","[2744, 4148, 4394, 1566, 1570, 4396, 3634, 412...","[-8573.0, -8573.0, -8032.0, -8032.0, -8032.0, ...",51.037435,4.431218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.1,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13


## Getting Common Subsequences
In this part, we try to extract the longest-common subsequences for the least-frequent incident types with the aim of understanding what patterns those incidents have as opposed to the more-frequent incident types. We implement it by manually counting all the n-sized subsequences in the full event sequences and returning the total count of subsequences as well as the percentage of incidents that contain those subsequences.

Unlike other LCSS algorithms, this does not consider non-consecutive patterns. We couldn't make PrefixSpan and other sequence mining algorithms work properly locally. It would be good to clean the data further and figure out how to utilize the known algorithms to extract better sequences which we could possibly use as features for our models.

In [118]:
from collections import Counter

def get_common_subsequences_with_percentages(events_list, n=3, top_n=5):
    """Rank the contiguous length-``n`` subsequences found in ``events_list``.

    Returns at most ``top_n`` tuples ``(subsequence, occurrences, share)``:
    ``occurrences`` is the total number of times the window appears over all
    sequences, and ``share`` is the percentage of sequences containing it at
    least once, formatted with two decimals and a trailing ``%``. Entries are
    ordered by share, then by occurrences, both descending.
    """
    sequence_total = len(events_list)
    occurrences = Counter()           # every appearance of a window
    containing_sequences = Counter()  # number of sequences holding the window

    for events in events_list:
        windows = [tuple(events[start:start + n]) for start in range(len(events) - n + 1)]
        occurrences.update(windows)
        # A set so that each sequence counts a given window only once.
        containing_sequences.update(set(windows))

    ranked = [
        (window, total, f"{(containing_sequences[window] / sequence_total) * 100:.2f}%")
        for window, total in occurrences.items()
    ]

    # Order by the rounded share (parsed back from its text form), then by raw count.
    ranked = sorted(ranked, key=lambda entry: (float(entry[2][:-1]), entry[1]), reverse=True)
    return ranked[:top_n]

In [121]:
# For All Incident Types: top-10 contiguous subsequences over every row of `df`,
# for each window length. Each printed line shows the subsequence, its total
# number of occurrences and the share of incidents containing it at least once.
subsequence_lengths = [2, 3, 4, 5]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df['events_sequence'], n, 10)
    print(f"Most common event subsequences for all incident types (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for all incident types (length 2):
('3636', '3658'): 20252 (85.56%)
('2956', '2956'): 262043 (84.87%)
('4066', '3636'): 11374 (80.61%)
('4068', '3636'): 11429 (80.32%)
('2956', '4066'): 11090 (77.35%)
('2956', '4068'): 11106 (75.37%)
('2742', '4026'): 1446 (74.48%)
('2744', '4026'): 1406 (72.60%)
('4026', '4148'): 1450 (69.73%)
('4124', '2956'): 4341 (69.14%)

Most common event subsequences for all incident types (length 3):
('2956', '2956', '2956'): 234419 (84.17%)
('2956', '2956', '4066'): 10892 (76.85%)
('4066', '3636', '3658'): 8740 (76.66%)
('4068', '3636', '3658'): 8619 (75.27%)
('2956', '2956', '4068'): 10908 (75.17%)
('2956', '4066', '3636'): 9721 (72.60%)
('2956', '4068', '3636'): 9720 (71.61%)
('4124', '2956', '2956'): 4124 (67.75%)
('3658', '2956', '2956'): 9358 (67.06%)
('3636', '3658', '2956'): 9511 (66.96%)

Most common event subsequences for all incident types (length 4):
('2956', '2956', '2956', '2956'): 208758 (83.78%)
('2956', '2956', '2

In [78]:
# Incident type 3 only, full sequences (`df`): top-10 subsequences per window length.
incident_type_to_analyze = 3
df_specific = df[df['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for Incident Type 3 (length 2):
('3636', '3658'): 132 (100.00%)
('4066', '3636'): 66 (100.00%)
('4068', '3636'): 56 (100.00%)
('4068', '2708'): 11 (100.00%)
('2742', '4026'): 10 (100.00%)
('2708', '2744'): 10 (100.00%)
('2956', '2956'): 1804 (80.00%)
('3658', '2956'): 67 (80.00%)
('2956', '4066'): 56 (80.00%)
('2956', '4068'): 48 (80.00%)

Most common event subsequences for Incident Type 3 (length 3):
('4066', '3636', '3658'): 59 (100.00%)
('4068', '3636', '3658'): 55 (100.00%)
('2956', '2956', '2956'): 1665 (80.00%)
('3636', '3658', '2956'): 67 (80.00%)
('3658', '2956', '2956'): 62 (80.00%)
('2956', '2956', '4066'): 54 (80.00%)
('2956', '4066', '3636'): 51 (80.00%)
('2956', '2956', '4068'): 47 (80.00%)
('2956', '4068', '3636'): 40 (80.00%)
('4124', '2956', '2956'): 24 (80.00%)

Most common event subsequences for Incident Type 3 (length 4):
('2956', '2956', '2956', '2956'): 1540 (80.00%)
('3636', '3658', '2956', '2956'): 62 (80.00%)
('3658', '2956', '2956

In [79]:
# Incident type 3 only, restricted to the pre-incident window (`df_before_incident`):
# top-10 subsequences per window length.
incident_type_to_analyze = 3
df_specific = df_before_incident[df_before_incident['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences BEFORE INCIDENT for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences BEFORE INCIDENT for Incident Type 3 (length 2):
('3636', '3658'): 121 (100.00%)
('4066', '3636'): 58 (100.00%)
('4068', '3636'): 54 (100.00%)
('4068', '2708'): 9 (100.00%)
('2708', '2744'): 7 (100.00%)
('2956', '2956'): 1652 (80.00%)
('3658', '2956'): 61 (80.00%)
('2956', '4066'): 47 (80.00%)
('2956', '4068'): 45 (80.00%)
('4124', '2956'): 20 (80.00%)

Most common event subsequences BEFORE INCIDENT for Incident Type 3 (length 3):
('4068', '3636', '3658'): 53 (100.00%)
('4066', '3636', '3658'): 52 (100.00%)
('2956', '2956', '2956'): 1532 (80.00%)
('3636', '3658', '2956'): 61 (80.00%)
('3658', '2956', '2956'): 57 (80.00%)
('2956', '2956', '4066'): 46 (80.00%)
('2956', '4066', '3636'): 45 (80.00%)
('2956', '2956', '4068'): 44 (80.00%)
('2956', '4068', '3636'): 38 (80.00%)
('4124', '2956', '2956'): 19 (80.00%)

Most common event subsequences BEFORE INCIDENT for Incident Type 3 (length 4):
('2956', '2956', '2956', '2956'): 1423 (80.00%)
('3636', '3658', '2956'

In [80]:
# Incident type 3 only, restricted to the post-incident window (`df_after_incident`):
# top-10 subsequences per window length.
incident_type_to_analyze = 3
df_specific = df_after_incident[df_after_incident['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences AFTER INCIDENT for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences AFTER INCIDENT for Incident Type 3 (length 2):
('2956', '2956'): 30 (100.00%)
('3636', '3658'): 4 (75.00%)
('2956', '4066'): 3 (75.00%)
('4066', '3636'): 4 (50.00%)
('4124', '2956'): 2 (50.00%)
('4066', '4066'): 2 (25.00%)
('1872', '4068'): 2 (25.00%)
('2956', '4168'): 2 (25.00%)
('3658', '4124'): 1 (25.00%)
('4124', '4066'): 1 (25.00%)

Most common event subsequences AFTER INCIDENT for Incident Type 3 (length 3):
('2956', '2956', '2956'): 23 (75.00%)
('2956', '2956', '4066'): 3 (75.00%)
('4066', '3636', '3658'): 3 (50.00%)
('4124', '2956', '2956'): 2 (50.00%)
('2956', '4066', '3636'): 2 (50.00%)
('3636', '3658', '4124'): 1 (25.00%)
('3658', '4124', '4066'): 1 (25.00%)
('4124', '4066', '3636'): 1 (25.00%)
('4066', '3636', '4124'): 1 (25.00%)
('3636', '4124', '2956'): 1 (25.00%)

Most common event subsequences AFTER INCIDENT for Incident Type 3 (length 4):
('2956', '2956', '2956', '2956'): 18 (75.00%)
('4124', '2956', '2956', '2956'): 2 (50.00%)
('2956', '

In [112]:
# Incident type 6 only, full sequences (`df`): top-10 subsequences per window length.
incident_type_to_analyze = 6
df_specific = df[df['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5, 6]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for Incident Type 6 (length 2):
('4110', '2854'): 21 (100.00%)
('2708', '2742'): 10 (100.00%)
('2852', '4110'): 16 (83.33%)
('2854', '4026'): 15 (83.33%)
('4168', '4140'): 11 (83.33%)
('2742', '4026'): 11 (83.33%)
('4026', '4148'): 7 (83.33%)
('3636', '3658'): 68 (66.67%)
('4068', '3636'): 23 (66.67%)
('4016', '4026'): 12 (66.67%)

Most common event subsequences for Incident Type 6 (length 3):
('2852', '4110', '2854'): 16 (83.33%)
('4110', '2854', '4026'): 11 (83.33%)
('4068', '3636', '3658'): 23 (66.67%)
('2708', '2742', '4026'): 6 (66.67%)
('2742', '4026', '4148'): 4 (66.67%)
('2956', '2956', '2956'): 553 (50.00%)
('3636', '3658', '2956'): 41 (50.00%)
('3658', '2956', '2956'): 40 (50.00%)
('2956', '2956', '4066'): 35 (50.00%)
('2956', '4066', '3636'): 31 (50.00%)

Most common event subsequences for Incident Type 6 (length 4):
('2852', '4110', '2854', '4026'): 10 (66.67%)
('2956', '2956', '2956', '2956'): 481 (50.00%)
('3636', '3658', '2956', '2956'): 40

In [114]:
# Incident type 6 only, restricted to the pre-incident window (`df_before_incident`):
# top-10 subsequences per window length.
incident_type_to_analyze = 6
df_specific = df_before_incident[df_before_incident['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5, 6]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences BEFORE INCIDENT for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences BEFORE INCIDENT for Incident Type 6 (length 2):
('4110', '2854'): 11 (83.33%)
('2852', '4110'): 9 (83.33%)
('2854', '4026'): 8 (66.67%)
('4016', '4026'): 7 (66.67%)
('2708', '4026'): 6 (66.67%)
('4168', '4140'): 5 (66.67%)
('2956', '2956'): 619 (50.00%)
('3636', '3658'): 65 (50.00%)
('3658', '2956'): 41 (50.00%)
('2956', '4066'): 36 (50.00%)

Most common event subsequences BEFORE INCIDENT for Incident Type 6 (length 3):
('2852', '4110', '2854'): 9 (83.33%)
('4110', '2854', '4026'): 6 (66.67%)
('2956', '2956', '2956'): 542 (50.00%)
('3636', '3658', '2956'): 41 (50.00%)
('3658', '2956', '2956'): 40 (50.00%)
('2956', '2956', '4066'): 35 (50.00%)
('2956', '4066', '3636'): 31 (50.00%)
('4066', '3636', '3658'): 31 (50.00%)
('2956', '2956', '4068'): 28 (50.00%)
('4068', '3636', '3658'): 22 (50.00%)

Most common event subsequences BEFORE INCIDENT for Incident Type 6 (length 4):
('2852', '4110', '2854', '4026'): 6 (66.67%)
('2956', '2956', '2956', '2956'): 471 (50

In [166]:
# Incident type 7 only, full sequences (`df`): top-10 subsequences per window length.
incident_type_to_analyze = 7
df_specific = df[df['incident_type'] == incident_type_to_analyze]
# 17 is an additional, much longer window probed for this incident type.
subsequence_lengths = [2, 3, 4, 5, 17]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for Incident Type 7 (length 2):
('2956', '2956'): 1650 (100.00%)
('3636', '3658'): 78 (100.00%)
('2956', '4068'): 45 (100.00%)
('3658', '4078'): 36 (100.00%)
('2956', '4066'): 34 (100.00%)
('4066', '3636'): 27 (100.00%)
('4124', '2956'): 20 (100.00%)
('4078', '2956'): 20 (100.00%)
('2682', '2956'): 14 (100.00%)
('2956', '2682'): 13 (100.00%)

Most common event subsequences for Incident Type 7 (length 3):
('2956', '2956', '2956'): 1530 (100.00%)
('2956', '2956', '4068'): 45 (100.00%)
('3636', '3658', '4078'): 36 (100.00%)
('2956', '2956', '4066'): 34 (100.00%)
('4066', '3636', '3658'): 26 (100.00%)
('2956', '4066', '3636'): 25 (100.00%)
('4124', '2956', '2956'): 20 (100.00%)
('4078', '2956', '2956'): 20 (100.00%)
('2682', '2956', '2956'): 13 (100.00%)
('3636', '3658', '2956'): 11 (100.00%)

Most common event subsequences for Incident Type 7 (length 4):
('2956', '2956', '2956', '2956'): 1420 (100.00%)
('2956', '2956', '2956', '4068'): 44 (100.00%)
('2956', 

In [113]:
# Incident type 6 only, restricted to the post-incident window (`df_after_incident`):
# top-10 subsequences per window length.
incident_type_to_analyze = 6
df_specific = df_after_incident[df_after_incident['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5, 6]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences AFTER INCIDENT for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences AFTER INCIDENT for Incident Type 6 (length 2):
('1640', '1660'): 2 (40.00%)
('3260', '4092'): 2 (40.00%)
('1202', '1202'): 110 (20.00%)
('2934', '1202'): 57 (20.00%)
('1202', '2934'): 55 (20.00%)
('2934', '2934'): 53 (20.00%)
('1566', '1570'): 19 (20.00%)
('1202', '2088'): 14 (20.00%)
('2088', '1202'): 14 (20.00%)
('1202', '1566'): 11 (20.00%)

Most common event subsequences AFTER INCIDENT for Incident Type 6 (length 3):
('1202', '1202', '1202'): 57 (20.00%)
('1202', '2934', '1202'): 28 (20.00%)
('2934', '1202', '2934'): 25 (20.00%)
('2934', '2934', '2934'): 23 (20.00%)
('1202', '2934', '2934'): 21 (20.00%)
('2934', '1202', '1202'): 19 (20.00%)
('1202', '1202', '2934'): 17 (20.00%)
('2934', '2934', '1202'): 16 (20.00%)
('1202', '2088', '1202'): 11 (20.00%)
('1202', '1566', '1570'): 11 (20.00%)

Most common event subsequences AFTER INCIDENT for Incident Type 6 (length 4):
('1202', '1202', '1202', '1202'): 33 (20.00%)
('2934', '1202', '2934', '2934'): 14 (2

In [163]:
# Incident type 16 only, full sequences (`df`): top-10 subsequences per window length.
incident_type_to_analyze = 16
df_specific = df[df['incident_type'] == incident_type_to_analyze]
# 11 is an additional, longer window probed for this incident type.
subsequence_lengths = [2, 3, 4, 5, 6, 11]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for Incident Type 16 (length 2):
('2956', '2956'): 640 (100.00%)
('3636', '3658'): 57 (100.00%)
('3658', '2956'): 28 (100.00%)
('2956', '4066'): 23 (100.00%)
('2956', '4068'): 31 (75.00%)
('4068', '3636'): 31 (75.00%)
('4066', '3636'): 22 (75.00%)
('4124', '2956'): 12 (75.00%)
('4120', '2956'): 12 (75.00%)
('3658', '4124'): 11 (75.00%)

Most common event subsequences for Incident Type 16 (length 3):
('2956', '2956', '2956'): 580 (100.00%)
('3636', '3658', '2956'): 28 (100.00%)
('3658', '2956', '2956'): 28 (100.00%)
('2956', '2956', '4066'): 23 (100.00%)
('2956', '2956', '4068'): 31 (75.00%)
('4068', '3636', '3658'): 31 (75.00%)
('2956', '4068', '3636'): 30 (75.00%)
('4066', '3636', '3658'): 22 (75.00%)
('2956', '4066', '3636'): 21 (75.00%)
('4120', '2956', '2956'): 12 (75.00%)

Most common event subsequences for Incident Type 16 (length 4):
('2956', '2956', '2956', '2956'): 521 (100.00%)
('3636', '3658', '2956', '2956'): 28 (100.00%)
('3658', '2956', '295

In [152]:
# Incident type 17 only, full sequences (`df`): top-10 subsequences per window length.
incident_type_to_analyze = 17
df_specific = df[df['incident_type'] == incident_type_to_analyze]
subsequence_lengths = [2, 3, 4, 5]
for n in subsequence_lengths:
    common_subsequences = get_common_subsequences_with_percentages(df_specific['events_sequence'], n, 10)
    print(f"Most common event subsequences for Incident Type {incident_type_to_analyze} (length {n}):")
    for seq, count, percentage in common_subsequences:
        print(f"{seq}: {count} ({percentage})")
    print()

Most common event subsequences for Incident Type 17 (length 2):
('2708', '2742'): 24 (100.00%)
('2956', '2956'): 2422 (90.00%)
('2742', '4026'): 28 (90.00%)
('4066', '3636'): 96 (80.00%)
('4120', '2956'): 38 (80.00%)
('3254', '3254'): 29 (80.00%)
('4168', '4140'): 20 (80.00%)
('2708', '4026'): 20 (80.00%)
('4026', '4148'): 19 (80.00%)
('2744', '4026'): 18 (80.00%)

Most common event subsequences for Incident Type 17 (length 3):
('2956', '2956', '2956'): 2174 (90.00%)
('2708', '2742', '4026'): 19 (90.00%)
('4120', '2956', '2956'): 35 (80.00%)
('2858', '2658', '2688'): 15 (80.00%)
('3254', '3254', '3254'): 13 (80.00%)
('2956', '2956', '4068'): 101 (70.00%)
('2956', '2956', '4066'): 92 (70.00%)
('2956', '4068', '3636'): 91 (70.00%)
('2956', '4066', '3636'): 83 (70.00%)
('4406', '4410', '4408'): 12 (70.00%)

Most common event subsequences for Incident Type 17 (length 4):
('2956', '2956', '2956', '2956'): 1938 (90.00%)
('4120', '2956', '2956', '2956'): 34 (80.00%)
('2956', '2956', '2956', '

## Getting Subsequences Across All Incidents

In [None]:
# Find common subsequences and, for each incident type, the share of its incidents containing them
def get_common_subsequences_across_with_percentages(df, n=3, top_n=10):
    """Rank contiguous length-``n`` subsequences and break their coverage down by incident type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``events_sequence`` column (one sequence per row) and an
        ``incident_type`` column.
    n : int
        Window length of the subsequences.
    top_n : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Subsequence`` (tuple), ``Count`` (total occurrences over all
        rows) and ``Percentages`` (dict mapping incident type to the share of
        that type's rows containing the subsequence at least once, formatted
        with two decimals and a trailing ``%``). Rows are sorted by ``Count``
        descending. Incident types none of whose rows produce a length-``n``
        window are absent from the ``Percentages`` dicts.
    """
    subsequence_counts = Counter()   # total occurrences of each subsequence
    incident_type_counts = {}        # incident type -> Counter(subsequence -> rows containing it)
    incident_type_sizes = Counter()  # incident type -> number of rows of that type

    for _, row in df.iterrows():
        events = row['events_sequence']
        incident_type = row['incident_type']
        # Tallied once per row here instead of re-filtering the frame for
        # every (subsequence, incident type) pair further down.
        incident_type_sizes[incident_type] += 1

        event_subsequences = set()
        for i in range(len(events) - n + 1):
            subsequence = tuple(events[i:i+n])
            subsequence_counts[subsequence] += 1
            event_subsequences.add(subsequence)

        # The set makes each row count a given subsequence only once.
        if event_subsequences:
            if incident_type not in incident_type_counts:
                incident_type_counts[incident_type] = Counter()
            incident_type_counts[incident_type].update(event_subsequences)

    result = []

    for seq, count in subsequence_counts.items():
        percentages = {}

        for incident_type, counts in incident_type_counts.items():
            # A Counter returns 0 for a missing key, which yields "0.00%".
            percentage = (counts[seq] / incident_type_sizes[incident_type]) * 100
            percentages[incident_type] = f"{percentage:.2f}%"

        result.append((seq, count, percentages))

    result.sort(key=lambda x: x[1], reverse=True)
    result_df = pd.DataFrame(result, columns=['Subsequence', 'Count', 'Percentages'])
    return result_df.head(top_n)

# Ten most frequent length-3 subsequences over all incidents, with the per-incident-type
# share of incidents containing each of them.
top_subsequences = get_common_subsequences_across_with_percentages(df, n=3, top_n=10)
print("Most common subsequences:")
for index, row in top_subsequences.iterrows():
    print(f"Subsequence: {row['Subsequence']}, Count: {row['Count']}, Percentages: {row['Percentages']}")



**For Length-2 Subsequences, here are the most common:**

(2956, 2956):
Count: 262043, Percentages: {4: '84.62%', 13: '85.53%', 14: '79.87%', 2: '92.44%', 11: '80.77%', 99: '82.29%', 9: '87.18%', 17: '90.00%', 3: '80.00%', 16: '100.00%', 6: '50.00%', 7: '100.00%'}

(3636, 3658):
Count: 20252, Percentages: {4: '83.33%', 13: '87.74%', 14: '79.19%', 2: '94.12%', 11: '80.77%', 99: '76.00%', 9: '96.58%', 17: '70.00%', 3: '100.00%', 16: '100.00%', 6: '66.67%', 7: '100.00%'}

(4068, 3636):
Count: 11429, Percentages: {4: '78.21%', 13: '83.33%', 14: '75.17%', 2: '91.60%', 11: '80.77%', 99: '62.86%', 9: '95.73%', 17: '70.00%', 3: '100.00%', 16: '75.00%', 6: '66.67%', 7: '75.00%'}

(4066, 3636):
Count: 11374, Percentages: {4: '79.49%', 13: '83.33%', 14: '74.50%', 2: '91.60%', 11: '73.08%', 99: '65.14%', 9: '95.73%', 17: '80.00%', 3: '100.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}

(2956, 4068):
Count: 11106, Percentages: {4: '74.36%', 13: '78.62%', 14: '71.14%', 2: '84.03%', 11: '76.92%', 99: '62.86%', 9: '82.91%', 17: '70.00%', 3: '80.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}



**For Length-3 Subsequences, here are the most common:**

(2956, 2956, 2956), Count: 234419, Percentages: {4: '84.62%', 13: '85.22%', 14: '79.19%', 2: '90.76%', 11: '80.77%', 99: '80.57%', 9: '87.18%', 17: '90.00%', 3: '80.00%', 16: '100.00%', 6: '50.00%', 7: '100.00%'}


(2956, 2956, 4068), Count: 10908, Percentages: {4: '74.36%', 13: '78.30%', 14: '71.14%', 2: '84.03%', 11: '76.92%', 99: '62.86%', 9: '82.05%', 17: '70.00%', 3: '80.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}


(2956, 2956, 4066), Count: 10892, Percentages: {4: '78.21%', 13: '78.30%', 14: '72.48%', 2: '85.71%', 11: '76.92%', 99: '66.29%', 9: '84.62%', 17: '70.00%', 3: '80.00%', 16: '100.00%', 6: '50.00%', 7: '100.00%'}


(2956, 4066, 3636), Count: 9721, Percentages: {4: '74.36%', 13: '74.53%', 14: '67.79%', 2: '80.67%', 11: '69.23%', 99: '60.00%', 9: '83.76%', 17: '70.00%', 3: '80.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}


(2956, 4068, 3636), Count: 9720, Percentages: {4: '70.51%', 13: '75.16%', 14: '65.10%', 2: '81.51%', 11: '76.92%', 99: '57.71%', 9: '81.20%', 17: '70.00%', 3: '80.00%', 16: '75.00%', 6: '50.00%', 7: '75.00%'}


(3636, 3658, 2956), Count: 9511, Percentages: {4: '65.38%', 13: '68.24%', 14: '60.40%', 2: '77.31%', 11: '57.69%', 99: '58.29%', 9: '76.07%', 17: '60.00%', 3: '80.00%', 16: '100.00%', 6: '50.00%', 7: '100.00%'}


(3658, 2956, 2956), Count: 9358, Percentages: {4: '66.67%', 13: '67.92%', 14: '61.07%', 2: '77.31%', 11: '57.69%', 99: '58.29%', 9: '76.92%', 17: '60.00%', 3: '80.00%', 16: '100.00%', 6: '50.00%', 7: '75.00%'}


(4066, 3636, 3658), Count: 8740, Percentages: {4: '73.08%', 13: '79.56%', 14: '71.81%', 2: '88.24%', 11: '65.38%', 99: '62.29%', 9: '90.60%', 17: '60.00%', 3: '100.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}


(4068, 3636, 3658), Count: 8619, Percentages: {4: '67.95%', 13: '78.62%', 14: '69.80%', 2: '89.08%', 11: '73.08%', 99: '60.00%', 9: '88.03%', 17: '60.00%', 3: '100.00%', 16: '75.00%', 6: '66.67%', 7: '75.00%'}


(4124, 2956, 2956), Count: 4124, Percentages: {4: '66.67%', 13: '68.55%', 14: '63.09%', 2: '72.27%', 11: '69.23%', 99: '62.86%', 9: '74.36%', 17: '60.00%', 3: '80.00%', 16: '75.00%', 6: '50.00%', 7: '100.00%'}




## Checking Specific Subsequences Across All Incidents

In [177]:
def check_multiple_subsequences(df, subsequences):
    """Count, per incident type, the rows whose event sequence contains ALL given subsequences.

    A subsequence matches when it appears as a contiguous run inside the row's
    ``events_sequence``. Returns a dict with the input ``subsequences``, the
    number of matching rows per incident type (``counts``) and that number as
    a percentage of the rows of the type (``percentages``).
    """
    def contains(events, pattern):
        # Slide a window of the pattern's length over the events.
        width = len(pattern)
        for start in range(len(events) - width + 1):
            if np.array_equal(events[start:start + width], list(pattern)):
                return True
        return False

    incident_counts = df['incident_type'].value_counts().to_dict()
    subsequence_counts = dict.fromkeys(incident_counts.keys(), 0)

    for _, record in df.iterrows():
        sequence = record['events_sequence']
        incident_type = record['incident_type']

        if all(contains(sequence, pattern) for pattern in subsequences):
            subsequence_counts[incident_type] += 1

    percentages = {}
    for incident, matched in subsequence_counts.items():
        if incident_counts[incident] > 0:
            percentages[incident] = (matched / incident_counts[incident]) * 100
        else:
            percentages[incident] = 0

    return {
        'subsequences': subsequences,
        'counts': subsequence_counts,
        'percentages': percentages
    }

# Alternative pattern set (disabled):
# subsequences_to_check = [
#     ('2682', '2956', '2956', '2956', '2956'),
#     ('4124', '2956', '2956', '2956', '2956'),
#     ('4078', '2956', '2956', '2956', '2956'),
#      ('2956', '2956', '2956', '2956', '4068'),
#      ('2956', '2956', '4066', '3636', '3658'),
#      # ('4120', '2956', '2956', '2956', '2956')
# ]

# Alternative pattern set (disabled):
# subsequences_to_check = [
#     ('2708', '2742', '4026'),
#     # ('2858', '2658', '2688'),
#     ('4120', '2956', '2956', '2956', '2956')
# ]

# Active pattern set: an incident is counted only if it contains ALL of these
# contiguous subsequences.
subsequences_to_check = [
    ('3636', '3658', '2956'),
    ('3658', '2956', '2956'),
    ('2956', '2956', '4066')
]



result = check_multiple_subsequences(df, subsequences_to_check)

# Share of incidents of each type that contain every pattern.
print(f"Subsequences: {result['subsequences']}")
print("Percentages per incident type:")
for incident_type, percentage in result['percentages'].items():
    print(f"{incident_type}: {percentage:.2f}%")


Subsequences: [('3636', '3658', '2956'), ('3658', '2956', '2956'), ('2956', '2956', '4066')]
Percentages per incident type:
13: 66.35%
99: 56.00%
14: 58.39%
2: 75.63%
9: 76.07%
4: 61.54%
11: 53.85%
17: 50.00%
6: 50.00%
3: 80.00%
16: 100.00%
7: 75.00%


## Subsequences occurring for ONLY one Incident Type

In [146]:
def find_subsequences_for_incident_type(df, target_incident_type, n=3, minsup = 0.6, max_other_types=3):
    """Find length-``n`` subsequences that are frequent in one incident type and rare elsewhere.

    A contiguous subsequence is kept when
    * it occurs in at least ``minsup`` (a fraction) of the rows whose
      ``incident_type`` equals ``target_incident_type``, and
    * it occurs in rows of at most ``max_other_types`` other incident types.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``events_sequence`` column (one sequence per row) and an
        ``incident_type`` column.
    target_incident_type
        Incident type the subsequences must be frequent in.
    n : int
        Window length of the subsequences.
    minsup : float
        Minimum fraction of the target type's rows that must contain the subsequence.
    max_other_types : int
        Maximum number of other incident types allowed to contain the subsequence.

    Returns
    -------
    pandas.DataFrame
        Columns ``Subsequence`` (tuple), ``Count`` (number of target-type rows
        containing it) and ``Incident Counts`` (Counter mapping incident type
        to the number of its rows containing the subsequence).
    """
    # subsequence -> Counter(incident type -> number of rows containing it)
    incident_type_counts = {}

    for _, row in df.iterrows():
        events = row['events_sequence']
        incident_type = row['incident_type']

        # A set so that each row counts a given subsequence only once.
        event_subsequences = set()
        for i in range(len(events) - n + 1):
            event_subsequences.add(tuple(events[i:i+n]))

        for subsequence in event_subsequences:
            if subsequence not in incident_type_counts:
                incident_type_counts[subsequence] = Counter()
            incident_type_counts[subsequence][incident_type] += 1

    # The support threshold depends only on the target type, so it is computed
    # once instead of re-filtering the frame for every candidate subsequence.
    target_rows = int((df['incident_type'] == target_incident_type).sum())
    min_support_count = minsup * target_rows

    result = []

    for seq, counts in incident_type_counts.items():
        if target_incident_type in counts and counts[target_incident_type] >= min_support_count:
            # Number of incident types other than the target that contain the subsequence.
            other_types_count = len(counts) - 1

            if other_types_count <= max_other_types:
                result.append((seq, counts[target_incident_type], counts))

    result_df = pd.DataFrame(result, columns=['Subsequence', 'Count', 'Incident Counts'])

    return result_df

# Incident types to scan, window lengths to try, and the minimum share of the
# target type's incidents that must contain a subsequence.
incident_types = [3, 6, 7, 16, 17]
seqlen = [3, 4, 5, 6, 7, 8, 9, 10]
minsup = 0.6

# Full sequences (`df`): for every window length and incident type, list the
# subsequences that meet minsup and appear in at most 5 other incident types.
for n in seqlen:
  for incident_type in incident_types:
    subsequences_df = find_subsequences_for_incident_type(df, incident_type, n=n, minsup = minsup, max_other_types=5)
    print(f"{n}-length Subsequences primarily occurring in {incident_type}, Minsup = {minsup}")
    for index, row in subsequences_df.iterrows():
      print(f"Subsequence: {row['Subsequence']}, Count: {row['Count']}, Incident Counts: {row['Incident Counts']}")


3-length Subsequences primarily occurring in 3, Minsup = 0.6
3-length Subsequences primarily occurring in 6, Minsup = 0.6
3-length Subsequences primarily occurring in 7, Minsup = 0.6
3-length Subsequences primarily occurring in 16, Minsup = 0.6
3-length Subsequences primarily occurring in 17, Minsup = 0.6
4-length Subsequences primarily occurring in 3, Minsup = 0.6
4-length Subsequences primarily occurring in 6, Minsup = 0.6
4-length Subsequences primarily occurring in 7, Minsup = 0.6
4-length Subsequences primarily occurring in 16, Minsup = 0.6
4-length Subsequences primarily occurring in 17, Minsup = 0.6
5-length Subsequences primarily occurring in 3, Minsup = 0.6
5-length Subsequences primarily occurring in 6, Minsup = 0.6
5-length Subsequences primarily occurring in 7, Minsup = 0.6
5-length Subsequences primarily occurring in 16, Minsup = 0.6
5-length Subsequences primarily occurring in 17, Minsup = 0.6
6-length Subsequences primarily occurring in 3, Minsup = 0.6
6-length Subsequen

In [147]:
# Same scan on the pre-incident window (`df_before_incident`).
# Relies on `seqlen`, `incident_types` and `minsup` set in the previous cell;
# the printed header is the same as for the full-sequence scan.
for n in seqlen:
  for incident_type in incident_types:
    subsequences_df = find_subsequences_for_incident_type(df_before_incident, incident_type, n=n, minsup = minsup, max_other_types=5)
    print(f"{n}-length Subsequences primarily occurring in {incident_type}, Minsup = {minsup}")
    for index, row in subsequences_df.iterrows():
      print(f"Subsequence: {row['Subsequence']}, Count: {row['Count']}, Incident Counts: {row['Incident Counts']}")

3-length Subsequences primarily occurring in 3, Minsup = 0.6
3-length Subsequences primarily occurring in 6, Minsup = 0.6
3-length Subsequences primarily occurring in 7, Minsup = 0.6
3-length Subsequences primarily occurring in 16, Minsup = 0.6
3-length Subsequences primarily occurring in 17, Minsup = 0.6
4-length Subsequences primarily occurring in 3, Minsup = 0.6
4-length Subsequences primarily occurring in 6, Minsup = 0.6
4-length Subsequences primarily occurring in 7, Minsup = 0.6
4-length Subsequences primarily occurring in 16, Minsup = 0.6
4-length Subsequences primarily occurring in 17, Minsup = 0.6
5-length Subsequences primarily occurring in 3, Minsup = 0.6
5-length Subsequences primarily occurring in 6, Minsup = 0.6
5-length Subsequences primarily occurring in 7, Minsup = 0.6
5-length Subsequences primarily occurring in 16, Minsup = 0.6
5-length Subsequences primarily occurring in 17, Minsup = 0.6
6-length Subsequences primarily occurring in 3, Minsup = 0.6
6-length Subsequen