# Modified Jaccard Similarity

## Imports

In [None]:
import pandas as pd
import posixpath
from data_mining_project import data, preprocessing, model_js, validation, PROJECT_PATH, DATA_PATH, OUTPUT_PATH
import numpy as np
import matplotlib as plt
import plotly.express as px
import time
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## Load Data

In [None]:
# Build the path to the preprocessed CSV inside the project's output directory.
file_name = "preprocessed_data.csv"
file_path = posixpath.join(OUTPUT_PATH, file_name)

# Load it through the project's CSV loader and preview the first five rows.
data_df = data.load_data_csv(file_path)
data_df.head()

In [None]:
# Columns removed before modelling; only the remaining columns are passed on
# to the steps below.
unused_columns = [
    "ac_dc_prob_num",
    "ac_dc_prob",
    "seconds_to_incident_sequence",
    "dj_ac_state_sequence",
    "dj_dc_state_sequence",
    "train_kph_sequence",
    "ac_dc_prob_timestamp",
]
data_df.drop(columns=unused_columns, inplace=True)

# Convert the events_sequence column via the project helper
# (presumably string-encoded sequences -> lists of int; confirm in
# data.reformat_str_to_list).
data_df = data.reformat_str_to_list(
    data_df,
    cols=["events_sequence"],
    col_type=int,
)
data_df.head()

## Hyperparameter Tuning
### Run the model over different values of t, the relevance threshold used to filter out irrelevant events

In [None]:
# Sweep the relevance threshold t over 0.1, 0.2, ..., 1.9 and evaluate the
# model once per value, collecting a confusion matrix and an average F1 score.
ts = np.arange(1, 20) / 10
f1_scores = []
confusion_matrices = []
for t in ts:
    # Filter out events deemed irrelevant at the current threshold.
    filtered_data_df = preprocessing.filter_irrelevant_events(data_df, t)

    # Keep only rows whose event sequence passes remove_short_rows with x=2
    # (presumably a minimum-length check; confirm in preprocessing).
    keep_mask = filtered_data_df["events_sequence"].apply(
        lambda row: preprocessing.remove_short_rows(row, x=2)
    )
    filtered_data_df = filtered_data_df[keep_mask].reset_index(drop=True)

    # The last column is used as the target; all preceding columns are inputs.
    XY = filtered_data_df.to_numpy()
    X, Y = XY[:, :-1], XY[:, -1]
    print(X.shape, Y.shape)
    print(f"relevance threshold: {t}")

    # Evaluate the model and keep its confusion matrix for this threshold.
    conf_mat = model_js.loo_js(X, Y)
    confusion_matrices.append(conf_mat)

    f1_score, f1_score_per_class = validation.compute_f1_score(conf_mat)
    print(f"Average f1_score: {f1_score}")
    f1_scores.append((t, f1_score))
    validation.plot_confusion_matrix(conf_mat, t)

# Convert the collected (t, f1_score) pairs into an array for plotting.
f1_scores = np.array(f1_scores)

## Plot F1-Scores

In [None]:
# Plot the average F1 score against the relevance threshold t
# (f1_scores holds one (t, f1_score) entry per threshold evaluated above).
validation.plot_f1_scores(f1_scores)