In [None]:
import os
import pandas as pd

In [None]:
from statistics import median

import plotly.graph_objects as go

def transition_to_nodes_and_links_v2(input_df, name_label='name', count_label='count', times_label='times'):
    """Turn journey rows into the node and link lists of a Sankey diagram.

    Every state of a journey is suffixed with its distance from the end of
    the journey (``' -0'`` for the final state, ``' -1'`` for the one before
    it, ...), so the same state at different distances is a different node.
    Consecutive states form a transition; transitions shared by several rows
    have their counts summed and their timings pooled.

    Args:
        input_df: DataFrame with one row per distinct journey. It is not
            modified, and neither are the lists stored in it.
        name_label: Column holding the journey, an indexable sequence of
            state names.
        count_label: Column holding how many subjects followed the journey.
        times_label: Column holding a sequence whose ``i``-th entry is an
            iterable of timings for the ``i``-th transition.
            NOTE(review): ``i`` counts transitions from the END of the
            journey (``i == 0`` is the transition into the final state);
            confirm the column is ordered that way by the caller.

    Returns:
        ``(nodes, links)``: ``nodes`` is the list of unique suffixed state
        labels in first-seen order; ``links`` is a list of dicts with keys
        ``source`` (earlier state), ``target`` (later state), ``value``
        (summed count) and ``median`` (median of the pooled timings).

    Raises:
        IndexError: if a row has fewer timing entries than transitions.
        statistics.StatisticsError: if a transition has no timings at all.
    """
    transition_to_metadata = {}

    rows = zip(input_df[name_label], input_df[count_label], input_df[times_label])
    for journey, count, times in rows:
        # Label states from the end of the journey backwards:
        # last state -> ' -0', previous -> ' -1', ...
        last = len(journey) - 1
        labelled = [journey[i] + ' -' + str(last - i) for i in range(last, -1, -1)]

        for i, transition in enumerate(zip(labelled, labelled[1:])):
            metadata = transition_to_metadata.setdefault(transition, {"count": 0, "times": []})
            metadata["count"] += count
            # extend() copies the values, so the lists stored in input_df are
            # never aliased and never mutated by later rows.
            metadata["times"].extend(times[i])

    # dict.fromkeys de-duplicates while keeping a reproducible (first-seen) order.
    nodes = list(dict.fromkeys(
        [later for later, _ in transition_to_metadata]
        + [earlier for _, earlier in transition_to_metadata]
    ))
    # A transition key is (later state, earlier state); the link flows earlier -> later.
    links = [{'source': earlier, 'target': later,
              'value': metadata["count"], "median": median(metadata["times"])}
             for (later, earlier), metadata in transition_to_metadata.items()]
    return nodes, links

In [None]:
# Load the timeline export and discard rows that are exact duplicates.
timeline_df = pd.read_csv("../output/final_output.csv").drop_duplicates()
timeline_df

In [None]:
# Keep events whose exact_timing is below -1, or strictly between 1 and 17520.
# NOTE(review): exact_timing is presumably in hours (17520 h = 730 days), so
# this would drop events within an hour of the reference point — confirm.
timeline_exclude_minutes_df = timeline_df.loc[
    (timeline_df["exact_timing"] < -1)
    | ((timeline_df["exact_timing"] > 1) & (timeline_df["exact_timing"] < 17520))
]
timeline_exclude_minutes_df

In [None]:
# Number of distinct uids left after the timing filter (a missing uid counts once).
timeline_exclude_minutes_df["uid"].nunique(dropna=False)

In [None]:
import numpy as np

# Step 1: collapse each uid to one row whose columns hold tuples of its events.
# Step 2: group identical category tuples (journeys), collecting the uids and
#         per-uid timing tuples of everyone who followed that journey.
timeline_grouped_df = (
    timeline_exclude_minutes_df
    .groupby("uid").agg(tuple).reset_index()
    .groupby("category").agg(list).reset_index()
    # Rename by column name rather than by position so the labels cannot be
    # attached to the wrong columns if the column order ever differs.
    .rename(columns={"category": "journey", "exact_timing": "times"})
)
# One timing tuple per uid, so the list length is the number of subjects.
timeline_grouped_df["cnt"] = timeline_grouped_df["times"].apply(len)

def fix_journey(row):
    """Return the row's journey, guaranteed to end with "Death of victim".

    ``row["journey"]`` is expected to be a non-empty tuple of state names.
    When its last state is already "Death of victim" it is returned as is;
    otherwise a new tuple with that state appended is returned.
    """
    steps = row["journey"]
    if steps[-1] == "Death of victim":
        return steps
    return steps + ("Death of victim",)
    
def fix_times(row):
    """Normalise the per-subject timelines of one journey row.

    ``row["times"]`` is a sequence of timelines, one per subject, each a
    sequence of numeric timings. Every positive timing is flipped to its
    negative; non-positive timings are kept. Each timeline is returned as a
    new list, so the input is left untouched and the result is uniform.

    When ``row["journey"]`` is non-empty and does not end with
    "Death of victim", a timing of ``0`` is appended to EACH timeline so the
    timelines stay aligned with a journey that gets that final state added.
    This check reads the journey as it is when this function runs, so it only
    fires if the journey has not already been extended (e.g. by
    ``fix_journey``).

    Returns:
        list[list]: one list of timings per subject.
    """
    new_times = [[-x if x > 0 else x for x in time] for time in row["times"]]
    # if the journey does not contain Death of victim as the final state, then add it
    if row["journey"] and row["journey"][-1] != "Death of victim":
        # Append per timeline: adding a bare 0 to the outer list would create a
        # ragged structure (a scalar next to sequences) that cannot be transposed.
        return [time + [0] for time in new_times]
    return new_times

# Order matters: fix_journey runs first, so every journey already ends with
# "Death of victim" by the time fix_times reads it.
# NOTE(review): as a consequence fix_times' own "journey lacks final death"
# check cannot fire here — confirm that is intended.
timeline_grouped_df["journey"] = timeline_grouped_df.apply(fix_journey, axis=1)
timeline_grouped_df["times_fixed"] = timeline_grouped_df.apply(fix_times, axis=1)
# Transpose from one timeline per subject to one list per event position.
# This overwrites "times", so re-running this cell without re-running the
# grouping cell would transpose already-transposed data.
timeline_grouped_df["times"] = timeline_grouped_df["times_fixed"].apply(lambda x: np.transpose(x).tolist())

In [None]:
# Most common journeys first.
timeline_grouped_df = timeline_grouped_df.sort_values(by="cnt", ascending=False)

In [None]:
# Display the grouped journeys table.
timeline_grouped_df

In [None]:
# Copy of the grouped table in which each journey is cut to its last 4 states.
# NOTE(review): only "journey" is shortened; "times" still holds the entries
# of the full journey — confirm the two stay aligned for downstream use.
timeline_grouped_trunc_4 = timeline_grouped_df.assign(
    journey=timeline_grouped_df["journey"].apply(lambda steps: steps[-4:])
)

In [None]:
nodes, links = transition_to_nodes_and_links_v2(
    timeline_grouped_trunc_4, name_label="journey", count_label='cnt')

# Sankey's source/target/label/value are parallel arrays: entry i of each one
# describes link i. Filter the links once, up front, so all four arrays are
# built from the same links (filtering only `value` shifts the values onto
# the wrong links).
shown_links = [link for link in links if link["value"] > 30]
# Node label -> position in `nodes`, so each lookup is O(1).
node_position = {label: position for position, label in enumerate(nodes)}

fig = go.Figure(data=[go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color='black', width=0.5), label=nodes),
    link=dict(source=[node_position[link['source']] for link in shown_links],
              target=[node_position[link['target']] for link in shown_links],
              # NOTE(review): dividing by 24 presumes the timings are in hours — confirm.
              label=[f'Median days for transition: {link["median"] / 24}' for link in shown_links],
              value=[link['value'] for link in shown_links])
)])
fig.show()

In [None]:
# Journeys that contain at least one "History of suicide attempt" state.
timeline_grouped_df.loc[
    timeline_grouped_df["journey"].apply(
        lambda steps: "History of suicide attempt" in steps
    )
]

In [None]:
# Timings of journeys that are exactly one suicide attempt followed by death.
# Each journey is compared to the target tuple individually (as the other
# cells do), instead of relying on `Series == tuple` treating the tuple as a
# single value rather than as a list-like to compare position by position.
one_attempt_list = timeline_grouped_df[
    timeline_grouped_df["journey"].apply(
        lambda journey: journey == ("History of suicide attempt", "Death of victim")
    )
].times.tolist()
# flatten one_attempt_list (arbitrarily nested lists -> one flat list)
def flatten(S):
    """Return a new flat list with the leaf items of the nested list ``S``.

    Only ``list`` instances are descended into; any other item (tuples
    included) is kept as a leaf. Order is preserved. The traversal is
    iterative, so the length of the input is not limited by the interpreter's
    recursion limit and the cost is linear in the number of items.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    stack = [iter(S)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes after this sub-list.
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # Current list exhausted: go back to its parent.
            stack.pop()
    return flat

# Replace the nested per-journey lists with one flat list of timings.
one_attempt_list = flatten(one_attempt_list)

In [None]:
import matplotlib.pyplot as plt

# Histogram of whole days (|timing| // 24), restricted to 60..720 days.
plt.hist([
    days
    for days in (abs(timing) // 24 for timing in one_attempt_list)
    if 60 <= days <= 720
])
plt.title('Days until suicide when SuicideAttempt = 1')
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Timings of journeys with MORE THAN ONE recorded suicide attempt, matching
# the variable name and the plot title (an `== 2` filter would silently drop
# journeys with three or more attempts).
many_attempt_list = timeline_grouped_df[
    timeline_grouped_df["journey"].apply(
        lambda journey: journey.count("History of suicide attempt") > 1
    )
].times.tolist()
many_attempt_list = flatten(many_attempt_list)

# Histogram of whole days (|timing| // 24), restricted to 30..720 days.
plt.hist([abs(x) // 24 for x in many_attempt_list if 30 <= abs(x) // 24 <= 720])
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.title('Days until suicide when SuicideAttempt > 1')
plt.show()

In [None]:
# Journeys whose last three states are two suicide attempts followed by death.
timeline_grouped_df.loc[
    timeline_grouped_df["journey"].apply(
        lambda steps: steps[-3:] == (
            "History of suicide attempt",
            "History of suicide attempt",
            "Death of victim",
        )
    )
]