In [2]:
import pandas as pd
import numpy as np
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
def fix_time(time):
    return (datetime.datetime.fromisoformat(time))

# Clustering

In [4]:
def prepare_dataframe_for_clustering(raw_df):
    df = raw_df.copy()
    df['combined_names'] = df['lifecycle:transition'] + ' + ' + df['concept:name']
    
    #Prepare the time between columns
    if type(df['time:timestamp'].iloc[0]) != datetime.datetime:
        df['time:timestamp'] = df['time:timestamp'].apply(fix_time)
    print(df['time:timestamp'].dtype)
    df["time:time_between"] = df["time:timestamp"].diff()
    df.loc[df['step_number'] == 0, 'time:time_between'] = pd.Timedelta(0)
    df["time:time_between"] = [int(x.total_seconds()) for x in df["time:time_between"]]
    
    #Prepare the next event column
    df["next_event"] = df["combined_names"]
    df.loc[df['step_number'] == 0, 'next_event'] = 'editor: close_case'
    df["next_event"] = df["next_event"].shift(-1)
    df.loc[len(df) - 1, 'next_event'] = 'editor: close_case'
    
    return df

In [5]:
def prepare_column_for_clustering(case_id_column, cluster_column, unique_values):
    grouped_df = pd.DataFrame({'case_id': case_id_column, 'column': cluster_column})
    
    for val in unique_values:
        grouped_df[val] = 0
        grouped_df.loc[grouped_df['column'] == val, val] = 1
    
    return grouped_df[['case_id'] + list(unique_values)]

In [6]:
def get_clusters(prepared_rows):
    kmeans = KMeans(n_clusters=NR_OF_CLUSTERS)
    
    unique_columns = list(prepared_rows.columns)
    unique_columns.pop(0)

    df_grouped = prepared_rows.groupby('case_id')[unique_columns].sum()
    kmeans.fit(df_grouped[unique_columns])
    prediction = kmeans.labels_
    unique_case_ids = prepared_rows['case_id'].unique()
    return prepared_rows.case_id.map({unique_case_ids[i]: prediction[i] for i in range(len(prediction))}), kmeans

In [7]:
def predict_clusters(cluster_model, prepared_rows, unique_columns):
    df_grouped = prepared_rows.groupby('case_id')[unique_columns].sum()
    prediction = cluster_model.predict(df_grouped[unique_columns])
    unique_case_ids = prepared_rows['case_id'].unique()
    return prepared_rows.case_id.map({unique_case_ids[i]: prediction[i] for i in range(len(prediction))})

In [8]:
df_train, df_validation, df_test = pd.read_csv('Datasets/trainExample.csv'), pd.read_csv('Datasets/validationExample.csv'), pd.read_csv('Datasets/testExample.csv')
cluster_train, cluster_test = prepare_dataframe_for_clustering(df_train), prepare_dataframe_for_clustering(df_test)

object
datetime64[ns, UTC+01:00]


In [9]:
NR_OF_CLUSTERS = 5

cluster_columns = ['concept:name']
assigned_columns = [cluster_columns[i] + ' cluster' for i in range(len(cluster_columns))]

cluster_train[cluster_columns[0]] = cluster_train[cluster_columns[0]].fillna(-1)
cluster_test[cluster_columns[0]] = cluster_test[cluster_columns[0]].fillna(-1)

unique_values = list(cluster_train[cluster_columns[0]].unique())
unique_values_test = cluster_test[cluster_columns[0]].unique()
unique_values.extend(x for x in unique_values_test if x not in unique_values)

prepared_train_data = prepare_column_for_clustering(cluster_train['case_id'], cluster_train[cluster_columns[0]], unique_values)
cluster_train[assigned_columns[0]], cluster_model = get_clusters(prepared_train_data)

prepared_test_data = prepare_column_for_clustering(cluster_test['case_id'], cluster_test[cluster_columns[0]], unique_values)
cluster_test[assigned_columns[0]] = predict_clusters(cluster_model, prepared_test_data, unique_values)

cluster_train['time:time_between'] = cluster_train['time:timestamp'].diff()
cluster_train.loc[cluster_train['step_number'] == 0, 'time:time_between'] = pd.Timedelta(0)

cluster_test['time:time_between'] = cluster_test['time:timestamp'].diff()
cluster_test.loc[cluster_test['step_number'] == 0, 'time:time_between'] = pd.Timedelta(0)

# Markov

In [10]:
def creating_dict_for_next_step_stats (df : pd.DataFrame, concept_name : str) -> dict:
    '''For an input action checks for all the possible next actions and counts their occurence'''
    
    thisdf = df.reset_index().copy()
    dic_occurrence = {}
    dic_total_time = {}
    ids = list(thisdf['case_id']) + ['editor: last id'] #Otherwise we check i+1-th position that does not exist
    times = list(thisdf['time:time_between']) + [pd.Timedelta(0)] #Otherwise we check i+1-th position that does not exist
    names = thisdf['concept:name']
    df_concept = thisdf[names == concept_name]
    
    for i, row in df_concept.iterrows():
        
        if (ids[i] == ids[i+1]): #an instance of the same case
            
            if (names[i+1] not in dic_occurrence):
                dic_occurrence[names[i+1]] = 1
                dic_total_time[names[i+1]] = times[i+1]
            else:
                dic_occurrence[names[i+1]] += 1
                dic_total_time[names[i+1]] += times[i+1]
                
        else: #the last instance of the case
            
            if ('editor: close_case' not in dic_occurrence):
                dic_occurrence['editor: close_case'] = 1
                dic_total_time['editor: close_case'] = times[i+1]
            else:
                dic_occurrence['editor: close_case'] += 1
                dic_total_time['editor: close_case'] += times[i+1]
    
    #Compute average time
    dic_avg_time = {}
    
    for key in dic_total_time:
        dic_avg_time[key] = dic_total_time[key] / dic_occurrence[key]
        
    return(dic_occurrence, dic_avg_time)

In [11]:
names = cluster_train['concept:name'].unique().tolist() + ['editor: close_case']
shape = len(names)

transitionMatrix = []
for cluster in range( NR_OF_CLUSTERS ):
    transitionMatrix.append( np.zeros( ( shape, shape ) ) )

    for name in names:
        nextSteps = creating_dict_for_next_step_stats(cluster_train, name)[0]
        sumOfSteps = sum(nextSteps.values())

        for step in nextSteps:
            x = names.index(name)
            y = names.index(step)

            probability = nextSteps[step] / sumOfSteps

            transitionMatrix[cluster][x, y] = probability
    
    print( cluster )


0
1
2
3
4


In [12]:
print('Names: ', names, '\n\nMatrix: \n', transitionMatrix[3])

Names:  ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'W_Completeren aanvraag', 'O_SELECTED', 'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_SENT_BACK', 'W_Valideren aanvraag', 'A_APPROVED', 'O_ACCEPTED', 'A_ACTIVATED', 'A_ACCEPTED', 'O_CANCELLED', 'A_DECLINED', 'A_PREACCEPTED', 'A_CANCELLED', 'W_Afhandelen leads', 'O_DECLINED', 'W_Nabellen incomplete dossiers', 'A_REGISTERED', 'W_Beoordelen fraude', 'W_Wijzigen contractgegevens', 'editor: close_case'] 

Matrix: 
 [[0.00000000e+00 8.01249512e-01 1.28855916e-02 0.00000000e+00
  1.30157491e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 2.60314981e-04 0.00000000e+00 4.51646492e-02
  5.85708708e-02 0.00000000e+00 6.91136275e-02 0.00000000e+00
  0.00000000e+00 0.00000000e+00 3.90472472e-04 0.00000000e+00
  1.22348041e-02]
 [0.00000000e+00 0.00000000e+00 6.81493042e-02 0.00000000e+00
  1.30055924e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000

In [20]:
def predictNext(current, cluster):
    
    nextEvent = np.random.choice(a = names, p = transitionMatrix[cluster][names.index(current)])
    
    return nextEvent

O_CREATED
O_SELECTED
O_SELECTED
O_CREATED
O_SELECTED


In [None]:
cluster_train[cluster_train['concept:name cluster'] == 5]
df_prediction['concept:name']

In [None]:
for counter in range(NR_OF_CLUSTERS):
    df_prediction = cluster_train[cluster_train['concept:name cluster'] == counter].reset_index()
    
    nextEvents = []

    for i in range(df_prediction.shape[0]):
        nextEvents.append(predictNext(df_prediction['concept:name'][i], counter))

    df_prediction['Prediction'] = nextEvents

    #Prepare the next event column
    df_prediction['next_event'] = df_prediction['concept:name']
    df_prediction.loc[df_prediction['step_number'] == 0, 'next_event'] = 'editor: close_case'
    df_prediction['next_event'] = df_prediction['next_event'].shift(-1)
    df_prediction.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

    df_prediction.head()
    
    accuracy = round((len(df_prediction[df_prediction['next_event'] == df_prediction['Prediction']])/len(df_prediction))*100, 1)
    print('accuracy cluster ' + str(counter) + ': ' + str(accuracy) + '%')