In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import datetime
from datetime import timedelta
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import apriori
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.min_rows', 30)
pd.set_option('display.max_rows',150)
pd.set_option('display.width', 1000)

# Useful classes to have easier access to data features
class ColumnsInput:
    barcode = 'ddc_barcode'
    ipcode = 'ddc_ipcode'
    machine = 'ddc_mch_code'
    machine_side = 'ddc_mch_side'
    event = 'ddc_ev_subcode'
    time = 'ddc_ev_timestamp'
    
    
class ColumnsOutput:
    c_machine = 'c_machine' 
    event_delta_time = 'event_delta_time'
    day = 'y-m-day'
    month = 'month'

In [2]:
df=pd.read_csv("Data/aborted_jobs_2021.csv", low_memory=False)

#### Analysis of frequent itemset to understand patterns

In [3]:
# limit search on top ten days
# top_3_days = [{'y-m-day': '22-11-12', 'N_tyres_mean': 98.92708333333333, 'index': 24}, {'y-m-day': '22-11-11', 'ddc_barcode': 97.01041666666667, 'index': 23}]
# top_3_days_df_events = []
# for day_ in top_3_days:
#     top_3_days_df_events.append(df_.loc[df_[ColumnsOutput.day] == day_['y-m-day']])

# df = pd.concat(top_3_days_df_events, ignore_index=True)
# print(df.shape)


In [4]:

events_per_barcode = df.groupby([ColumnsInput.barcode])[ColumnsInput.event].unique().apply(list).reset_index()
events_per_barcode

Unnamed: 0,ddc_barcode,ddc_ev_subcode
0,9435412532,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
1,9435412677,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
2,9435412803,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
3,9435412804,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
4,9435413082,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
5,9435413128,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
6,9435413299,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
7,9435413377,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
8,9435413437,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."
9,9435413473,"[LO_LOADER_IN_PRESS, LO_BLADDER_PRESHAPING, LO..."


In [5]:
del df

data_to_pattern = events_per_barcode[ColumnsInput.event].tolist()
data_to_pattern = data_to_pattern[:10]

In [6]:
import numpy as np
from collections import defaultdict
from itertools import combinations

'''
Hash function that defines a permutation for minHash

Arguments:
  x: integer number corresponding to the input to the hash function, i.e. the
    original permutation index(1..n) 
  a: integer number defining the first component of the linear equation
  b: integer number defining the second component of the linear equation 
  c: integer number defining the third component of the linear equation, 
    this parameter guarantees that the result of the hash function is less 
    than c. Generally this number should be a prime number to obtain more 
    evenly distributed permutations, however we generally set this to the 
    number of rows/cols in the utility matrix.
Returns:
  A random permutation of the index x in the range 0-c
'''
# hash function that defines a permutation
def h(x, a, b, c):
  return (a*x+b) % c


'''
This is a modified version of the minHash algorithm adapted to work with 
matrices of integers(not only 0s and 1s). This works by using a modified 
version of the Jaccard similarity that treats a value >= T as 1, 0 otherwise.

Arguments:
  utility_matrix: the utility matrix as a numpy matrix
  k: the number of hash functions for MinHash, corresponding to the number of 
    rows in the signature matrix
  T: the threshold above which the ith entry in the set is considered 1, 
    otherwise 0
Returns:
  The signature matrix with k rows
'''
def minHash(utility_matrix, k, T):
  rows, cols = utility_matrix.shape
  signature_matrix = np.full((k, cols), 0)

  # these hash functions simply generate random perturbations: instead of manually
  # creating random vectors of perturbation it is better to use hash functions 
  # that returns the perturbation in the rows (See chapter 3.3.5 of the book)
  hash_funs = np.random.randint(1, rows, (k,2)) 

  for item in range(cols):
    for hash_i in range(k):
      for user in range(rows):
        random_index = h(user, hash_funs[hash_i][0], hash_funs[hash_i][1], rows)
        if utility_matrix[random_index,item] >= T:
          signature_matrix[hash_i,item] = user+1
          break
  
  return signature_matrix


'''
This is the simHash algorithm implementation.

Arguments:
  utility_matrix: the utility matrix as a numpy matrix
  hyperplanes: the number of random hyperplanes to generate with simHash. This 
    number corresponds to the number of rows that the signature matrix will 
    have
Returns:
  The signature matrix with "hyperplanes" rows
'''
def simHash(utility_matrix, hyperplanes):
  utility_processed = utility_matrix - 50
  utility_matrix[utility_matrix==-51] = 0   # -51 corresponds to the missing values

  rows, cols = utility_matrix.shape
  signature_matrix = np.full((hyperplanes, cols), 0)

  # plane's orthogonal vector with components -0.9,0,0.9
  # random_hyperplanes = np.random.choice([-0.9,0,0.9], size=(hyperplanes, rows)) 
  random_hyperplanes = np.random.uniform(low=-1, high=1, size=(hyperplanes, rows)) 

  for item in range(cols):
    for hyperplane in range(hyperplanes):
      dot_product = np.dot(utility_matrix[:,item], random_hyperplanes[hyperplane])
      if dot_product >= 0:
        signature_matrix[hyperplane, item] = 1
      else:
        signature_matrix[hyperplane, item] = 0
  
  return signature_matrix

'''
This function runs LSH on the signature matrix to find the similar items

Arguments:
  signature_matrix: the signature matrix as a numpy matrix
  bands: the number of bands into which to divide the signature matrix rows(you
    can specify wither "band" or rows_per_band)
  rows_per_band: the number of rows for each band(you can specify wither "band" 
    or rows_per_band)
  
Returns:
  The signature matrix with k rows
'''
def lsh(signature_matrix, bands=None, rows_per_band=None):
  rows, cols = signature_matrix.shape
  if bands:
    r = int(rows/bands)
  else:
    r = rows_per_band

  hashbuckets = defaultdict(set)

  for band_i in range(int(rows/r)): # for each band
    for query in range(cols):
      sliced_query = signature_matrix[band_i*r:(band_i+1)*r, query]
      band_id = tuple(sliced_query.tolist()+[str(band_i)])
      hashbuckets[band_id].add(query)

  # compute the similar items
  similar_items = defaultdict(set)
  for bucket in hashbuckets.values():
    if len(bucket) > 1:
      for pair in combinations(bucket, 2):
        similar_items[pair[0]].add(pair[1])
        similar_items[pair[1]].add(pair[0])

  return similar_items

In [7]:
tr = TransactionEncoder()
tr_arr = tr.fit(data_to_pattern ).transform(data_to_pattern)
data_to_pattern  = pd.DataFrame(tr_arr, columns=tr.columns_)
# raplacing True = 1 and False = 0
data_to_pattern  = data_to_pattern .astype(int)
data_to_pattern 

Unnamed: 0,CL_PRESS_DOWN,CL_PRESS_DOWN_2,CL_PRESS_DOWN_3,CL_PRESS_LOCK,CL_SQUEEZE_ON,CL_STOP_PAUSE,CL_STOP_PAUSE_2,CL_UNLOCK_PRESS,CURING_OFF,CURING_ON,...,UN_CLOSE_ARMS,UN_LMR_TCR_VACUUM,UN_OPEN_ARMS,UN_SWING_IN_ARMS,UN_SWING_POSITION,UN_TCR_UP,UN_TCR_UP_NO_SUV,UN_UNLOADER_DOWN,UN_UNLOADER_OUT,UN_UNLOADER_UP
0,1,1,1,1,1,1,1,1,1,1,...,0,1,0,1,1,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7,1,1,1,1,1,1,1,1,1,1,...,0,1,0,0,1,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,...,0,1,0,0,1,0,0,0,0,0
9,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1


In [8]:

# frequent_itemsets = apriori(data_to_pattern , min_support = 0.6, use_colnames = True)
# frequent_itemsets

simhash = simHash(data_to_pattern.to_numpy(), 1000)
print(simhash)

lsh_ = lsh(simhash, bands=1000)
similar_items = pd.DataFrame.from_dict(lsh_, orient='index')
print(similar_items)


[[1 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]
 ...
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]
 [0 0 0 ... 1 1 1]]
    0   1   2   3   4   5   6   7   8   9   ...  28  29  30  31  32  33  34  35  36  37
0    1   2   3   4   5   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
1    0   2   3   4   5   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
2    0   1   3   4   5   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
3    0   1   2   4   5   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
4    0   1   2   3   5   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
5    0   1   2   3   4   6   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
6    0   1   2   3   4   5   7   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
7    0   1   2   3   4   5   6   8   9  10  ...  29  30  31  32  33  34  35  36  37  38
8    0   1   2   3   4   5   6   7   9  10  ...  29  30  31  32  33  34  35  36  37  38


In [9]:
similar = apriori(data_to_pattern, min_support=0.6, use_colnames=True, verbose=1)
similar

Processing 380570190 combinations | Sampling itemset size 11