In [257]:
import pandas as pd   
from scipy.spatial.distance import cosine as cosine_distance
import numpy as np
import datetime as dt
import pickle 


In [258]:
import pickle 

keyed_embeddings = None 
transaction_embeddings = None

with open('data\keyed_alarm_embeddings.pickle','rb') as f:
  keyed_embeddings = pickle.load(f)

with open('data/transaction_embeddings.pickle', 'rb') as f:
  transaction_embeddings = pickle.load(f)

In [259]:
class RCA:
  def __init__(self, distinct_alarms, transactions):
    self.transactions = {}
    for tid in transactions:
      self.transactions[tid] = transactions[tid]['ALR']
    self.transaction_count = float(len(transactions))

    self.alr_transction_probabilities = {}
    for alr in distinct_alarms:
      count = 0
      for tid in transactions:
        if alr in self.transactions[tid]:
          count += 1
      self.alr_transction_probabilities[alr] = count / self.transaction_count
      
  def _PofList(self, arl_list):
    d_list = list(set(arl_list))
    list_len = len(d_list)
    count = 0
    for tid in self.transactions:
      items_in_tid = 0
      for item in d_list:
        if item in self.transactions[tid]:
          items_in_tid += 1
      if items_in_tid == list_len:
        count +=1
    if count == 0:
      print(f'*******\n{d_list}\n*****')
    return count / self.transaction_count

  def ScoreTransaction(self, transaction):
    Parr = [self.alr_transction_probabilities[t] for t in transaction['ALR']]
    rca = {'EventId': transaction['EventId'], 'ALR': transaction['ALR']}
    Pcond =[]
    pAll = self._PofList(transaction['ALR'])
    for i, pA in enumerate(Parr):
      tCopy = [x for x in transaction['ALR']]
      tCopy.pop(i)
      pB = self._PofList(tCopy)
      if pB==0:
        Pcond.append(1.0e-26)
      else:
        Pcond.append(  pA * pAll/pB )

    rca['RCA_SCORE'] = Pcond
    return rca

  def ScoreCluster(self, alarm_list, alr_idx):
    if len(alarm_list) <= 1:
      return [1]
    Parr = []
    alrms = [t[alr_idx] for t in alarm_list]
    for alr in alrms:
      if alr not in self.alr_transction_probabilities:
        Parr.append(1.0e-26)
      else: 
        Parr.append(self.alr_transction_probabilities[alr])
    #print(Parr)
    Pcond =[]
    pAll = self._PofList( [t[alr_idx] for t in alarm_list] )
    for i, pA in enumerate(Parr):
      tCopy = [t[alr_idx] for t in alarm_list]
      #print(tCopy)
      tCopy.pop(i)
      pB = self._PofList(tCopy)
      if pB==0:
        Pcond.append(1.0e-26)
      else:
        Pcond.append(  pA * pAll/pB )
    return Pcond
    


In [265]:
alr_df = pd.read_csv('data/final_clustering.csv')
test_df = pd.read_csv('data/AlarmTestSet.csv')
node_centraliity_df = pd.read_csv('data/object_centrality.csv')



In [267]:
node_score = {}
for node, score in node_centraliity_df[['ObjectName','Centrality']].values:
  node_score[node] = score

In [263]:
test_df['EventDt'] = pd.to_datetime(test_df['EventDt'])
test_df['ClearDt'] = pd.to_datetime(test_df['ClearDt'])
test_df['EVENT_DT_epoch'] = (test_df['EventDt'] - dt.datetime(1970,1,1)).dt.total_seconds()
test_df['EVENT_CLR_epoch'] = (test_df['ClearDt']- dt.datetime(1970,1,1)).dt.total_seconds()

test_df.sort_values('EVENT_DT_epoch', inplace=True)

In [264]:
alr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12839 entries, 0 to 12838
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   EventId                 12839 non-null  int64  
 1   ObjectName              12839 non-null  object 
 2   EventDt                 12839 non-null  object 
 3   ClearDt                 12839 non-null  object 
 4   ScaledObjectCategoryId  12839 non-null  int64  
 5   ObjectCategory          12839 non-null  object 
 6   Severity                12839 non-null  object 
 7   SUBJECT                 12744 non-null  object 
 8   sentence                12839 non-null  object 
 9   EVENT_DT_epoch          12839 non-null  int64  
 10  EVENT_CLR_epoch         12839 non-null  int64  
 11  Duration                12839 non-null  float64
 12  ClusterLabel            12839 non-null  int64  
 13  TransactionClusterId    12839 non-null  int64  
 14  Unnamed: 0              12517 non-null

In [219]:

alr_list = list(set(alr_df['sentence'].values)) + list(set(test_df["sentence"].values))


data = []
for alr in alr_list:
  temp = []
  for j in word_tokenize(alr):
    temp.append(j.lower())
  data.append(temp)

In [220]:
embedding_model = gensim.models.Word2Vec(data, min_count=1, vector_size=300, window=5)


In [221]:
wordVecs = embedding_model.wv

In [222]:
data_vectors = []
for record in data:
  v = [wordVecs.get_vector(word) for word in record]
  v = np.array(v).mean(axis=0)

  data_vectors.append(v)


transaction_embeddings = {}
for i, record in enumerate(alr_df[['TransactionClusterId', 'sentence']].values):
  k = record[0]
  if k not in transaction_embeddings:
    transaction_embeddings[k] = []

  sent = record[1]
  target = [wordVecs.get_vector(word.lower()) for word in word_tokenize(sent)]
  target = np.array(target).mean(axis=0)
 
  transaction_embeddings[k].append(target)

In [223]:
for k in transaction_embeddings:
  transaction_embeddings[k] = np.array(transaction_embeddings[k]).mean(axis=0)

In [265]:
transction_ids = set(alr_df['TransactionClusterId'].values) 


number_of_transactions = len(transction_ids)

distinct_alarms = set(alr_df['SUBJECT'].values)

In [266]:
transaction_alarms = {}
for tid in transction_ids:
  transaction_alarms[tid] = {'EventId':[], 'ALR': []}
for eid, tid, alr in alr_df[['EventId','TransactionClusterId','SUBJECT']].values:
  transaction_alarms[tid]['EventId'].append(eid)
  transaction_alarms[tid]['ALR'].append(alr)

In [290]:
RcaAlgo = RCA(distinct_alarms, transaction_alarms)

In [227]:
def ClassifyAlarm(sentVec, embeddings):
  classification = None
  min = 9999
  for k in embeddings:
    d = cosine_distance(v, embeddings[k])
    if d < min:
      min = d
      classification = k
  return classification, d

In [228]:
def GetRca(clusterList, alr_idx, centrality_idx):
  scores = RcaAlgo.ScoreCluster(clusterList, alr_idx)
  for i,s in enumerate(scores):
    
  return scores

  

In [291]:
current_vec = []
current_cluster = []
prev_time = 0
prev_dist = 1
current_correlationId = 0
epsilon = 0.1
min_time = 7.0
clustered_df_dic = {'EventId': [], "ClusterId": [], "RCA_Score": [], "Class": [], "DistanceToClass":[]}
for  eventId, eventDt, ClearDt, obj, sentence, subj in test_df[["EventId","EVENT_DT_epoch","EVENT_CLR_epoch","ObjectName","sentence", 'SUBJECT']].values[0:100]:
  
  t_delta = eventDt - prev_time
  prev_time = eventDt
  
  obj_centrality = 0
  if obj in node_score:
    obj_centrality = node_score[obj]

  for v in [wordVecs.get_vector(word.lower()) for word in word_tokenize(sentence)]:
    current_vec.append(v)
  new_vec = np.array(v).mean(axis=0)
  
  k, d = ClassifyAlarm(new_vec, transaction_embeddings)
  current_cluster.append([eventId, subj, current_correlationId, k, d, obj_centrality])

  if d >  epsilon or t_delta > min_time:

    scores = GetRca(current_cluster, 1)

    for i, score in enumerate(scores):
      clustered_df_dic['EventId'].append(current_cluster[i][0])
      clustered_df_dic['ClusterId'].append(current_correlationId)
      clustered_df_dic['RCA_Score'].append(score)
      clustered_df_dic['Class'].append(current_cluster[i][3])
      clustered_df_dic['DistanceToClass'].append(current_cluster[i][4])

    current_correlationId += 1
    current_cluster = []
    current_timedeltas = []
    current_vec = []
  
  prev_dist = d
  #print(sentence, k, d, t_delta, current_correlationId)
  #print(eventId, eventDt, ClearDt, Severity, subject)
  

*******
['AccessInterfaceDown - LXAccessInterface', 'TunnelDown - Tunnel', 'SdpBindingDown - SpokeSdpBinding', 'LabelProblem - SpokeSdpBinding', 'InterfaceNeighborDown - Interface', 'ServiceSiteDown - Site', 'LspPathDown - LspPath', 'ScheduledPollerProblem', 'SdpBindingTunnelDown - SpokeSdpBinding', 'PowerSupplyRemoved - PowerSupplyTray']
*****
*******
['AccessInterfaceDown - LXAccessInterface', 'TunnelDown - Tunnel', 'SdpBindingDown - SpokeSdpBinding', 'LabelProblem - SpokeSdpBinding', 'InterfaceNeighborDown - Interface', 'ServiceSiteDown - Site', 'LspPathDown - LspPath', 'ScheduledPollerProblem', 'SdpBindingTunnelDown - SpokeSdpBinding', 'PowerSupplyRemoved - PowerSupplyTray']
*****
*******
['AccessInterfaceDown - LXAccessInterface', 'TunnelDown - Tunnel', 'SdpBindingDown - SpokeSdpBinding', 'LabelProblem - SpokeSdpBinding', 'InterfaceNeighborDown - Interface', 'ServiceSiteDown - Site', 'LspPathDown - LspPath', 'ScheduledPollerProblem', 'SdpBindingTunnelDown - SpokeSdpBinding', 'Powe

In [275]:
test_result_df =  pd.DataFrame(clustered_df_dic)

In [277]:
test_df.join(test_result_df.set_index('EventId'), on='EventId', how='inner').set_index('EventId').to_csv('data/test_result.csv')

In [294]:
RcaAlgo.alr_transction_probabilities['SdpBindingDown - SpokeSdpBinding']

0.08626919602529359

In [297]:
clster = [
['SdpBindingDown - SpokeSdpBinding'],
['AccessInterfaceDown - LXAccessInterface'],
['AccessInterfaceDown - LXAccessInterface'],
['TunnelDown - Tunnel'],
['SdpBindingTunnelDown - SpokeSdpBinding'],
['ServiceSiteDown - Site'],
['SdpBindingDown - SpokeSdpBinding'],
['SdpBindingTunnelDown - SpokeSdpBinding'],
['SdpBindingDown - SpokeSdpBinding'],
['LspPathDown - LspPath'],
['ServiceSiteDown - Site'],
['InterfaceNeighborDown - Interface'],
['SdpBindingDown - SpokeSdpBinding'],
['SdpBindingDown - SpokeSdpBinding'],
['LabelProblem - SpokeSdpBinding'],
['LabelProblem - SpokeSdpBinding'],
['PowerSupplyRemoved - PowerSupplyTray'],
]
#GetRca(clster, 0)
RcaAlgo._PofList([x[0] for x in clster])



0.00045167118337850043

In [68]:

cluster_ids = set(clstr_df['ClusterLabel'].values) 

cluster_alarms = {}
for tid in cluster_ids:
  cluster_alarms[tid] = {'EventId':[], 'ALR': []}
for eid, td, alr in clstr_df[['EventId','ClusterLabel','SUBJECT']].values:
  cluster_alarms[td]['EventId'].append(eid)
  cluster_alarms[td]['ALR'].append(alr)

In [75]:
rca = RcaAlgo.ScoreTransaction(cluster_alarms[0])
for tid in cluster_alarms:
  if tid == 0:
    continue
  next_rca = RcaAlgo.ScoreTransaction(cluster_alarms[tid])
  for key in next_rca:
    for value in next_rca[key]:
      rca[key].append(value)

KeyboardInterrupt: 

In [58]:
#print(transaction_alarms[255])
rca_df = pd.DataFrame(rca)

In [59]:
alr_df.join(rca_df.set_index('EventId'), on='EVENT_ID', lsuffix='_')[
  [
    "EVENT_ID",
    "EVENT_DT",
    "CLEAR_DT",
    "Domain",
    "ALARM_OBJ",
    "SEVERITY",
    "Description",
    "ClusterLabel",
    "TransactionClusterId",
    "RCA_SCORE"
    ]
  ].to_csv('data/clusteredWithRCA_2.csv')