In [125]:
import json
import os
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
from sklearn.cluster import DBSCAN
from collections import defaultdict
from numpy.linalg import norm

In [126]:
def load_dataset(path_data):
    """Load a JSON dataset and pull out the 'processed' field of every record.

    Args:
        path_data: path of the JSON file to read. The file is expected to
            hold an iterable of records that each expose a 'processed' key.

    Returns:
        A tuple ``(samples, contents)``: ``samples`` is the list of
        ``record['processed']`` values in file order, ``contents`` is the
        parsed JSON document itself.
    """
    with open(path_data, 'r') as handle:
        records = json.load(handle)
    # end

    samples = []
    for record in records:
        samples.append(record['processed'])
    # end

    return samples, records
# end

def write_output(path_output, contents):
    """Serialize ``contents`` to JSON and write it to the file ``path_output``.

    The parent folder of ``path_output`` is created first when it does not
    exist yet. An existing file at ``path_output`` is overwritten.

    Args:
        path_output: path of the JSON file to write.
        contents: any JSON-serializable object.

    Raises:
        TypeError: if ``contents`` cannot be serialized by ``json.dumps``.
        OSError: if the folder or the file cannot be created.
    """
    # Serialize before touching the file system, so a non-serializable
    # payload does not leave an empty/truncated file behind.
    payload = json.dumps(contents)

    # Create the *parent* folder, not the file path itself: creating a folder
    # named like the output file would make the open() below fail.
    folder_output = os.path.dirname(path_output)
    if folder_output:
        os.makedirs(folder_output, exist_ok=True)
    # end

    with open(path_output, 'w') as file:
        file.write(payload)
    # end
# end


def cosine(v_1, v_2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``v_1`` and ``v_2`` divided by the product
    of their norms. No special handling is done for zero-norm inputs.
    """
    dot_product = np.dot(v_1, v_2)
    scale = norm(v_1) * norm(v_2)
    return dot_product / scale
# end

In [17]:
def main_embed(embedder, folder_source, folder_target, version_current, type_current):
    """Load one dataset split and embed each of its samples.

    The split is read from ``<folder_source>/<version_current>_<type_current>.json``
    through ``load_dataset``.

    Args:
        embedder: object exposing ``encode(sample)``; it is called once per sample.
        folder_source: folder that holds the dataset file.
        folder_target: not used by this function (nothing is written to disk);
            kept so existing callers keep working.
        version_current: version prefix of the dataset file name.
        type_current: split suffix of the dataset file name (the notebook
            passes 'train' and 'test').

    Returns:
        A tuple ``(embeddings, contents)``: ``embeddings`` is a list with one
        ``embedder.encode`` result per sample, in the same order as
        ``contents``, which is the parsed dataset returned by ``load_dataset``.
    """
    filename_source = f'{version_current}_{type_current}.json'
    path_file_source = os.path.join(folder_source, filename_source)

    samples, contents = load_dataset(path_file_source)
    print(f'{path_file_source} loaded {len(samples)}')

    # One encode call per sample keeps the result a plain Python list, which
    # the notebook later concatenates with `+`.
    embeddings = [embedder.encode(sample) for sample in samples]
    return embeddings, contents
# end

In [18]:
# Folder holding the '<version>_<type>.json' dataset files read by main_embed.
folder_source = 'data'
# Passed to main_embed as its folder_target argument.
folder_target = 'output'
# Version prefix of the dataset file names (e.g. data/202206171000_train.json).
version_base = 202206171000

In [19]:
# Wrap the plain 'distilbert-base-uncased' checkpoint as a sentence embedder.
# As the log below reports, it is not a packaged sentence-transformers model,
# so the library builds one on the fly with MEAN pooling.
embedder = SentenceTransformer('distilbert-base-uncased')

No sentence-transformers model found with name /home/jovyan/.cache/torch/sentence_transformers/distilbert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/jovyan/.cache/torch/sentence_transformers/distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Embed the 'train' split: one embedding per sample plus the parsed records.
embeddings_train, contents_train = main_embed(embedder, folder_source, folder_target, version_base, 'train')

data/202206171000_train.json loaded 555


In [21]:
# Embed the 'test' split: one embedding per sample plus the parsed records.
embeddings_test, contents_test = main_embed(embedder, folder_source, folder_target, version_base, 'test')

data/202206171000_test.json loaded 84


In [23]:
# Concatenate both splits (train entries first) so that records and
# embeddings stay aligned by position.
contents_all = contents_train + contents_test
embeddings_all = embeddings_train + embeddings_test

# Record ids of each split, as sets.
ids_train = {record['id'] for record in contents_train}
ids_test = {record['id'] for record in contents_test}

In [155]:
# Record id -> embedding, built from the position-aligned lists of both splits.
# If an id occurs more than once, the last occurrence wins.
dict_id_embedding = {
    record['id']: vector
    for record, vector in zip(contents_all, embeddings_all)
}

In [117]:
# Cluster all embeddings (train + test together) with DBSCAN.
# eps was tuned by hand; the original note gives the workable range.
# NOTE(review): with min_samples=1 every sample should be assigned to some
# cluster (no -1 noise label) -- confirm against the DBSCAN documentation.
aggregator = DBSCAN(eps=1.12, min_samples=1) # 1.12~1.15 (for 1620)
# One cluster label per entry of embeddings_all, in the same order.
results_all = aggregator.fit_predict(embeddings_all)

In [139]:
# Lookup tables derived from the clustering result:
#   dict_id_content  : record id     -> full record
#   dict_id_group    : record id     -> cluster label
#   dict_cluster_ids : cluster label -> set of record ids
#   dict_cluster_raw : cluster label -> list of '[id]: raw text' strings
dict_id_content = {}
dict_id_group = {}
dict_cluster_ids = defaultdict(set)
dict_cluster_raw = defaultdict(list)

for content, result in zip(contents_all, results_all.tolist()):
    id_content = content['id']

    dict_id_content[id_content] = content
    dict_id_group[id_content] = result

    dict_cluster_ids[result].add(id_content)
    dict_cluster_raw[result].append(f'[{id_content}]: {content["raw"]}')
# end

In [142]:
## this is to display training dataset
# results_all
# dict_id_group[1613]
# print('\n\n\n'.join(dict_cluster_raw[158]))
# print('\n\n\n'.join(dict_cluster_raw[dict_id_group[1690]]))

[1690]: 2022-05-16 14:44:39,016 | Failed at Play [deploy_vm_efi_paravirtual_vmxnet3] *********
2022-05-16 14:44:39,016 | TASK [Try to ping IP] **************************************
task path: /home/worker/workspace/Ansible_RHEL_8.x_MAIN_PARAVIRTUAL_VMXNET3_EFI/ansible-vsphere-gos-validation/common/vm_wait_ping.yml:14
fatal: [localhost]: FAILED! => non-zero return code when ping
2022-05-16 14:45:11,016 | TASK [Testing exit due to failure] *************************
task path: /home/worker/workspace/Ansible_RHEL_8.x_MAIN_PARAVIRTUAL_VMXNET3_EFI/ansible-vsphere-gos-validation/common/test_rescue.yml:55
fatal: [localhost]: FAILED! => Exit testing when 'exit_testing_when_fail' is set to True in test case deploy_vm_efi_paravirtual_vmxnet3


[1691]: 2022-05-17 19:27:19,017 | Failed at Play [deploy_vm_efi_paravirtual_vmxnet3] *********
2022-05-17 19:27:19,017 | TASK [Try to ping IP] **************************************
task path: /home/worker/workspace/Ansible_Cycle_Photon_4.x_Update/ansible-

In [133]:
# (id, embedding) pairs per split.
# The ids are read from contents_* -- the lists that are position-aligned with
# embeddings_* -- and not from the ids_* sets: a set has no guaranteed
# iteration order and drops duplicates, so zipping it with the embeddings
# could attach an id to the wrong vector.
corpuss_id_embedding_train = [
    (content['id'], embedding)
    for content, embedding in zip(contents_train, embeddings_train)
]
corpuss_id_embedding_test = [
    (content['id'], embedding)
    for content, embedding in zip(contents_test, embeddings_test)
]

# test id -> {train id -> cosine similarity}; filled by the next cell.
dict_sim_test_train = {}

In [134]:
# Fill dict_sim_test_train[test id][train id] with the cosine similarity of
# every (test, train) embedding pair.
for id_test, embedding_test in corpuss_id_embedding_test:
    for id_train, embedding_train in corpuss_id_embedding_train:
        # setdefault creates the per-test dict on first use only, so an
        # already existing entry is extended rather than replaced.
        sims_of_test = dict_sim_test_train.setdefault(id_test, {})
        sims_of_test[id_train] = cosine(embedding_test, embedding_train)
    # end
# end

In [151]:
# Show the raw log text of record 1613 (print keeps its line breaks readable).
print(dict_id_content[1613]['raw'])

2022-05-17 08:20:47,017 | Failed at Play [deploy_vm_efi_paravirtual_vmxnet3] *********
2022-05-17 08:20:47,017 | TASK [Wait for message 'Autoinstall is completed.' appear in VM log serial-20220517070407.log] 
task path: /home/worker/workspace/Ansible_Cycle_Photon_3.x_Update/ansible-vsphere-gos-validation/common/vm_wait_log_msg.yml:35
fatal: [localhost]: FAILED! => {
    "attempts": 720,
    "censored": "the output has been hidden due to the fact that 'no_log: true' was specified for this result",
    "changed": false
}
2022-05-17 08:21:26,017 | TASK [Testing exit due to failure] *************************
task path: /home/worker/workspace/Ansible_Cycle_Photon_3.x_Update/ansible-vsphere-gos-validation/common/test_rescue.yml:55
fatal: [localhost]: FAILED! => Exit testing when 'exit_testing_when_fail' is set to True in test case deploy_vm_efi_paravirtual_vmxnet3



In [156]:
# Spot check: cosine similarity between the embeddings of records 1701 and 1705.
cosine(dict_id_embedding[1701], dict_id_embedding[1705])

0.987351

In [159]:
# Train records ranked by similarity to test record 1613, most similar first.
# Each row is (train id, similarity, target, raw text).
sorted(
    (
        (
            id_train,
            sim,
            dict_id_content[id_train]['target'],
            dict_id_content[id_train]['raw'],
        )
        for id_train, sim in dict_sim_test_train[1613].items()
    ),
    key=lambda row: -row[1],
)

[(1384,
  0.97300106,
  'targetvm',
  "2021-11-30 08:22:24,030 | Failed at Play [check_ip_address] **************************\n2021-11-30 08:22:24,030 | TASK [Execute powershell command in Windows guest] *********\ntask path: /home/worker/workspace/Ansible_Cycle_Windows_Server_LTSC_vNext/ansible-vsphere-gos-validation/windows/utils/win_execute_cmd.yml:18\nfatal: [localhost]: UNREACHABLE! => basic: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))"),
 (1321,
  0.9651437,
  'targetvm',
  '\n2021-12-01 18:18:32,001 | Failed at Play [lsilogic_vhba_device_ops] ******************\n2021-12-01 18:18:32,001 | TASK [Wait for VMware Tools collecting guest info] *********\ntask path: /home/worker/workspace/Ansible_Regression_RHEL_8.x/ansible-vsphere-gos-validation/common/vm_wait_guest_ip.yml:22\nfatal: [localhost]: FAILED! => hardware configuration table'),
 (1322,
  0.9641511,
  'targetvm',
  '2021-12-01 19:29:30,001 | Failed at Play [nvme_vhba_device_ops] ***********