In [17]:
pip install -U sentence-transformers utils pandas openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [18]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [27]:
import pandas as pd 

df = pd.read_excel(r"cybersecurity_questions_answers.xlsx")
question_answers = df.to_dict(orient='records')

print(question_answers)

[{'Question': 'What is cybersecurity?', 'Answer': 'Cybersecurity is the practice of protecting systems, networks, and programs from digital attacks.'}, {'Question': 'Why is cybersecurity important?', 'Answer': 'Cybersecurity is important because it protects sensitive data from unauthorized access and cyber threats.'}, {'Question': 'What are the common types of cyber threats?', 'Answer': 'Common types of cyber threats include malware, phishing, ransomware, and denial-of-service (DoS) attacks.'}, {'Question': 'What is a firewall?', 'Answer': 'A firewall is a network security device that monitors and filters incoming and outgoing network traffic based on security rules.'}, {'Question': 'What is phishing?', 'Answer': 'Phishing is a type of cyber attack where attackers pose as legitimate entities to steal sensitive information such as login credentials.'}, {'Question': 'What is malware?', 'Answer': 'Malware is malicious software designed to disrupt, damage, or gain unauthorized access to co

In [28]:
qu = pd.read_excel("questions.xlsx") 
questions_to_answer = qu.to_dict(orient='records')
print(questions_to_answer)

[{'Question': 'What is cybersecurity?', 'Answer': nan}, {'Question': 'Why is cybersecurity important?', 'Answer': nan}, {'Question': 'What are the common types of cyber threats?', 'Answer': nan}, {'Question': 'What is a firewall?', 'Answer': nan}, {'Question': 'What is phishing?', 'Answer': nan}, {'Question': 'What is malware?', 'Answer': nan}, {'Question': 'What are the best practices for password management?', 'Answer': nan}, {'Question': 'What is encryption?', 'Answer': nan}, {'Question': 'What is a VPN?', 'Answer': nan}, {'Question': 'How can you protect against ransomware?', 'Answer': nan}]


In [29]:
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [40]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Extract sentences from the question_answers
sentences = [item['Question'] for item in question_answers]

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
question_answer_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
question_answer_embeddings = F.normalize(question_answer_embeddings, p=2, dim=1)



In [41]:
from transformers import AutoTokenizer, AutoModel

import torch.nn.functional as F

# Extract input sentences for tokenization
input_questions = [row["Question"] for row in questions_to_answer]

# Tokenize sentences
encoded_questions = tokenizer(input_questions, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    questions_model_output = model(**encoded_questions)

# Perform pooling
question_embeddings = mean_pooling(questions_model_output, encoded_questions['attention_mask'])

# Normalize embeddings
question_embeddings = F.normalize(question_embeddings, p=2, dim=1)

print(question_embeddings)

tensor([[-0.0299,  0.0093, -0.1318,  ..., -0.0320,  0.0615, -0.0100],
        [ 0.0170,  0.0260, -0.0422,  ..., -0.0312,  0.0258, -0.0510],
        [ 0.0315, -0.0462, -0.0111,  ..., -0.0403,  0.0151,  0.0470],
        ...,
        [-0.0396,  0.0358, -0.0932,  ...,  0.0762,  0.0708, -0.0731],
        [-0.0997,  0.0071, -0.0198,  ..., -0.0067,  0.0482,  0.0020],
        [-0.1075,  0.1236, -0.0403,  ..., -0.0833,  0.0895,  0.0155]])


In [49]:
questions_filled = []
questions_filled_similarity_scores = []

for idx, question_embedding in enumerate(question_embeddings):
    similarity_scores = {}
    for idx, question_answer_embedding in enumerate(question_answer_embeddings):
        similarity = pytorch_cos_sim(question_embedding, question_answer_embedding)
        similarity_scores[idx] = similarity.item()

    # sorted_similarity_scores = dict(sorted(similarity_scores.items(), key=lambda item: item[1]))
    sorted_similarity_scores = dict(sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True))
    first_key = next(iter(sorted_similarity_scores))
    questions_filled.append(question_answers[first_key]['Answer'])
    questions_filled_similarity_scores.append(round(sorted_similarity_scores[first_key],3))

In [50]:
# importing the module
import pandas as pd

# creating the DataFrame
questions_with_answers_export = pd.DataFrame({
    'Question': input_questions,
    'Similiarity': questions_filled_similarity_scores,
    'Answer': questions_filled,
})

# determining the name of the file
file_name = 'questions_filled.xlsx'

# saving the excel
questions_with_answers_export.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')


DataFrame is written to Excel File successfully.
