In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages used by the preprocessing step: the Punkt
# tokenizer models, the stop-word lists and the WordNet database.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not
    purely alphabetic, or whose lowercase form is an English stop word, are
    discarded; the remaining tokens are lowercased and lemmatized with
    ``WordNetLemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives the filtering).
    """
    # Build the stop-word set once. Calling stopwords.words() for every
    # token rebuilds the whole word list and then scans it linearly, which
    # makes preprocessing needlessly slow on longer documents.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# Sample input: a fictional memo in Markdown-style plain text. It is passed
# to getKeywords() further down to demonstrate keyword extraction.
document = """
**Confidential Memo**

---

To: All Department Heads

From: Office of Administration

Date: April 26, 2024

Subject: Implementation of New Security Protocol

---

Dear Department Heads,

We are writing to inform you of the upcoming implementation of a new security protocol aimed at enhancing the safety and confidentiality of our organization's sensitive information. This protocol will be effective starting May 1, 2024.

Key Points of the Protocol:

1. **Biometric Access Control**: Access to designated secure areas will now require biometric authentication in addition to traditional keycard access. Please ensure that all employees requiring access to these areas are registered in the biometric system by April 30, 2024.

2. **Encryption Policy**: All electronic communication containing sensitive information must be encrypted using the latest encryption standards. This includes emails, file transfers, and instant messages. Failure to comply may result in disciplinary action.

3. **Visitor Protocol**: A stricter visitor registration process will be implemented. All visitors must be pre-approved and escorted at all times while on the premises. Visitor access will be restricted to designated areas only.

4. **Security Awareness Training**: Mandatory security awareness training sessions will be conducted for all employees. These sessions will cover topics such as recognizing phishing attempts, password security, and physical security best practices.

5. **Incident Reporting**: Any security incidents or breaches must be reported immediately to the IT Security team. Prompt reporting is crucial for mitigating potential risks and minimizing the impact of security incidents.

Please disseminate this information to all staff members within your respective departments and ensure full compliance with the new security protocol. We appreciate your cooperation in maintaining the security and integrity of our organization's operations.

Should you have any questions or require further clarification, please do not hesitate to contact the Office of Administration.

Sincerely,

[Signature]

Office of Administration
"""
def getKeywords(doc):
    """Rank the terms of a single document by their TF-IDF weight.

    The document is first normalized with ``preprocess_text`` and then
    vectorized on its own with a fresh ``TfidfVectorizer``.

    NOTE: because the vectorizer is fitted on exactly one document, every
    term gets the same IDF factor, so the ranking effectively reflects
    L2-normalized term frequency within that document.

    Args:
        doc: Raw document text.

    Returns:
        A list of ``(term, score)`` tuples ordered from highest to lowest
        score. An empty list is returned when preprocessing leaves no term
        the vectorizer can use (e.g. empty or stop-word-only input).
    """
    # Preprocess the document
    preprocessed_doc = preprocess_text(doc)

    # TF-IDF vectorization of the single preprocessed document
    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_doc])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the text
        # contains no token it can index; report "no keywords" instead of
        # crashing the caller.
        return []

    # Vocabulary terms, index-aligned with the columns of the matrix
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Scores of the only row (the single input document)
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Pair each term with its score, highest score first
    keywords = [(feature_names[i], tfidf_scores[i]) for i in tfidf_scores.argsort()[::-1]]

    return keywords

# Extract the ranked (term, score) pairs from the sample memo.
keywords = getKeywords(document)

# Number of highest-scoring terms to display.
top_keywords = 10
for keyword, score in keywords[:top_keywords]:
    # One "term - score" line per keyword.
    print(keyword, score, sep=" - ")

[nltk_data] Downloading package punkt to C:\Users\STORM
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\STORM
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\STORM
[nltk_data]     Tech\AppData\Roaming\nltk_data...


security - 0.5244044240850758
protocol - 0.28603877677367767
access - 0.2383656473113981
visitor - 0.19069251784911848
biometric - 0.14301938838683884
information - 0.14301938838683884
must - 0.14301938838683884
new - 0.14301938838683884
office - 0.14301938838683884
please - 0.14301938838683884


In [12]:
import spacy
import numpy as np
from numpy.linalg import norm

# Load the spaCy pipeline "en_core_web_lg"; the vectors it attaches to parsed
# text are what find_matching_tags() compares with cosine similarity.
# NOTE(review): presumably the en_core_web_lg package must be installed
# beforehand for this call to succeed — confirm in the environment setup.
nlp = spacy.load("en_core_web_lg")

def find_matching_tags(keywords, tags):
    """Score candidate tags by their semantic similarity to weighted keywords.

    Each tag's score is the sum, over all keywords, of the cosine similarity
    between the keyword's and the tag's spaCy vectors multiplied by the
    keyword's weight.

    Args:
        keywords: Iterable of ``(keyword, weight)`` pairs.
        tags: Candidate tag strings.

    Returns:
        A list of ``(tag, score)`` tuples sorted by descending score. Tags or
        keywords whose vector has zero norm contribute nothing to the scores.
    """
    tag_scores = {tag: 0 for tag in tags}

    # Embed every tag once up front: the tag vectors do not depend on the
    # keyword, so re-running the pipeline inside the keyword loop is wasted
    # work. Tags with a zero-norm vector are left out because cosine
    # similarity is undefined for them (0/0 would yield NaN).
    tag_vectors = []
    for tag in tags:
        tag_embedding = nlp(tag).vector
        tag_norm = norm(tag_embedding)
        if tag_norm > 0:
            tag_vectors.append((tag, tag_embedding, tag_norm))

    # Accumulate the weighted similarity of every keyword onto each tag
    for keyword, score in keywords:
        keyword_embedding = nlp(keyword).vector
        keyword_norm = norm(keyword_embedding)
        if keyword_norm == 0:
            # No usable vector for this keyword; skipping it avoids turning
            # every tag score into NaN.
            continue
        for tag, tag_embedding, tag_norm in tag_vectors:
            similarity_score = keyword_embedding.dot(tag_embedding) / (keyword_norm * tag_norm)
            tag_scores[tag] += similarity_score * score

    # Sort tags by relevance score, most relevant first
    sorted_tags = sorted(tag_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_tags

# Example usage: rank a few candidate tags against the `keywords` list
# produced by getKeywords(document).
tags = ["mail", "secure", "Digital", "banana"]

# Result is a list of (tag, score) pairs, highest score first.
matching_tags = find_matching_tags(keywords, tags)
print(matching_tags)


[('secure', 3.0043031950763393), ('Digital', 1.7573857407628375), ('mail', 1.4642101860308727), ('banana', -0.10331752296194005)]
