In [5]:
# Text processing libraries
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

# Numerical computing libraries
import numpy as np

# Machine learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Advanced NLP models
from sentence_transformers import SentenceTransformer

# Optionally, if using fuzzy matching


# Sample text database
text_database = [
    "The Eiffel Tower is located in Paris, France.",
    "The moon orbits the Earth every 27.3 days.",
    "COVID-19 vaccines are effective in preventing severe illness.",
    "Python is a popular programming language for data science.",
    "Climate change is impacting global weather patterns."
]

# Article to check
article = """
The Eiffel Tower, one of the most famous landmarks in the world, stands tall in the city of Paris.
Millions of tourists visit this iconic structure every year. Additionally, advancements in vaccines have shown
significant effectiveness in reducing severe cases of COVID-19.
"""


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Preprocess the database entries
processed_database = [preprocess_text(entry) for entry in text_database]

# Preprocess the article
processed_article = preprocess_text(article)


In [None]:
# Combine all texts for vectorization
all_texts = processed_database + [processed_article]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Compute cosine similarity between the article and each database entry
article_vector = tfidf_matrix[-1]  # The article is the last item
database_vectors = tfidf_matrix[:-1]  # All entries except the last

# Calculate similarity scores
cosine_similarities = cosine_similarity(article_vector, database_vectors).flatten()

# Output similarity scores
print("TF-IDF Cosine Similarity Scores:")
for idx, score in enumerate(cosine_similarities):
    print(f"Entry {idx + 1}: {score:.4f}")


In [None]:
# Initialize the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the texts
database_embeddings = model.encode(text_database)
article_embedding = model.encode(article)

# Compute cosine similarities
semantic_similarities = cosine_similarity(
    [article_embedding], database_embeddings
).flatten()

# Output similarity scores
print("\nSemantic Similarity Scores:")
for idx, score in enumerate(semantic_similarities):
    print(f"Entry {idx + 1}: {score:.4f}")
