In [None]:
# Cosine Similarity – Text Similarity Metric

In [None]:
# Text similarity measures how close two text documents are to each other in terms of their context or meaning.

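# For vectors with no negative entries (such as word counts), the cosine similarity of two documents will range from 0 to 1.
# If the cosine similarity score is 1, it means the two vectors have the same orientation. A value closer to 0 indicates
# that the two documents have less similarity. (For general vectors, cosine similarity can range from -1 to 1.)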

# cos(theta) = similarity(A, B) = (A⋅B) / (∥A∥ ∥B∥)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Normalise both sentences to lower case before vectorizing
doc_1, doc_2 = (
    "Data is the oil of the digital economy".lower(),
    "Data is a new oil".lower(),
)

# Step 2: Learn one shared vocabulary and turn each sentence into a row of word counts
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform([doc_1, doc_2])

# Step 3: Compare the first row against the second row
cosine_sim = cosine_similarity(vectors[0], vectors[1])

# Report the single score held in the result
print("Cosine Similarity:", cosine_sim[0, 0])


Cosine Similarity: 0.4743416490252569


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define the corpus of four short sentences, then lower-case every entry
docs = [
    "Data is the oil of the digital economy",
    "Data is a new oil",
    "Big data is transforming business",
    "The new oil in the digital economy is data",
]
docs = list(map(str.lower, docs))

# Step 2: Fit a bag-of-words model on the corpus; each row of `vectors` holds one document's word counts
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(docs)
feature_names = vectorizer.get_feature_names_out()

# Step 3: Pairwise cosine similarity between every pair of document rows
cosine_sim_matrix = cosine_similarity(vectors)

# Show the learned vocabulary (one entry per unique token)
print("Feature Names (Unique Words):", feature_names)

# Show the dense document-by-token count matrix
print("\nBag-of-Words Matrix:")
print(vectors.toarray())

# Show the document-by-document similarity matrix
print("Cosine Similarity Matrix:\n", cosine_sim_matrix)


Feature Names (Unique Words): ['big' 'business' 'data' 'digital' 'economy' 'in' 'is' 'new' 'of' 'oil'
 'the' 'transforming']

Bag-of-Words Matrix:
[[0 0 1 1 1 0 1 0 1 1 2 0]
 [0 0 1 0 0 0 1 1 0 1 0 0]
 [1 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 1 1 1 1 1 1 0 1 2 0]]
Cosine Similarity Matrix:
 [[1.         0.47434165 0.28284271 0.85811633]
 [0.47434165 1.         0.4472136  0.60302269]
 [0.28284271 0.4472136  1.         0.26967994]
 [0.85811633 0.60302269 0.26967994 1.        ]]


In [None]:
# Semantic Similarity

In [None]:
# Dimensionality reduction techniques like TruncatedSVD (Singular Value Decomposition) can
# help capture underlying semantic relationships in the text by reducing the number of features
# (dimensions) while preserving the most important information. Applied to TF-IDF vectors, this is
# commonly known as latent semantic analysis (LSA). This process is often used to improve the performance of
# similarity measures by focusing on the most significant components.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define the corpus. Explicit lower-casing is optional here because
# TfidfVectorizer lower-cases its input by default.
docs = [
    "Data is the oil of the digital economy",
    "Data is a new oil",
    "Big data is transforming business",
    "The new oil in the digital economy is data"
]

# Convert documents to lowercase
docs = [doc.lower() for doc in docs]

# Step 2: Tokenize and vectorize using TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(docs)

# Step 3: Dimensionality reduction using TruncatedSVD (optional but helps in capturing semantic relationships).
# TruncatedSVD uses a randomized solver by default, so random_state is pinned to make
# the reduced vectors reproducible on every run.
svd = TruncatedSVD(n_components=2, random_state=42)
reduced_vectors = svd.fit_transform(tfidf_vectors)

# Step 4: Calculate cosine similarity between the reduced document vectors.
# Reduced components may be negative, so scores here are not restricted to the 0-to-1 range.
cosine_sim_matrix = cosine_similarity(reduced_vectors)

# Output the cosine similarity matrix
print("Cosine Similarity Matrix:\n", cosine_sim_matrix)


Cosine Similarity Matrix:
 [[1.         0.79242453 0.02266794 0.99745732]
 [0.79242453 1.         0.62777587 0.83387996]
 [0.02266794 0.62777587 1.         0.0938583 ]
 [0.99745732 0.83387996 0.0938583  1.        ]]


In [None]:
# Document 1 and Document 2 have a fairly high similarity score of 0.79242453, indicating that they share similar semantic content
# (Document 1 and Document 4 are the closest pair, at 0.99745732).
# Document 3 and Document 4 have a much lower similarity score of 0.0938583, indicating that they are far less similar than those pairs.