# Cosine similarity

In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def tokenize_text(text):
    """Lower-case ``text`` and return its word tokens with punctuation removed.

    Apostrophes are kept so that a token such as "actor's" stays in one piece.
    """
    return re.findall(r"[a-z0-9']+", text.lower())


# Define the documents and their contents
documents = {
    'Document 1': "Political unrest leads to protests.",
    'Document 2': "New technology breakthrough announced.",
    'Document 3': "Team wins championship in a thrilling sports event.",
    'Document 4': "Popular actor's new movie release.",
    'Document 5': "Stock market experiences sharp rise.",
}

# Tokenize every document once; the tokens feed both the vocabulary and the counts
doc_tokens = {name: tokenize_text(content) for name, content in documents.items()}

# Create vocabulary from the documents.
# A sorted list (not a set) gives a reproducible column order, and pandas
# refuses a set for `columns=` / `index=`.
vocabulary = sorted({token for tokens in doc_tokens.values() for token in tokens})

# Create BoW vectors for each document.
# Counting tokens (rather than str.count on the raw text) avoids substring
# hits such as "a" inside "market" or "in" inside "wins".
bow_vectors = [
    [tokens.count(word) for word in vocabulary]
    for tokens in doc_tokens.values()
]
print("Document Bow vectors are", bow_vectors)

# Convert BoW vectors to DataFrame (rows = documents, columns = vocabulary terms)
bow_df = pd.DataFrame(bow_vectors, columns=vocabulary, index=list(documents))
print("Document Bow Dataframe", bow_df)

# Define user interests.
# NOTE: an interest only contributes if it is an exact vocabulary token.
# "politics", "entertainment" and "finance" do not occur in any document above,
# so User 1 and User 3 end up with all-zero profiles (handled below).
user_interests = {
    'User 1': {'politics'},
    'User 2': {'technology', 'sports'},
    'User 3': {'entertainment', 'finance'},
}

# Create user profiles as BoW vectors over the same vocabulary order as bow_df
user_profiles = {
    user: [1 if word in interests else 0 for word in vocabulary]
    for user, interests in user_interests.items()
}
print("User profile bow vectors", user_profiles)

# Convert user profiles to DataFrame (rows = users, columns = vocabulary terms)
user_profiles_df = pd.DataFrame(user_profiles, index=vocabulary).T
print("user_profiles Dataframe", user_profiles_df)

# Calculate cosine similarity between user profiles and documents
similarities = cosine_similarity(user_profiles_df.values, bow_df.values)

# Create DataFrame for similarity scores (rows = users, columns = documents)
similarity_df = pd.DataFrame(similarities, index=user_profiles_df.index, columns=bow_df.index)

# Recommend articles based on highest similarity scores.
# A user whose best score is 0 matched nothing; record None instead of letting
# argmax fall back to the first document.
recommendations = {}
for user, row in similarity_df.iterrows():
    recommendations[user] = row.idxmax() if row.max() > 0 else None

print("Recommendations:")
for user, article in recommendations.items():
    print(f"{user}: {article if article is not None else 'No matching document'}")

# Jaccard similarity

In [None]:
import numpy as np
from sklearn.metrics import jaccard_score
# Binary Bag of Words (BoW) vector for each document, keyed by document name
bow_vectors = {
    'Document 1': [1, 1, 1, 0, 0, 0, 0, 0],  # Politics
    'Document 2': [0, 1, 0, 1, 1, 0, 0, 0],  # Technology
    'Document 3': [0, 0, 1, 1, 0, 1, 0, 0],  # Sports
    'Document 4': [0, 0, 0, 0, 0, 1, 1, 1],  # Entertainment
    'Document 5': [0, 0, 0, 0, 1, 0, 1, 1],  # Finance
}

# Stack the vectors into a single array, one row per document
binary_arrays = np.array([vector for vector in bow_vectors.values()])

# Jaccard similarity for every ordered pair of distinct documents,
# computed with scikit-learn's jaccard_score
jaccard_scores = {
    (name_a, name_b): jaccard_score(vector_a, vector_b)
    for name_a, vector_a in bow_vectors.items()
    for name_b, vector_b in bow_vectors.items()
    if name_a != name_b  # skip comparing a document with itself
}

# Report the score of each document pair
print("Jaccard Similarity Scores:")
for doc_pair, similarity in jaccard_scores.items():
    print(f"{doc_pair}: {similarity}")

# Euclidean distance

In [None]:
import numpy as np
# Define the Bag of Words (BoW) vectors of documents
bow_vectors = {
    'Document 1': [1, 1, 1, 0, 0, 0, 0, 0],  # BoW vector for Document 1 (Politics)
    'Document 2': [0, 1, 0, 1, 1, 0, 0, 0],  # BoW vector for Document 2 (Technology)
    'Document 3': [0, 0, 1, 1, 0, 1, 0, 0],  # BoW vector for Document 3 (Sports)
    'Document 4': [0, 0, 0, 0, 0, 1, 1, 1],  # BoW vector for Document 4 (Entertainment)
    'Document 5': [0, 0, 0, 0, 1, 0, 1, 1],  # BoW vector for Document 5 (Finance)
}
# Convert BoW vectors to a 2-D numpy array: row k belongs to the k-th document.
# The subtraction below must run on these rows; plain Python lists do not
# support `-` and would raise TypeError.
bow_arrays = np.array(list(bow_vectors.values()))
doc_names = list(bow_vectors)
# Calculate Euclidean distance between every ordered pair of distinct documents
euclidean_distances = {}
for i, doc1 in enumerate(doc_names):
    for j, doc2 in enumerate(doc_names):
        if i != j:  # Exclude comparing a document with itself
            difference = bow_arrays[i] - bow_arrays[j]
            euclidean_distances[(doc1, doc2)] = float(np.linalg.norm(difference))
# Print Euclidean distances
print("Euclidean Distances:")
for pair, distance in euclidean_distances.items():
    print(f"{pair}: {distance}")