<a href="https://colab.research.google.com/github/dineshrayella/NLP/blob/main/Lab7_3_DINESH_2403A54099.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Corpus used by every later cell: 20 short, politics-themed sentences,
# one string per document.
documents = [
    "Election results announced.",
    "Government passed bill.",
    "Prime minister spoke.",
    "Parties started campaign.",
    "Parliament session begins.",
    "Opposition criticized policy.",
    "Citizens cast votes.",
    "Cabinet approved reform.",
    "President addressed nation.",
    "Polling dates announced.",
    "Ruling party won.",
    "Debate on security.",
    "Lawmakers discussed budget.",
    "New tax introduced.",
    "Rally attracted supporters.",
    "Minister resigned today.",
    "Constitution guarantees rights.",
    "Senate passed bill.",
    "Foreign policy discussed.",
    "Campaign focused development."
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Build a bag-of-words (raw term count) matrix from the `documents` list
# defined in the first cell.

# English stop words are removed. max_df=0.95 ignores terms that occur in more
# than 95% of the documents; min_df=1 keeps every other term.
vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words="english")

# Learn the vocabulary and produce the sparse document-term count matrix
bow = vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the columns of `bow`
feature_names = vectorizer.get_feature_names_out()

# Dense DataFrame for readability: one row per document, one column per term
df_bow = pd.DataFrame(bow.toarray(), columns=feature_names)

# CountVectorizer produces raw counts, not TF-IDF weights, so label it as such.
print("Bag of Words (Count) Representation:")
print(df_bow)

Bag of Words (TF-IDF) Representation:
    addressed  announced  approved  attracted  begins  budget  cabinet  \
0           0          1         0          0       0       0        0   
1           0          0         0          0       0       0        0   
2           0          0         0          0       0       0        0   
3           0          0         0          0       0       0        0   
4           0          0         0          0       1       0        0   
5           0          0         0          0       0       0        0   
6           0          0         0          0       0       0        0   
7           0          0         1          0       0       0        1   
8           1          0         0          0       0       0        0   
9           0          1         0          0       0       0        0   
10          0          0         0          0       0       0        0   
11          0          0         0          0       0       0        0   


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Pairwise cosine similarity between the rows (documents) of the bag-of-words frame
cosine_sim_matrix = cosine_similarity(df_bow)

# Short axis labels D1..Dn, one per entry of `documents`
document_labels = ["D" + str(position) for position in range(1, len(documents) + 1)]

# Wrap the raw matrix in a labelled DataFrame so it prints readably
df_cosine_sim = pd.DataFrame(
    data=cosine_sim_matrix,
    index=document_labels,
    columns=document_labels,
)

print("Cosine Similarity Matrix:")
print(df_cosine_sim)

Cosine Similarity Matrix:
           D1   D2        D3        D4   D5        D6   D7   D8   D9  \
D1   1.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D2   0.000000  1.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D3   0.000000  0.0  1.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D4   0.000000  0.0  0.000000  1.000000  0.0  0.000000  0.0  0.0  0.0   
D5   0.000000  0.0  0.000000  0.000000  1.0  0.000000  0.0  0.0  0.0   
D6   0.000000  0.0  0.000000  0.000000  0.0  1.000000  0.0  0.0  0.0   
D7   0.000000  0.0  0.000000  0.000000  0.0  0.000000  1.0  0.0  0.0   
D8   0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0  1.0  0.0   
D9   0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  1.0   
D10  0.333333  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D11  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D12  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0   
D13  0.000000  0.0  0.000000  0.000000

In [4]:
from sklearn.metrics import jaccard_score
import numpy as np
import pandas as pd

# Convert the BOW counts to binary presence/absence (1 = term occurs in the document)
binary_bow = (df_bow > 0).astype(int)

# Jaccard similarity of two documents = |shared terms| / |terms in either document|.
# It is computed for all pairs at once with array operations instead of a nested
# Python loop that pulls two DataFrame rows per pair.
num_documents = binary_bow.shape[0]
presence = binary_bow.to_numpy()

# The dot product of two 0/1 rows counts the terms present in both documents
intersection_sizes = presence @ presence.T

# Inclusion-exclusion: |A or B| = |A| + |B| - |A and B|
terms_per_document = presence.sum(axis=1)
union_sizes = (
    terms_per_document[:, None] + terms_per_document[None, :] - intersection_sizes
)

# Divide only where the union is non-empty; pairs of two empty term sets keep
# the pre-filled 0.0 (same convention as before: empty vs. empty scores 0).
jaccard_sim_matrix = np.divide(
    intersection_sizes,
    union_sizes,
    out=np.zeros((num_documents, num_documents)),
    where=union_sizes > 0,
)

# Convert to DataFrame for better readability (rows/columns labelled by document text)
df_jaccard_sim = pd.DataFrame(jaccard_sim_matrix, index=documents, columns=documents)

print("Jaccard Similarity Matrix:")
print(df_jaccard_sim)

Jaccard Similarity Matrix:
                                 Election results announced.  \
Election results announced.                              1.0   
Government passed bill.                                  0.0   
Prime minister spoke.                                    0.0   
Parties started campaign.                                0.0   
Parliament session begins.                               0.0   
Opposition criticized policy.                            0.0   
Citizens cast votes.                                     0.0   
Cabinet approved reform.                                 0.0   
President addressed nation.                              0.0   
Polling dates announced.                                 0.2   
Ruling party won.                                        0.0   
Debate on security.                                      0.0   
Lawmakers discussed budget.                              0.0   
New tax introduced.                                      0.0   
Rally attract

In [5]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

# Make sure the NLTK data used below is available: look each resource up
# locally and download its package only when the lookup fails.
for _resource_path, _package_name in (
    ("corpora/wordnet", "wordnet"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)
def document_wordnet_similarity(doc1, doc2):
    """Score how similar ``doc1`` is to ``doc2`` using WordNet path similarity.

    Both documents are lower-cased and tokenized, and every token is expanded
    to all of its WordNet synsets. For each synset of ``doc1`` the best path
    similarity against any synset of ``doc2`` is taken, and those best scores
    are averaged.

    Note: the score is directional. Only the synsets of ``doc1`` are iterated
    and averaged, so ``f(a, b)`` and ``f(b, a)`` can differ.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.

    Returns:
        float: Mean of the per-synset best similarities, or ``0.0`` when
        either document yields no synsets at all.
    """
    synsets1 = [s for token in word_tokenize(doc1.lower()) for s in wordnet.synsets(token)]
    synsets2 = [s for token in word_tokenize(doc2.lower()) for s in wordnet.synsets(token)]

    # Nothing to compare if either side has no WordNet entries
    if not synsets1 or not synsets2:
        return 0.0

    best_scores = []
    for s1 in synsets1:
        # Pairs for which path_similarity returns None are skipped
        comparable = [
            sim
            for sim in (s1.path_similarity(s2) for s2 in synsets2)
            if sim is not None
        ]
        # 0.0 is the floor when no pair is comparable
        best_scores.append(max([0.0, *comparable]))

    # synsets1 is non-empty here, so best_scores always has at least one entry;
    # convert to a plain float so both return paths have the same type.
    return float(np.mean(best_scores))
# Pairwise WordNet similarity over the global `documents` list
num_documents = len(documents)

# Start from the identity matrix so each document scores 1.0 against itself;
# every off-diagonal cell is then filled from document_wordnet_similarity.
wordnet_sim_matrix = np.eye(num_documents)

for i in range(num_documents):
    for j in range(num_documents):
        if i != j:
            wordnet_sim_matrix[i, j] = document_wordnet_similarity(documents[i], documents[j])

# Label both axes with the document text for display
df_wordnet_sim = pd.DataFrame(
    data=wordnet_sim_matrix,
    index=documents,
    columns=documents,
)

print("WordNet Similarity Matrix (Path Similarity):")
print(df_wordnet_sim)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


WordNet Similarity Matrix (Path Similarity):
                                 Election results announced.  \
Election results announced.                         1.000000   
Government passed bill.                             0.200163   
Prime minister spoke.                               0.197888   
Parties started campaign.                           0.218342   
Parliament session begins.                          0.213562   
Opposition criticized policy.                       0.149573   
Citizens cast votes.                                0.182783   
Cabinet approved reform.                            0.161558   
President addressed nation.                         0.159737   
Polling dates announced.                            0.346627   
Ruling party won.                                   0.209560   
Debate on security.                                 0.181530   
Lawmakers discussed budget.                         0.128577   
New tax introduced.                                 0.27098