<a href="https://colab.research.google.com/github/gunikagoyal/PRECOG-NLP-TASK/blob/main/Word%20Similarity/Glove_Elmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**GloVe word embeddings**

In [None]:
# Imports
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load GloVe word embeddings.
# Each line is parsed as "<word> <v1> <v2> ... <vN>" (whitespace separated).
glove_embeddings_path = "/content/glove.6B.200d.txt"
word_embeddings = {}
with open(glove_embeddings_path, 'r', encoding='utf-8') as file:
    for line in tqdm(file):
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        word_embeddings[values[0]] = np.array(values[1:], dtype='float32')

# Load word similarity dataset
test_data_path = "/content/SimLex-999.txt"
test_data = pd.read_csv(test_data_path, sep='\t')

# Predict similarity scores using GloVe embeddings.
# Pairs containing an out-of-vocabulary word get NaN rather than a made-up
# score of 0: a fabricated value would still be ranked by Spearman and skew
# the coefficient, whereas Series.corr leaves NaN pairs out of the computation.
predicted_scores = []
for word1, word2 in zip(test_data['word1'], test_data['word2']):
    if word1 in word_embeddings and word2 in word_embeddings:
        similarity_score = cosine_similarity([word_embeddings[word1]], [word_embeddings[word2]])[0][0]
    else:
        similarity_score = np.nan  # no embedding available for this pair

    predicted_scores.append(similarity_score)

# Make the evaluation coverage visible so the reported number can be interpreted.
predicted_series = pd.Series(predicted_scores, dtype='float64')
missing_pairs = int(predicted_series.isna().sum())
if missing_pairs:
    print(f"Skipped {missing_pairs} of {len(predicted_series)} pairs with out-of-vocabulary words")

# Evaluate using Spearman's rank correlation coefficient
true_scores = test_data['SimLex999'].values
correlation = predicted_series.corr(pd.Series(true_scores), method='spearman')

print("Spearman's Rank Correlation Coefficient using GloVe embeddings:", correlation)


208412it [00:17, 11812.08it/s]


Spearman's Rank Correlation Coefficient using GloVe embeddings: 0.34025352961510563


**ELMo model**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the ELMo module from TensorFlow Hub
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Read the tab-separated word similarity dataset
test_data_path = "/content/SimLex-999.txt"
test_data = pd.read_csv(test_data_path, delimiter="\t")

# Helper that runs a batch of sentences through the loaded ELMo module
def elmo_embeddings(sentences):
    """Return the ELMo output for a batch of sentences as a NumPy array.

    Args:
        sentences: The input strings (e.g. a list of sentences); converted to
            a tensor before being handed to the module.

    Returns:
        The "elmo" entry of the module's "default" signature output,
        converted with ``.numpy()``.
    """
    default_signature = elmo.signatures["default"]
    sentence_tensor = tf.convert_to_tensor(sentences)
    outputs = default_signature(sentence_tensor)
    return outputs["elmo"].numpy()

# Score every word pair with ELMo.
# Note: despite its name, elmo_embeddings_list collects one cosine similarity
# per pair, not the embeddings themselves.
elmo_embeddings_list = []
for word1, word2 in zip(test_data['word1'], test_data['word2']):
    # Turn each bare word into a minimal "sentence" by appending " ."
    pair_vectors = elmo_embeddings([word1 + ' .', word2 + ' .'])

    # Flatten each sentence's output into a single row vector.
    # NOTE(review): if the "elmo" output is per-token, the flattened vector
    # also contains the "." token's representation — confirm this is intended.
    flat_first = pair_vectors[0].reshape(1, -1)
    flat_second = pair_vectors[1].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for two row vectors
    elmo_embeddings_list.append(cosine_similarity(flat_first, flat_second)[0][0])

# Compare predicted similarities against the human ratings (Spearman rank correlation)
true_scores = test_data['SimLex999'].values
predicted_series = pd.Series(elmo_embeddings_list)
correlation = predicted_series.corr(pd.Series(true_scores), method='spearman')

print("Spearman's Rank Correlation Coefficient using ELMo embeddings:", correlation)


Spearman's Rank Correlation Coefficient using ELMo embeddings: 0.43267354384569995
