<a href="https://colab.research.google.com/github/explrA/Precog-Tasks/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Task 1.1:**
Semantic Similarity Using Unlabeled Training Data (Using Monolingual English Corpus)

In [1]:
!pip install nltk
!pip install pandas



In [2]:
import nltk
import re
import pandas as pd
import sklearn

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [26]:
def data_preprocessing(tr_data):
  with open(tr_data, "r") as f:
    f_read = f.read()
  f_lower = f_read.lower()

  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  w = f_lower.split()
  filtered_tokens = [word for word in w if word not in stop_words]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in filtered_tokens]
  lemmatized_text = ' '.join(lemmatized_tokens)

  clean_text = [re.sub("[^a-z]", " ", str(lemmatized_text))]

  return clean_text


def get_test_vec(word, v_dict):
  if word in v_dict:
    return v_dict[word]
  else:
    return None

doc1 = data_preprocessing('/content/train_a.txt')
doc2 = data_preprocessing('/content/train_b.txt')

train_docs = [doc1, doc2]
# print(train_docs)

flat_train_docs = []
for i in train_docs:
  flat_train_docs.append(i[0])
# print(flat_train_docs)

vec = TfidfVectorizer(lowercase=False)
transfrmd_vec = vec.fit_transform(flat_train_docs).toarray()
# print(transfrmd_vec)
print(transfrmd_vec.shape)

vocab = vec.get_feature_names_out()
# print(vocab)
# print(vec.vocabulary_)
vocab_dict = {v: transfrmd_vec[:, n] for n,v in enumerate(vocab)}
frst = next(iter(vocab_dict.items()))
# print(frst)


read_data = pd.read_csv('/content/test_data.txt', sep='\t',encoding='UTF-8')
df = pd.DataFrame(read_data)
df_words = df[df.columns[0:2]]


semantic_sim = []

for i in range(len(df_words.index)):
  word1 = df_words.iloc[i, 0]
  vec1 = get_test_vec(word1, vocab_dict)

  word2 = df_words.iloc[i, 1]
  vec2 = get_test_vec(word2, vocab_dict)

  if vec1 is not None and vec2 is not None:
    sim = cosine_similarity([vec1], [vec2])[0][0]
    print(word1, word2, sim)
    semantic_sim.append(sim)
  else:
    print("oov")

(2, 41578)
old new 0.9911710777669999
smart intelligent 0.8994876211638214
hard difficult 0.9995093838429554
happy cheerful 0.9981230998000451
hard easy 0.9885066822960615
fast rapid 0.9705676694014539
happy glad 0.984661628605691
short long 0.9959299973586289
stupid dumb 0.9861936216721012
weird strange 0.998997175427375
wide narrow 0.9316841609494746
bad awful 0.9936424688671894
easy difficult 0.9832867169779113
bad terrible 0.9448146878094205
hard simple 0.956181406051893
smart dumb 0.9975169111189717
insane crazy 0.8195527614844813
happy mad 0.7188240521432995
large huge 0.9944183414950505
hard tough 0.9808848294426507
new fresh 0.9991312005432449
sharp dull 0.9760976243736185
quick rapid 0.9947993461094456
dumb foolish 0.7943636673735341
wonderful terrific 0.9858547194650364
strange odd 0.9711478737436138
happy angry 0.9952706209355672
narrow broad 0.9913562956237036
simple easy 0.9894525864233348
old fresh 0.9958356647750597
apparent obvious 0.9854018423450213
inexpensive cheap 0

In [27]:
ground_truth_values = df['SimLex999'].tolist()

simlex_threshold = 5
cosine_sim_threshold = 0.5

ground_truth_labels = [1 if val > simlex_threshold else 0 for val in ground_truth_values]
cosine_sim_labels = [1 if val > cosine_sim_threshold else 0 for val in semantic_sim]

preds = sum(1 for g, c in zip(ground_truth_labels, cosine_sim_labels) if g == c)
acc = (preds / len(ground_truth_values))
print(acc)

0.44344344344344344


### **Task 1.2:**
Semantic Similarity Using Pre-Trained Model (Word2Vec Pre-Trained On Google News)

In [4]:
!pip install gensim
!pip install pandas
!pip install nltk



In [5]:
import pandas as pd
import sklearn
import numpy as np

from gensim.downloader import load
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
w2v_model = load('word2vec-google-news-300')



In [7]:
def get_word_vec(word):
  try:
    return w2v_model[word]

  except KeyError:
    return [0] * w2v_model.vector_size


In [21]:
read_data = pd.read_csv('/content/test_data.txt', sep='\t',encoding='UTF-8')
df = pd.DataFrame(read_data)
df_words = df[df.columns[0:2]]

semantic_sim = []

for i in range(len(df_words.index)):
  word1 = df_words.iloc[i, 0]
  vec1 = get_word_vec(word1)

  word2 = df_words.iloc[i, 1]
  vec2 = get_word_vec(word2)

  if (np.all(vec1!= 0)) and (np.all(vec2 != 0)):
    sim = cosine_similarity([vec1], [vec2])[0][0]
    semantic_sim.append(sim)
    print(word1,word2,sim)
  else:
    print("oov")

old new 0.22278029
smart intelligent 0.6495278
hard difficult 0.6025749
happy cheerful 0.38377386
hard easy 0.47096327
fast rapid 0.47668296
happy glad 0.74088925
short long 0.57684326
stupid dumb 0.81731385
weird strange 0.8164579
wide narrow 0.45760995
bad awful 0.5527253
easy difficult 0.5890916
bad terrible 0.68286115
hard simple 0.2591452
smart dumb 0.5792695
insane crazy 0.73390436
happy mad 0.39202875
large huge 0.65891665
hard tough 0.63428825
new fresh 0.4445604
sharp dull 0.3003645
quick rapid 0.49779084
dumb foolish 0.6149927
wonderful terrific 0.7420832
strange odd 0.73074275
happy angry 0.37493232
narrow broad 0.4575769
simple easy 0.59903
old fresh 0.1422162
apparent obvious 0.70720327
inexpensive cheap 0.7009896
nice generous 0.3340648
weird normal 0.27713367
weird odd 0.6381519
bad immoral 0.32526675
sad funny 0.48797843
wonderful great 0.76478696
guilty ashamed 0.4027084
beautiful wonderful 0.68540865
confident sure 0.47928122
dumb dense 0.1498453
large big 0.5561479
n

In [24]:
ground_truth_values = df['SimLex999'].tolist()

simlex_threshold = 5
cosine_sim_threshold = 0.5

ground_truth_labels = [1 if val > simlex_threshold else 0 for val in ground_truth_values]
cosine_sim_labels = [1 if val > cosine_sim_threshold else 0 for val in semantic_sim]

preds = sum(1 for g, c in zip(ground_truth_labels, cosine_sim_labels) if g == c)
acc = (preds / len(ground_truth_values))
print(acc)

0.6306306306306306
