# Word2Vec

## Building the Vocabulary

### Imports

In [1]:
import xlrd
import pandas as pd 
import sys
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api
from nltk.tokenize import RegexpTokenizer
import numpy as np


from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/drive


In [2]:
f1 = ("/content/drive/MyDrive/Colab Notebooks/Data/assignment3_data.xlsx")
wb = xlrd.open_workbook(f1)
sheet_data = wb.sheet_by_index(0)

PM_qual = []

for i in range(1, sheet_data.nrows):
  temp_txt = sheet_data.cell_value(i, 0).lower()
  PM_qual.append(temp_txt)

len(PM_qual)

38

### Scoring

In [3]:
f1 = ("/content/drive/MyDrive/Colab Notebooks/Data/assignment3_gold_labels.xlsx")
wb = xlrd.open_workbook(f1)
data = wb.sheet_by_index(0)

actual_data = []

for i in range(2, data.nrows):
  temp_txt = data.cell_value(i, 0).lower()
  actual_data.append(temp_txt)

len(actual_data)

10

In [4]:
tokenizer = RegexpTokenizer("[\w']+")

actual_words = []
for i in range(len(actual_data)):
  text = actual_data[i]
  tokens = tokenizer.tokenize(text)
  for j in tokens:
    syn = wn.synsets(j)

    if syn!=[] and j not in stopwords.words('english'):
      if j not in actual_words:
          actual_words.append(j)

In [5]:
best_score = 0
for i in actual_words:
  for j in actual_words:
    syn1 = wordnet.synsets(i)[0]
    syn2 = wordnet.synsets(j)[0]
    temp_sim = syn1.wup_similarity(syn2)
    if temp_sim == None:
      temp_sim = 0
    best_score = best_score + temp_sim
best_score = best_score/(len(actual_words)**2)

In [6]:
def score(predicted_words):
  pred_score = 0
  for i in actual_words:
    for j in predicted_words:
      syn1 = wordnet.synsets(i)[0]
      syn2 = wordnet.synsets(j)[0]
      temp_sim = syn1.wup_similarity(syn2)
      if temp_sim == None:
        temp_sim = 0
      pred_score = pred_score + temp_sim
  return (pred_score/(len(actual_words)*len(predicted_words)))/best_score

### Pre-processing

Could have added lemmatization here, but did not affect the dataset significantly which shows that almost all words were different.

In [7]:
data = []
# iterate through each sentence in the file
for sentence in PM_qual:
    words = []    
    for word in tokenizer.tokenize(sentence):
      if word not in stopwords.words('english'):  # tokenize the sentence into words
        words.append(word.lower()) 
    data.append(words)

In [8]:
phrase =[]

for sentence in data:
  non_words = []
  for i in range(len(sentence)-1):
    t =  str(sentence[i]+'_'+sentence[i+ 1])
    if wn.synsets(t) and t not in sentence:
      sentence.append(t)
      non_words.append(sentence[i])
      non_words.append(sentence[i+1])
  for j in non_words:
    sentence.remove(j)

### Defining Functions for KMeans Trial

In [9]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/

In [10]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [11]:
def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

## Model A

In [12]:
model1 = gensim.models.Word2Vec(data, min_count = 1, window = 30, sg = 0)

### Experiment A1 - simply associating words related to the word "qualities"

In [13]:
simple_list = list(model1.wv.most_similar('qualities', topn=10))

In [14]:
simple_list

[('kindness', 0.22251810133457184),
 ('understanding', 0.20999625325202942),
 ('polite', 0.1904253363609314),
 ('exceptionally', 0.18115949630737305),
 ('systems', 0.17610587179660797),
 ('compassion', 0.16027334332466125),
 ('president', 0.15612711012363434),
 ('course', 0.15486463904380798),
 ('people', 0.15219995379447937),
 ('management', 0.14876441657543182)]

In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(simple_list[i][0])
score(predicted_words)

0.6852536130699876

### Experiment A2 - KMeans

In [None]:
X = model1[model1.wv.vocab]
vectorized_docs = vectorize(data, model=model1)

clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=7,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    #"text": docs,
    "tokens": [" ".join(text) for text in data]
    #"cluster": cluster_labels
})

For n_clusters = 7
Silhouette coefficient: 0.07
Inertia:0.0061209110723436716
Silhouette values:
    Cluster 2: Size:2 | Avg:0.45 | Min:0.45 | Max: 0.45
    Cluster 5: Size:2 | Avg:0.12 | Min:0.01 | Max: 0.22
    Cluster 3: Size:29 | Avg:0.05 | Min:-0.12 | Max: 0.18
    Cluster 0: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 1: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 6: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 4: Size:2 | Avg:-0.01 | Min:-0.05 | Max: 0.03


  """Entry point for launching an IPython kernel.


In [None]:
print("Most representative terms per cluster (based on centroids):")
tokens_per_cluster = ""
most_representative = model1.wv.most_similar(positive=[clustering.cluster_centers_[1]], topn=10)
for t in most_representative:
  tokens_per_cluster += f"{t[0]} "
print(f"Cluster {1}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 1: good character born run mature leader idiot equality prejudice bravery 


In [None]:
predicted_words = []
model_1_words = tokenizer.tokenize(tokens_per_cluster)
for i in range(10):
  predicted_words.append(model_1_words[0])
score(predicted_words)

0.9631362409433815

## Model B

In [None]:
model2 = api.load("glove-wiki-gigaword-100")



### Expermient B1 - Simply associating words related to the word "quality"

In [None]:
simple_list = list(model2.wv.most_similar('qualities', topn=100))

pred_list = []
for i in simple_list:
  for sentence in data:
    for word in sentence:
      if i[0] == word and i[0] not in pred_list:
        pred_list.append(i[0])

pred_list = pred_list[:10]

  """Entry point for launching an IPython kernel.


In [None]:
pred_list

['skills',
 'ability',
 'experience',
 'empathy',
 'honesty',
 'passion',
 'humility',
 'bravery',
 'knowledge',
 'determination']

In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(pred_list[i])
score(predicted_words)

1.1945283950116705

### Experiment B2 - similarity b/w ques data and vocab words

In [None]:
ques = sheet_data.cell_value(0, 0).lower()

In [None]:
ques_tokens = []

for i in tokenizer.tokenize(ques):
  syn = wn.synsets(i)
  if i not in stopwords.words('english') and syn[0].pos() in ['n','a','s']:
    ques_tokens.append(i)

for i in range(len(ques_tokens)-1):
  t = str(ques_tokens[i]+'_'+ques_tokens[i+1])
  if wn.synsets(t):
    ques_tokens.append(t)

In [None]:
word_list = []
for sentence in data:
  for word in sentence:
    syn = wn.synsets(word)
    if syn and word not in word_list and word not in stopwords.words('english') and syn[0].pos() in ['n','a','s']:
      word_list.append(word)

In [None]:
sim_score_list = []
for word in word_list:
    t = 0
    for token in ques_tokens:
      if word in model2.vocab and token in model2.vocab:
        t = t + model2.wv.similarity(w1 = word, w2 = token)
    sim_score_list.append([word,t/len(ques_tokens)])

sorted_sim_scores = sorted(sim_score_list, key = lambda x: x[1])
sorted_sim_scores.reverse()

  


In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(sorted_sim_scores[i][0])

score(predicted_words)

0.8388161606401232

In [None]:
pd.DataFrame(sorted_sim_scores, columns =['word', 'sim score']).head(10)

Unnamed: 0,word,sim score
0,well,0.444672
1,must,0.438611
2,good,0.430333
3,country,0.421318
4,leadership,0.412627
5,sure,0.411185
6,making,0.407892
7,think,0.407853
8,time,0.405538
9,hard,0.401611


In [None]:
pd.DataFrame(sorted_sim_scores, columns =['word', 'sim score']).tail(10)

Unnamed: 0,word,sim score
105,decisiveness,0.090159
106,sociology,0.08688
107,idiot,0.083248
108,prejudices,0.082516
109,underprivileged,0.064474
110,approachable,0.047592
111,trustworthiness,0.039461
112,ulterior,0.03745
113,problem_solving,0.0
114,common_sense,0.0


### Experiment B3 - repeating above with expanded vocab - top 10 similar words of each word in question and in data vocab

In [None]:
sim_word_list = []

for word in word_list:
  if word in model2.vocab:
    t = []
    for i in model2.wv.most_similar(word,topn=10):
      if i not in stopwords.words('english'):
        t.append(i[0])
    sim_word_list.append([word, *t])

  


In [None]:
sim_ques_list = []
for word in ques_tokens:
  if word in model2.vocab:
    t = []
    for i in model2.wv.most_similar(word,topn=10):
      if i not in stopwords.words('english'):
        t.append(i[0])
    sim_ques_list.append([word, *t])

  """


In [None]:
ext_sim_score_list = []
for i in sim_word_list:
  t = 0
  for j in i:
    for k in sim_ques_list:
      for l in k:
        t = t + model2.wv.similarity(w1 = j, w2 = l)
  ext_sim_score_list.append([i[0],t/(len(i)*len(sim_ques_list)*len(k))])

sorted_ext_sim_scores = sorted(ext_sim_score_list, key = lambda x: x[1])
sorted_ext_sim_scores.reverse()

  import sys


In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(sorted_ext_sim_scores[i][0])

score(predicted_words)

0.7240026123071596

In [None]:
pd.DataFrame(sorted_ext_sim_scores, columns =['word', 'sim score']).head(10)

Unnamed: 0,word,sim score
0,must,0.473679
1,well,0.458662
2,able,0.453295
3,making,0.44689
4,time,0.436441
5,good,0.434484
6,work,0.430493
7,working,0.428832
8,together,0.428684
9,sure,0.428365


In [None]:
pd.DataFrame(sorted_ext_sim_scores, columns =['word', 'sim score']).tail(10)

Unnamed: 0,word,sim score
103,eloquent,0.086578
104,agreeable,0.069391
105,selfless,0.068666
106,ulterior,0.058634
107,trustworthiness,0.047423
108,prejudices,0.045726
109,underprivileged,0.043557
110,decisiveness,0.040302
111,approachable,0.019465
112,idiot,0.007794


### Experiment B4  - Finding the words in given data set that are most similar with other words and words similar ot those words (using model2) of the data set

In [None]:
ext_sim_score_list_2 = []
for i in sim_word_list:
  t = 0
  for j in sim_word_list:
    for k in j:
      t = t + model2.wv.similarity(w1 = i[0], w2 = k)
  ext_sim_score_list_2.append([i[0],t/(11*len(sim_word_list))])

sorted_ext_sim_scores_2 = sorted(ext_sim_score_list_2, key = lambda x: x[1])
sorted_ext_sim_scores_2.reverse()

  


In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(sorted_ext_sim_scores_2[i][0])

score(predicted_words)

0.8749974263419144

In [None]:
pd.DataFrame(sorted_ext_sim_scores_2, columns =['word', 'sim score']).head(10)

Unnamed: 0,word,sim score
0,good,0.466828
1,well,0.460021
2,think,0.436241
3,work,0.432424
4,sure,0.432111
5,experience,0.426766
6,making,0.422035
7,must,0.420952
8,time,0.419104
9,hard,0.412615


In [None]:
pd.DataFrame(sorted_ext_sim_scores_2, columns =['word', 'sim score']).tail(10)

Unnamed: 0,word,sim score
103,agreeable,0.152763
104,idiot,0.139266
105,decisiveness,0.126121
106,u,0.12077
107,underprivileged,0.117896
108,negotiator,0.113906
109,approachable,0.111753
110,sociology,0.111632
111,trustworthiness,0.111473
112,ulterior,0.081883


### Experiment B5 - Every quality that we desire would be the most associated word with similar words of the words of the data set. This is because in Wiki corpus, a word representing "quality" would be around words associated with "quality" which result as similar words to the "qualities"

In [None]:
ass_words = []

for i in word_list:
  if i in model2.vocab:
    for j in model2.wv.most_similar(i,topn=10):
      if j not in stopwords.words('english'):
        ass_words.append(j[0])

  """


In [None]:
ass_word_sim = []
for i in word_list:
  t = 0
  for j in ass_words:
    if i in model2.vocab and j in model2.vocab:
      t = t + model2.wv.similarity(w1 = i, w2 = j)
  ass_word_sim.append([i,t])

ass_word_scores = sorted(ass_word_sim, key = lambda x: x[1])
ass_word_scores.reverse()

  


In [None]:
predicted_words = []
for i in range(10):
  predicted_words.append(ass_word_scores[i][0])

score(predicted_words)

0.9420551197550543

In [None]:
pd.DataFrame(ass_word_scores, columns =['word', 'sim score']).head(10)

Unnamed: 0,word,sim score
0,good,525.266888
1,well,518.793941
2,think,492.378579
3,sure,487.746686
4,work,487.295682
5,experience,479.904685
6,making,476.445741
7,must,474.747539
8,time,473.562529
9,lot,465.305


In [None]:
pd.DataFrame(ass_word_scores, columns =['word', 'sim score']).tail(10)

Unnamed: 0,word,sim score
105,decisiveness,140.430669
106,u,135.497159
107,underprivileged,131.20513
108,negotiator,126.762331
109,approachable,124.952654
110,trustworthiness,123.728177
111,sociology,123.640358
112,ulterior,91.804203
113,problem_solving,0.0
114,common_sense,0.0


### Experiment B6 - KMeans

In [None]:
X = model2[model2.wv.vocab]
vectorized_docs = vectorize(data, model=model2)

clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=13,
    mb=1000,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    #"text": docs,
    "tokens": [" ".join(text) for text in data]
    #"cluster": cluster_labels
})

  """Entry point for launching an IPython kernel.


For n_clusters = 13
Silhouette coefficient: 0.07
Inertia:87.95091373169515
Silhouette values:
    Cluster 2: Size:2 | Avg:0.38 | Min:0.32 | Max: 0.44
    Cluster 12: Size:8 | Avg:0.13 | Min:0.03 | Max: 0.22
    Cluster 3: Size:5 | Avg:0.11 | Min:0.06 | Max: 0.15
    Cluster 10: Size:2 | Avg:0.09 | Min:0.07 | Max: 0.11
    Cluster 11: Size:2 | Avg:0.08 | Min:0.06 | Max: 0.09
    Cluster 8: Size:4 | Avg:0.06 | Min:-0.08 | Max: 0.14
    Cluster 4: Size:3 | Avg:0.01 | Min:0.00 | Max: 0.03
    Cluster 0: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 6: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 7: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 9: Size:1 | Avg:0.00 | Min:0.00 | Max: 0.00
    Cluster 5: Size:3 | Avg:-0.02 | Min:-0.11 | Max: 0.04
    Cluster 1: Size:5 | Avg:-0.06 | Min:-0.12 | Max: 0.04




In [None]:
print("Most representative terms per cluster (based on centroids):")
tokens_per_cluster = ""
most_representative = model2.wv.most_similar(positive=[clustering.cluster_centers_[2]], topn=10)
for t in most_representative:
    tokens_per_cluster += f"{t[0]} "
print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 7: integrity honesty professionalism competence fairness respect courage determination dignity credibility 


  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
predicted_words = []
model_2_words = tokenizer.tokenize(tokens_per_cluster)
for i in range(10):
  predicted_words.append(model_2_words[0])
score(predicted_words)

1.1030320061357328