In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the preprocessed English task descriptions and drop rows without a description
df = (
    pd.read_csv("data/all_preprocessed_tasks_EN.csv")
    .dropna(subset=["description"])
    .reset_index()
)
# Keep only descriptions with more than 4 words (removes test descriptions)
data = df.loc[df["word_count"] > 4, "description"].tolist()
# Lower-case and whitespace-tokenize every kept description for Word2Vec
sentences_tokens = [description.lower().split() for description in data]
df.head()

Unnamed: 0,index,taskId,language,description,topic_id,word_count
0,0,9oqJmtbKXts6Rr9Szw4OIS,eng,cours client book swiss connect academi,,6
1,1,aOrgjKFodXC7uGMKqdMKMg,eng,empti respon,,2
2,2,a0pzxEfKq8c9D0dRZlQcm9,eng,write rule astronaut use condit sentenc exampl...,,20
3,3,9Hjn2yUwBcs7DZK6HARkE4,eng,guess frequent speak languag switzerland use m...,,14
4,4,6AYw9CEZMTN7LN8u0LfYVb,eng,complet sentenc go exampl plan tonight rain wa...,,16


> **Distribution of words:** Refer to 4_concat_data.ipynb

**CBOW** <br>
The CBOW model learns to predict a target word from all the words in its neighborhood. The context vectors are combined (summed or averaged) and the result is used to predict the target word.

In [3]:
# Build the Word2Vec CBOW model on the tokenized descriptions.
# TODO: tune `min_count` (drops words with very few occurrences) — plot the
#       word distribution first and make an informed decision.
# TODO: tune `window`, the maximum distance between the current and the
#       predicted word within a sentence (window=1 uses one word on each side
#       of the word to predict).
cbow_model = Word2Vec(
    sentences=sentences_tokens,
    vector_size=100,
    window=1,
    min_count=1,
)

In [4]:
# Train model
# The Word2Vec constructor already trained on `sentences_tokens`; this runs one more epoch.
# train() expects tokenized sentences (lists of words). Passing the raw strings in `data`
# would make gensim iterate over each string character by character.
cbow_model.train(sentences_tokens, total_examples=cbow_model.corpus_count, epochs=1)

word_vectors = cbow_model.wv.vectors
# length of word_vectors is equal to the number of words in the vocabulary
len(word_vectors), len(data)

(3399, 1445)

In [5]:
# We compute the embedding of the entire sentence as the average of the embeddings of its words. 
# One of the problems of this approach is that it ignores the order of the words in the sentences. 
# Another problem is that all words are given the same weight. 
# The third issue is information loss. 
# https://www.baeldung.com/cs/sentence-vectors-word2vec 
def meanEmbedding(model, words):
    """Return the unweighted average of the word vectors of ``words``.

    Parameters
    ----------
    model : object exposing ``model.wv`` (e.g. a trained gensim Word2Vec model)
        ``model.wv.key_to_index`` is used for vocabulary membership and
        ``model.wv[list_of_words]`` for the vector lookup.
    words : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean of the vectors of the in-vocabulary tokens, or an
        empty list when none of the tokens is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is O(1); the previous
    # lookup in the index_to_key list scanned the whole vocabulary per token.
    vocab = model.wv.key_to_index
    known_words = [word for word in words if word in vocab]
    if not known_words:
        # Kept as an empty list so existing callers see the same sentinel.
        return []
    return np.mean(model.wv[known_words], axis=0)

# Embed every description as the plain average of its in-vocabulary word vectors
df["meanEmbedding"] = df["description"].apply(
    lambda description: meanEmbedding(model=cbow_model, words=description.split())
)

In [16]:
# Weighted Averaging 
# Use TF-IDF score as the weight of each word (frequent words have a smaller TF-IDF score) 
# Solves problem of equal importance given to all words with normal averaging, but doesn't fix the order of words problem. 
def get_tfidf(data):
    """Fit a TF-IDF vectorizer on ``data`` and return its scores and vocabulary.

    Parameters
    ----------
    data : iterable of str
        One document (description) per element.

    Returns
    -------
    tuple
        ``(tfidf, tfidf_word_list)``: the matrix returned by
        ``TfidfVectorizer.fit_transform`` (one row per document, one column per
        kept term) and the feature names in column order.
    """
    english_stopwords = stopwords.words('english')
    # Terms are dropped when they are English stop words or fall outside the
    # max_df / min_df document-frequency bounds.
    vectorizer = TfidfVectorizer(
        stop_words=english_stopwords,
        min_df=0.01,
        max_df=0.9,
    )
    scores = vectorizer.fit_transform(data)
    return scores, vectorizer.get_feature_names_out()

def weightedMeanEmbedding(model, tfidf, tfidf_word_list, words, doc_idx):
    """Return the TF-IDF-weighted average of the word vectors of ``words``.

    Each in-vocabulary token's vector is multiplied by the TF-IDF score of that
    token in document ``doc_idx`` (0 when the token is not a TF-IDF feature),
    and the scaled vectors are averaged over the number of in-vocabulary tokens.

    Parameters
    ----------
    model : object exposing ``model.wv`` (e.g. a trained gensim Word2Vec model)
    tfidf : 2-D matrix indexable as ``tfidf[row, col]``
        TF-IDF scores, one row per document and one column per feature.
    tfidf_word_list : sequence of str
        Feature names in the column order of ``tfidf``.
    words : iterable of str
        Tokens of the document.
    doc_idx : int
        Row of ``tfidf`` that corresponds to this document.

    Returns
    -------
    numpy.ndarray or list
        The weighted mean vector, or an empty list when none of the tokens is
        in the model vocabulary.
    """
    # dict membership is O(1); the index_to_key list would be scanned per token
    vocab = model.wv.key_to_index
    known_words = [word for word in words if word in vocab]
    if not known_words:
        # Kept as an empty list so existing callers see the same sentinel.
        return []

    # Build the term -> column lookup once per call instead of converting the
    # feature array to a list and scanning it for every token.
    column_of = {term: col for col, term in enumerate(tfidf_word_list)}
    scores = np.array(
        [
            tfidf[doc_idx, column_of[word]] if word in column_of else 0
            for word in known_words
        ],
        dtype=float,
    )
    # Scale each word vector (row) by its score, then average over the rows.
    return np.mean(model.wv[known_words] * scores.reshape((-1, 1)), axis=0)


# TF-IDF scores are computed over every description in df, in row order
tfidf, tfidf_word_list = get_tfidf(df["description"].to_list())

# row.name is the row's index label; it is passed as the TF-IDF row of that description
df["weightedMeanEmbedding"] = df.apply(
    lambda row: weightedMeanEmbedding(
        model=cbow_model,
        tfidf=tfidf,
        tfidf_word_list=tfidf_word_list,
        words=row["description"].split(),
        doc_idx=row.name,
    ),
    axis=1,
)

In [None]:
# Deep averaging network (TODO: not implemented yet)
# It still ignores the syntax and the order of the words in a sentence.
# Despite its simplicity and lightness compared with syntactically-aware models
# such as recursive neural networks, deep averaging networks perform comparably.


**Clustering**

In [13]:
# Cluster the learned word vectors with k-means
word_vectors = cbow_model.wv.vectors

num_clusters = 5

# Initialize a k-means object. The seed and the number of restarts are pinned
# so that cluster assignments are reproducible across kernel restarts and
# scikit-learn versions.
kmeans_clustering = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Fit on the word vectors and get one cluster label per row of word_vectors
idx = kmeans_clustering.fit_predict(word_vectors)



In [37]:
# Map each vocabulary word to the cluster label assigned by k-means
word_centroid_map = dict(zip(cbow_model.wv.key_to_index, idx))

# Group the words by cluster in a single pass instead of rebuilding the
# key/value lists on every loop iteration.
words_by_cluster = {cluster: [] for cluster in range(num_clusters)}
for word, label in word_centroid_map.items():
    words_by_cluster.setdefault(int(label), []).append(word)

for cluster in range(num_clusters):
    # Print the cluster number
    print("\nCluster %d" % cluster)
    # Print all of the words assigned to that cluster
    words = words_by_cluster[cluster]
    print(words)


Cluster 0
['go', 'say', 'get', 'see', 'come', 'man', 'look', 'luke', 'find', 'day', 'chri', 'think', 'know', 'could', 'still', 'help', 'gwen', 'back', 'friend', 'two', 'eye', 'we', 'walk', 'bu', 'realli', 'ask', 'tell', 'around', 'put', 'run', 'start', 'dad', 'play', 'need', 'night', 'open', 'hear', 'home', 'gun', 'phone', 'sound', 'boy', 'laugh', 'big', 'let', 'stop', 'happen', 'dog', 'today', 'gordi', 'high', 'number', 'terribl', 'billi', 'fall', 'dylan', 'everyon', 'hand', 'door', 'mayb', 'sherlock', 'robert', 'footbal', 'danc', 'nois', 'thank', 'across', 'abbi', 'long', 'teddi', 'kitchen', 'upset', 'shruti', 'everi', 'aw', 'excit', 'forgot', 'noth', 'park', 'sing', 'villag', 'tunnel', 'mile', 'ok', 'floor', 'susan', 'must', 'aunt', 'suddenli', 'hate', 'stand', 'rememb', 'far', 'alreadi', 'oh', 'shout', 'hard', 'voyag', 'money', 'pizza', 'surpris', 'shop', 'scott', 'lucki', 'dark', 'moment', 'cow', 'pick', 'watch', 'mum', 'least', 'turn', 'game', 'trip', 'forget', 'light', 'arriv',

<hr>

**SKIPGRAM** <br> 
The SkipGram model, on the other hand, learns to predict a word based on a neighboring word. To put it simply, given a word, it learns to predict another word in its context.

In [22]:
# Word2Vec Skip Gram model (sg=1 selects skip-gram instead of CBOW)
# NOTE(review): `sentences` is not defined in any cell visible in this notebook
# (the earlier cells define `sentences_tokens`). Presumably it is left over from
# another session or corpus — confirm which corpus is intended before re-running.
sg_model = Word2Vec(sentences=sentences, min_count=10, vector_size=100, window=5, sg=1)

In [26]:
# sg_model.wv.key_to_index

In [27]:
# Ten nearest neighbours of "house" in the skip-gram embedding space
sims = sg_model.wv.most_similar(positive=['house'], topn=10)
sims

[('mango', 0.8317583799362183),
 ('cisneros,', 0.8169061541557312),
 ('sandra', 0.8010516166687012),
 ('justice', 0.7969511151313782),
 ('from:', 0.7728200554847717),
 ('bombed', 0.7447158694267273),
 ('law', 0.7269050478935242),
 ('2009', 0.7186321020126343),
 ('indian', 0.7171280384063721),
 ('jr.', 0.7159419059753418)]