Imports

In [1]:
import gensim.downloader as api
import pandas as pd
import numpy as np
from IPython.core.display import HTML
import time
import sys

Load "word2vec-google-news-300" pre-trained embedding model

In [3]:
# Hint: `api.info()` returns information about the models/datasets that the
# gensim downloader exposes; print it to browse the alternatives.

# Load the pre-trained Word2Vec embeddings through the gensim downloader API.
model_name = "word2vec-google-news-300"
model = api.load(model_name)

# Sanity check: show the three words the model ranks closest to "enormously".
print("Test purpose - finding similar word")
sanity_neighbours = model.most_similar("enormously", topn=3)
print(sanity_neighbours)

Test purpose - finding similar word
[('immensely', 0.8415359258651733), ('tremendously', 0.8185792565345764), ('hugely', 0.7856297492980957)]


Load "synonyms.csv" dataset

In [4]:
# Read the synonym test set (question word, correct answer, four candidate
# words in columns "0".."3") from the working directory.
synonyms_path = "synonyms.csv"
df = pd.read_csv(synonyms_path)

# Render the complete frame as an HTML table in the cell output.
synonyms_table_html = df.to_html()
HTML(synonyms_table_html)

Unnamed: 0,question,answer,0,1,2,3
0,enormously,tremendously,appropriately,uniquely,tremendously,decidedly
1,provisions,stipulations,stipulations,interrelations,jurisdictions,interpretations
2,haphazardly,randomly,dangerously,densely,randomly,linearly
3,prominent,conspicuous,battered,ancient,mysterious,conspicuous
4,zenith,pinnacle,completion,pinnacle,outset,decline
5,flawed,imperfect,tiny,imperfect,lustrous,crude
6,urgently,desperately,typically,conceivably,tentatively,desperately
7,consumed,eaten,bred,caught,eaten,supplied
8,advent,coming,coming,arrest,financing,stability
9,concisely,succinctly,powerfully,positively,freely,succinctly


Compute the closest synonym for each question word in the "synonyms.csv" dataset using the model's cosine-similarity method

In [53]:
# For every question in `df`, let the model pick the candidate word that is
# most similar to the question word, and label the outcome:
#   "correct" - the model's pick equals the answer in synonyms.csv
#   "wrong"   - the model picked a different candidate
#   "guess"   - the model could not make an informed pick (see below)

# Columns of synonyms.csv that hold the four candidate (guess) words.
candidate_columns = ["0", "1", "2", "3"]

# One record per question. The records are collected in a list and converted
# to a DataFrame once after the loop instead of growing the frame row by row.
detail_records = []

# iterate over the dataset row by row
for index, row in df.iterrows():
    question = row["question"]
    answer = row["answer"]

    # Cosine similarity between the question word and every candidate word
    # that exists in the model's vocabulary. Candidates are only scored when
    # both the question word and the answer word are known to the model.
    similarity_by_candidate = {}
    if (question in model) and (answer in model):
        for column in candidate_columns:
            candidate = row[column]
            if candidate in model:
                similarity_by_candidate[candidate] = model.similarity(question, candidate)

    if similarity_by_candidate:
        # Candidate with the highest similarity; on ties the candidate from
        # the earliest column wins (same result as a stable descending sort).
        guess_word = max(similarity_by_candidate, key=similarity_by_candidate.get)
        label = "correct" if guess_word == answer else "wrong"
    else:
        # Either the question/answer word is missing from the vocabulary, or
        # none of the four candidates is in it. The second case previously
        # crashed with StopIteration when taking the first item of an empty
        # dictionary; it is now reported as a guess as well.
        label = "guess"
        guess_word = ""

    # record: question word, answer word, word picked by the model, label
    detail_records.append([question, answer, guess_word, label])

# Build the results frame in one step, keeping the row labels of `df`.
details_df = pd.DataFrame(
    detail_records,
    columns=['question', 'answer', 'guess', 'label'],
    index=df.index,
)

# display new dataframe
HTML(details_df.to_html())

Unnamed: 0,question,answer,guess,label
0,enormously,tremendously,tremendously,correct
1,provisions,stipulations,stipulations,correct
2,haphazardly,randomly,randomly,correct
3,prominent,conspicuous,conspicuous,correct
4,zenith,pinnacle,pinnacle,correct
5,flawed,imperfect,imperfect,correct
6,urgently,desperately,desperately,correct
7,consumed,eaten,eaten,correct
8,advent,coming,coming,correct
9,concisely,succinctly,succinctly,correct


In [61]:
# Summarise the model's performance on the synonym test in a one-row frame:
# model name, vocabulary size, number of correct answers (C), number of
# questions answered without guessing (V), and the accuracy C / V.

vocab_size = len(model)     # number of entries in the model (reported as corpus_size)

# Series mapping each label ("correct" / "wrong" / "guess") to its frequency.
# A label that never occurs is simply absent from the result.
num_labels_dict = details_df["label"].value_counts()

# Use .get(..., 0) so that a label with zero occurrences counts as 0 instead
# of raising KeyError (e.g. when the model answers every question correctly).
num_correct_labels = int(num_labels_dict.get("correct", 0))
num_wrong_labels = int(num_labels_dict.get("wrong", 0))

# V: questions the model answered without guessing (correct + wrong)
V = num_correct_labels + num_wrong_labels

# Accuracy C / V; undefined (NaN) when the model had to guess every question.
accuracy_model_val = num_correct_labels / V if V else float("nan")

# Build the one-row analysis frame in a single step.
analysis_df = pd.DataFrame(
    [[model_name, vocab_size, num_correct_labels, V, accuracy_model_val]],
    columns=['model_name', 'corpus_size', '#correct_labels', '#questions_answered_no_guess', 'accuracy_model'],
)

# display new dataframe
HTML(analysis_df.to_html())

Unnamed: 0,model_name,corpus_size,#correct_labels,#questions_answered_no_guess,accuracy_model
0,word2vec-google-news-300,3000000,70,79,0.886076


Save data obtained to csv files

In [62]:
# Write the per-question results (question, answer, guess, label) to
# "<model_name>-details.csv", without the index column.
file_name_details = f"{model_name}-details.csv"
details_df.to_csv(file_name_details, index=False)

# Write the one-row model summary to "analysis.csv", without the index column.
file_name_analysis = "analysis.csv"
analysis_df.to_csv(file_name_analysis, index=False)