Imports

In [9]:
import gensim.downloader as api
import pandas as pd
import numpy as np
from IPython.core.display import HTML
from sklearn.decomposition import PCA
from matplotlib import pyplot
import os

Show all available pre-trained dataset from Gensim

In [10]:
available_models = api.info()['models'].keys()
for model in available_models:
    print(model)

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


Load pre-trained embedding model

In [11]:
def load_model(model_name):    
    # obtain pre-trained Word2Vec model
    print("...Model Loading...")
    model = api.load(model_name)
    print("...Model Loaded...")
    
    return model

Load dataset

In [12]:
def load_csv(csv_file):
    # Load synonyms.csv dataset
    df = pd.read_csv(csv_file)

    # display data frame
    HTML(df.to_html())
    
    return df

Compute the closest synonym for each word in loaded csv dataset using Similarity method

In [13]:
def details(df, model):
    # new dataframe created to store closest synonyms with the question word 
    # which is to be found from system model
    details_df = pd.DataFrame(columns=['question', 'answer', 'guess', 'label'])

    # iterate each data in dataframe by rows
    for index, row in df.iterrows():
        
        # create dictionary to store accuracy value 
        # for similarity between question-word and each available guess-words
        accuracy_dict = {}
        
        # assign each elements in row to appropriate variables
        question = row["question"]
        answer = row["answer"]
        guess_0 = row["0"]
        guess_1 = row["1"]
        guess_2 = row["2"]
        guess_3 = row["3"]
        
        # if question-word and answer-word from synonyms.csv found in word2vec model:
        if (question in model) and (answer in model):
            if guess_0 in model:
                accuracy = model.similarity(question, guess_0)  # calculate cosine similarity
                accuracy_dict[guess_0] = accuracy               # append both word and accuracy to dictionary
            if guess_1 in model:
                accuracy = model.similarity(question, guess_1)
                accuracy_dict[guess_1] = accuracy
            if guess_2 in model:
                accuracy = model.similarity(question, guess_2)
                accuracy_dict[guess_2] = accuracy
            if guess_3 in model:
                accuracy = model.similarity(question, guess_3)
                accuracy_dict[guess_3] = accuracy
            
            # sort the accuracy dictionary by keys with higher accuracy value (descending)
            sorted_accuracy_tuples = sorted(accuracy_dict.items(), key=lambda item: item[1], reverse=True)
            sorted_accuracy_dict = {key: value for key, value in sorted_accuracy_tuples}
            #print(sorted_accuracy_dict)
            
            # obtain the guess word with highest accuracy value (first element in sorted dictionary!)
            guess_word = next(iter(sorted_accuracy_dict))
            
            # check if word guessed by system is equal to the answer in synonyms.csv:
            if guess_word == answer:
                label = "correct"
            else:
                label = "wrong"
        
        # else if question-word and answer-word not found in word2vec model:
        else:
            label = "guess"
            guess_word = ""
        
        # append all the data (question-word, answer-word, guess-word by system, label) to new dataframe
        details_df.loc[index] = [question, answer, guess_word, label]

    return details_df

Save data obtained to csv files

In [15]:
def save_csv(details_df, model_name):
    cwd = os.getcwd()
    directory = cwd + "/outputs"

    # Create target Directory
    try:
        os.mkdir(directory)
        print("Output Directory Created ")
    except FileExistsError:
        print("Output Directory already exists")

    print("Files will be stored in directory: " + directory)
        
    # save model labels to -details.csv
    file_name_details = "-details.csv"
    full_path_details = directory + model_name + file_name_details
    details_df.to_csv(full_path_details, index=False)
    print("File: " + file_name_details + " stored!")

Main(): Load & Execute All Models

In [16]:
# Task 1: Load "word2vec-google-news-300" word2vec pre-trained model
# Task 2: Load 4 other English any pre-trained models
#   -   2 new models from different corpora but same embedding size
#   -   2 new models from the same corpus but different embedding sizes
model_names = [ 
    ["glove-twitter-200", "glove-wiki-gigaword-200"],
    ["glove-twitter-25", "glove-twitter-50"]
]

models = []

# Load synonyms.csv dataset
df = load_csv("synonyms.csv")

# Load pre-trained model, compute similarity, and output results to csv files
for index, task in enumerate(model_names):
    if index == 0:
        print("======== Executing Task 2(1): different corpora & same embedding size ========")
    else:
        print("======== Executing Task 2(2): same corpus & different embedding size ========")
    for model_name in task:
        print("--> '" + model_name + "'")
        
        # Load pre-trained model
        model = load_model(model_name)
        models.append(model)

        # create details dataframe which computes the 
        # closest synonym for each word in loaded csv dataset
        details_df = details(df, model)
        # display new dataframe
        HTML(details_df.to_html())

        # save both data to csv files
        save_csv(details_df, model_name)

--> 'glove-twitter-200'
...Model Loading...
...Model Loaded...
Output Directory already exists
Files will be stored in directory: c:\Users\rhina\OneDrive\Documents\GitHub\COMP472-Fall2021-A3/outputs/
File: -details.csv stored!
File: analysis.csv stored!
--> 'glove-wiki-gigaword-200'
...Model Loading...
...Model Loaded...
Output Directory already exists
Files will be stored in directory: c:\Users\rhina\OneDrive\Documents\GitHub\COMP472-Fall2021-A3/outputs/
File: -details.csv stored!
File: analysis.csv stored!
--> 'glove-twitter-25'
...Model Loading...
...Model Loaded...
Output Directory already exists
Files will be stored in directory: c:\Users\rhina\OneDrive\Documents\GitHub\COMP472-Fall2021-A3/outputs/
File: -details.csv stored!
File: analysis.csv stored!
--> 'glove-twitter-50'
...Model Loading...
...Model Loaded...
Output Directory already exists
Files will be stored in directory: c:\Users\rhina\OneDrive\Documents\GitHub\COMP472-Fall2021-A3/outputs/
File: -details.csv stored!
File: a

Plot Word Vectors using PCA

In [17]:
# fit a 2-dimensional PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

AttributeError: 'KeyedVectors' object has no attribute 'wv'