In [28]:
import pandas as pd
import gensim.downloader as api
import csv
import random

In [35]:
# Methods
def select_answer(model, guess, choice1, choice2, choice3, choice4):
    """Return the candidate answer most similar to ``guess``.

    Args:
        model: Word-vector model supporting ``word in model`` and
            ``model.similarity(word_a, word_b)``.
        guess: Reference word every candidate is compared against.
        choice1, choice2, choice3, choice4: The four candidate answers.

    Returns:
        The candidate with the highest similarity to ``guess``. When several
        candidates tie, the earliest one wins.

    Raises:
        KeyError: If none of the four candidates is in the model's vocabulary
            (or if ``model.similarity`` rejects ``guess``).
    """
    # model.similarity raises KeyError for words outside the vocabulary, so
    # only candidates the model actually knows are scored.
    known_choices = [
        candidate
        for candidate in (choice1, choice2, choice3, choice4)
        if candidate in model
    ]
    if not known_choices:
        raise KeyError('none of the choices are present in the model vocabulary')

    # max() returns the first maximal element, i.e. the earliest candidate on ties.
    return max(known_choices, key=lambda candidate: model.similarity(guess, candidate))

def random_guess(choice1, choice2, choice3, choice4):
    """Pick one of the four candidate answers at random and return it."""
    options = (choice1, choice2, choice3, choice4)
    picked_index = random.randint(0, len(options) - 1)
    return options[picked_index]

def create_model_details(model_name):
    """Create (or overwrite) a per-question results CSV and write its header.

    Args:
        model_name: Path of the CSV file to create. An existing file at this
            path is truncated.
    """
    # The csv module requires newline='' so the writer's own line terminators
    # are not translated a second time (which produces blank rows on Windows).
    with open(model_name, 'w', newline='') as f:
        csv.writer(f).writerow(['question', 'answer', 'guess', 'label'])

def test_model(model_name, model, questions, answers, choice1, choice2, choice3, choice4):
    """Run the synonym test for one model and append one result row per question.

    For each question found in the model, the model's nearest neighbour of the
    question word is compared against the four candidates via select_answer.
    When the question is not in the model, or the candidates cannot be scored
    (select_answer raises KeyError because of out-of-vocabulary words), one of
    the candidates is picked at random and the row is labelled 'guess'.

    Args:
        model_name: Path of the results CSV; rows are appended to it.
        model: Word-vector model supporting ``word in model`` and
            ``most_similar``, plus whatever select_answer needs.
        questions: Question words.
        answers: Expected answers, parallel to ``questions``.
        choice1, choice2, choice3, choice4: Candidate answers, each parallel
            to ``questions``.

    Returns:
        Tuple ``(correct_guesses, random_guesses)``: the number of questions
        answered correctly by the model and the number answered by random
        guessing.
    """
    correct_guesses = 0
    random_guesses = 0

    # Open the results file once for the whole run instead of once per
    # question; newline='' is what the csv module expects for writer targets.
    with open(model_name, 'a', newline='') as f:
        writer = csv.writer(f)

        for i, question in enumerate(questions):
            answer = answers[i]
            options = (choice1[i], choice2[i], choice3[i], choice4[i])

            if question in model:
                # NOTE(review): the candidates are compared with the question's
                # nearest neighbour rather than with the question word itself;
                # confirm this two-step lookup is the intended methodology.
                guess = model.most_similar(question)[0][0]
                try:
                    choice = select_answer(model, guess, *options)
                except KeyError:
                    # Candidates outside the vocabulary cannot be scored; fall
                    # back to a random guess instead of aborting the run.
                    label = 'guess'
                else:
                    label = 'correct' if choice == answer else 'wrong'
            else:
                label = 'guess'

            if label == 'guess':
                choice = random_guess(*options)
                random_guesses += 1
            elif label == 'correct':
                correct_guesses += 1

            writer.writerow([question, answer, choice, label])

    return correct_guesses, random_guesses

def write_model_analysis(model_name, size_of_vocab, number_of_correct, number_of_non_random_guess, accuracy):
    """Append one model's summary row to 'analysis.csv'.

    Args:
        model_name: Name of the evaluated model.
        size_of_vocab: Number of words in the model's vocabulary.
        number_of_correct: Number of questions answered correctly.
        number_of_non_random_guess: Number of questions answered without
            resorting to a random guess.
        accuracy: Accuracy value to record for the model.
    """
    # newline='' keeps the csv writer's line terminators from being translated
    # again by the text layer (which produces blank rows on Windows).
    with open('analysis.csv', 'a', newline='') as f:
        csv.writer(f).writerow(
            [model_name, size_of_vocab, number_of_correct, number_of_non_random_guess, accuracy]
        )

# Analysis
# Start a fresh 'analysis.csv' holding only the header row; any existing file
# is truncated. newline='' is what the csv module expects for writer targets
# (without it, blank rows appear on Windows).
with open('analysis.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    field = ['model_name', 'size_of_vocab', 'number_of_correct', 'number_of_non_random_guess', 'accuracy']
    writer.writerow(field)

### Task 1 - Evaluation of the word2vec-google-news-300 Pre-trained Model

In [30]:
# Word2Vec Google News model, fetched through gensim's downloader API
wv_google = api.load('word2vec-google-news-300')
# CSV file that receives this model's per-question results
file_name_0 = 'word2vec-google-news-300.csv'

In [31]:
# Load the synonym test set and split it into parallel Python lists.
df = pd.read_csv('synonym.csv')
questions, answers = (df[column].tolist() for column in ('question', 'answer'))

# The four candidate answers are stored in the columns named '0' through '3'.
choice1, choice2, choice3, choice4 = (
    df[column].tolist() for column in ('0', '1', '2', '3')
)

#### 1. Guessing

In [32]:
# Run the synonym test with the Google News model.
# create_model_details writes a fresh header row to the results file, then
# test_model appends one row per question and returns the number of correct
# answers and the number of random guesses.
create_model_details(file_name_0)
correct_guesses_google300, random_guesses_google300 = test_model(file_name_0, wv_google, questions, answers, choice1, choice2, choice3, choice4)

#### 2. Analysis

In [36]:
# Word2Vec google news 300 model: summarise the run and append it to analysis.csv
model_name_google300 = 'word2vec-google-news-300'
size_of_vocab_google300 = len(wv_google.key_to_index)
number_of_correct_google300 = correct_guesses_google300
number_of_non_random_guess = len(questions) - random_guesses_google300
# Accuracy is measured over the questions answered without a random guess.
# Report 0.0 instead of raising ZeroDivisionError when every question was guessed.
accuracy_google300 = (
    correct_guesses_google300 / number_of_non_random_guess
    if number_of_non_random_guess
    else 0.0
)

write_model_analysis(model_name_google300, size_of_vocab_google300, number_of_correct_google300, number_of_non_random_guess, accuracy_google300)

### Task 2 - Comparison with other pre-trained models

In [37]:
# 1. Two additional models trained on different corpora but with the same embedding size
# NOTE(review): 'word2vec-ruscorpora-300' looks like a Russian-language corpus;
# confirm its vocabulary can match the (presumably English) words in synonym.csv.
wv_ruscorpora = api.load('word2vec-ruscorpora-300')
glove_wiki = api.load('glove-wiki-gigaword-300')

# 2. Two additional models trained on the same corpus but with different embedding sizes
glove_twitter50 = api.load('glove-twitter-50')
glove_twitter200 = api.load('glove-twitter-200')

### Task 3 - Train own models