In [69]:
import pandas as pd
from gensim.models import Word2Vec
import gensim.downloader as api
import csv
import random
import nltk
from nltk import sent_tokenize, word_tokenize


In [70]:
# Methods
def select_answer(model, guess, choice1, choice2, choice3, choice4):
    """Return the candidate that ``model`` rates most similar to ``guess``.

    Args:
        model: Object supporting ``word in model`` and
            ``model.similarity(word_a, word_b)``.
        guess: Reference word every candidate is compared against.
        choice1, choice2, choice3, choice4: The four candidate words.

    Returns:
        The candidate with the highest similarity to ``guess``. Ties go to the
        earliest candidate; if no candidate is in ``model``, ``choice1`` is
        returned.
    """
    choices = [choice1, choice2, choice3, choice4]
    # Candidates missing from the model are scored -inf rather than 0 so they
    # can never outrank an in-vocabulary candidate whose similarity is negative.
    similarity = [
        model.similarity(guess, choice) if choice in model else float('-inf')
        for choice in choices
    ]
    return choices[similarity.index(max(similarity))]

def random_guess(choice1, choice2, choice3, choice4):
    """Return one of the four candidate words, picked at random."""
    options = (choice1, choice2, choice3, choice4)
    # randrange(4) draws an index from 0..3, the same range as randint(0, 3).
    picked_index = random.randrange(len(options))
    return options[picked_index]

def create_model_details(model_name):
    """Create (or truncate) a per-model details CSV and write its header row.

    Args:
        model_name: Path of the CSV file to create.
    """
    # newline='' is what the csv module expects; without it the writer's
    # '\r\n' row terminator is translated again on Windows, leaving a blank
    # line after every row.
    with open(model_name, 'w', newline='') as f:
        csv.writer(f).writerow(['question', 'answer', 'guess', 'label'])

def test_model(model_name, model, questions, answers, choice1, choice2, choice3, choice4):
    correct_guesses = 0
    random_guesses = 0

    for i in range(len(questions)):
        question = questions[i]
        answer = answers[i]
        
        if question in model:
            guess = model.most_similar(question)[0][0]
            choice = select_answer(model, guess, choice1[i], choice2[i], choice3[i], choice4[i])
            if choice == answer:
                correct_guesses += 1
                label = 'correct'
            else:
                label = 'wrong'
        else: 
            choice = random_guess(choice1[i], choice2[i], choice3[i], choice4[i])
            random_guesses += 1
            label = 'guess'
        
        with open(model_name, 'a') as f:
            writer = csv.writer(f)
            row = [question, answer, choice, label]
            writer.writerow(row)
    return correct_guesses, random_guesses

def write_model_analysis(model_name, size_of_vocab, number_of_correct, number_of_non_random_guess, accuracy):
    """Append one model's summary row to ``analysis.csv``.

    Args:
        model_name: Label written in the first column.
        size_of_vocab: Vocabulary size of the model.
        number_of_correct: Number of correctly answered questions.
        number_of_non_random_guess: Number of questions answered without a
            random guess.
        accuracy: Accuracy value to record.
    """
    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows.
    with open('analysis.csv', 'a', newline='') as f:
        csv.writer(f).writerow(
            [model_name, size_of_vocab, number_of_correct, number_of_non_random_guess, accuracy]
        )

# Analysis
# Start a fresh analysis.csv containing only the header row. Re-running this
# cell discards any rows written by earlier runs.
# newline='' is what the csv module expects; without it the header is followed
# by a blank line on Windows.
with open('analysis.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    field = ['model_name', 'size_of_vocab', 'number_of_correct', 'number_of_non_random_guess', 'accuracy']
    writer.writerow(field)

### Task 1 - Evaluation of the word2vec-google-news-300 Pre-trained Model

In [71]:
# Word2Vec Google News model and the CSV that receives its per-question results
file_name_0 = 'word2vec-google-news-300.csv'
wv_google = api.load('word2vec-google-news-300')

In [72]:
# Load the synonym dataset and split it into parallel lists:
# question word, expected answer, and the four candidate columns '0'..'3'.
df = pd.read_csv('synonym.csv')
questions, answers = (df[column].tolist() for column in ('question', 'answer'))

choice1, choice2, choice3, choice4 = (
    df[column].tolist() for column in ('0', '1', '2', '3')
)

#### 1. Guessing

In [73]:
# Reset the details CSV, then let the Google News model answer every question
create_model_details(file_name_0)
correct_guesses_google300, random_guesses_google300 = test_model(
    file_name_0,
    wv_google,
    questions,
    answers,
    choice1,
    choice2,
    choice3,
    choice4,
)

#### 2. Analysis

In [74]:
# Word2Vec google news 300 model: summary row for analysis.csv
model_name_google300 = 'word2vec-google-news-300'
size_of_vocab_google300 = len(wv_google.key_to_index)
number_of_correct_google300 = correct_guesses_google300
number_of_non_random_guess = len(questions) - random_guesses_google300
# Guard the division: if every question had to be guessed at random there is
# nothing to score, so record 0.0 instead of raising ZeroDivisionError.
accuracy_google300 = (
    correct_guesses_google300 / number_of_non_random_guess
    if number_of_non_random_guess
    else 0.0
)

write_model_analysis(model_name_google300, size_of_vocab_google300, number_of_correct_google300, number_of_non_random_guess, accuracy_google300)

### Task 2 - Comparison with other pre-trained models

#### 1. 2 new models from different corpora but same embedding size

glove-twitter-200 <br>
glove-wiki-gigaword-200

In [75]:
# CSV files that receive the per-question results of the two GloVe models
file_name_1 = 'glove-twitter-200.csv'
file_name_2 = 'glove-wiki-gigaword-200.csv'

# Pre-trained GloVe vectors from two different corpora
glove_twitter200 = api.load('glove-twitter-200')
glove_wiki200 = api.load('glove-wiki-gigaword-200')

In [76]:
# Glove twitter 200 model guessing and analytics
create_model_details(file_name_1)
correct_guesses_twitter200, random_guesses_twitter200 = test_model(file_name_1, glove_twitter200, questions, answers, choice1, choice2, choice3, choice4)

model_name_twitter200 = 'glove-twitter-200'
size_of_vocab_twitter200 = len(glove_twitter200.key_to_index)
number_of_correct_twitter200 = correct_guesses_twitter200
number_of_non_random_guess_twitter200 = len(questions) - random_guesses_twitter200
# Guard the division: if every question had to be guessed at random there is
# nothing to score, so record 0.0 instead of raising ZeroDivisionError.
accuracy_twitter200 = (
    correct_guesses_twitter200 / number_of_non_random_guess_twitter200
    if number_of_non_random_guess_twitter200
    else 0.0
)

write_model_analysis(model_name_twitter200, size_of_vocab_twitter200, number_of_correct_twitter200, number_of_non_random_guess_twitter200, accuracy_twitter200)

In [77]:
# Glove wiki 200 model guessing and analytics
create_model_details(file_name_2)
correct_guesses_wiki200, random_guesses_wiki200 = test_model(file_name_2, glove_wiki200, questions, answers, choice1, choice2, choice3, choice4)

# Use the same identifier the model was loaded under, so this row is labelled
# like every other row in analysis.csv.
model_name_wiki200 = 'glove-wiki-gigaword-200'
size_of_vocab_wiki200 = len(glove_wiki200.key_to_index)
number_of_correct_wiki200 = correct_guesses_wiki200
number_of_non_random_guess_wiki200 = len(questions) - random_guesses_wiki200
# Guard the division: if every question had to be guessed at random there is
# nothing to score, so record 0.0 instead of raising ZeroDivisionError.
accuracy_wiki200 = (
    correct_guesses_wiki200 / number_of_non_random_guess_wiki200
    if number_of_non_random_guess_wiki200
    else 0.0
)

write_model_analysis(model_name_wiki200, size_of_vocab_wiki200, number_of_correct_wiki200, number_of_non_random_guess_wiki200, accuracy_wiki200)

#### 2. 2 new models with different embedding size but same corpus

glove-twitter-50 <br>
glove-twitter-100

In [78]:
# CSV files that receive the per-question results of the two Twitter models
file_name_3 = 'glove-twitter-50.csv'
file_name_4 = 'glove-twitter-100.csv'

# Pre-trained GloVe Twitter vectors in two embedding sizes
glove_twitter50 = api.load('glove-twitter-50')
glove_twitter100 = api.load('glove-twitter-100')

In [79]:
# Glove twitter 50 model guessing and analytics
create_model_details(file_name_3)
correct_guesses_twitter50, random_guesses_twitter50 = test_model(file_name_3, glove_twitter50, questions, answers, choice1, choice2, choice3, choice4)

model_name_twitter50 = 'glove-twitter-50'
size_of_vocab_twitter50 = len(glove_twitter50.key_to_index)
number_of_correct_twitter50 = correct_guesses_twitter50
number_of_non_random_guess_twitter50 = len(questions) - random_guesses_twitter50
# Guard the division: if every question had to be guessed at random there is
# nothing to score, so record 0.0 instead of raising ZeroDivisionError.
accuracy_twitter50 = (
    correct_guesses_twitter50 / number_of_non_random_guess_twitter50
    if number_of_non_random_guess_twitter50
    else 0.0
)

write_model_analysis(model_name_twitter50, size_of_vocab_twitter50, number_of_correct_twitter50, number_of_non_random_guess_twitter50, accuracy_twitter50)

# Glove twitter 100 model guessing and analytics
create_model_details(file_name_4)
correct_guesses_twitter100, random_guesses_twitter100 = test_model(file_name_4, glove_twitter100, questions, answers, choice1, choice2, choice3, choice4)

model_name_twitter100 = 'glove-twitter-100'
size_of_vocab_twitter100 = len(glove_twitter100.key_to_index)
number_of_correct_twitter100 = correct_guesses_twitter100
number_of_non_random_guess_twitter100 = len(questions) - random_guesses_twitter100
# Same zero-denominator guard as for the 50-dimensional model.
accuracy_twitter100 = (
    correct_guesses_twitter100 / number_of_non_random_guess_twitter100
    if number_of_non_random_guess_twitter100
    else 0.0
)

write_model_analysis(model_name_twitter100, size_of_vocab_twitter100, number_of_correct_twitter100, number_of_non_random_guess_twitter100, accuracy_twitter100)

### Task 3 - Train own models

In [80]:
nltk.download('punkt')

# Plain-text books that make up the training corpus
books = [
    'book1_theakkracase.txt',
    'book2_aliceinwonderland.txt',
    'book3_thepictureofdoriangray.txt',
    'book4_theadventuresofsherlockholmes.txt',
    'book5_thegreatgatsby.txt',
    'book6_modestproposal.txt',
    'book7_metamorphosis.txt',
]

# Maps each book file name to its sentences, each sentence a list of lowercased tokens
book_sentences = {}
for book in books:
    with open(book, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences = sent_tokenize(text)
    book_sentences[book] = [word_tokenize(line.lower()) for line in sentences]

# One flat list of tokenized sentences across all books
flat_list = [tokens for per_book in book_sentences.values() for tokens in per_book]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rck20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Different window size 5 and 10 (embedding size fixed at 100)
model_5_100 = Word2Vec(flat_list, window=5, vector_size=100, workers=4)
model_10_100 = Word2Vec(flat_list, window=10, vector_size=100, workers=4)

file_name_5_100 = 'own_corpus_5_100.csv'
file_name_10_100 = 'own_corpus_10_100.csv'

# Own corpus window size 5 and 10 model guessing and analytics
# Corpus 5 100
create_model_details(file_name_5_100)
correct_guesses_5_100, random_guesses_5_100 = test_model(file_name_5_100, model_5_100.wv, questions, answers, choice1, choice2, choice3, choice4)

model_name_5_100 = 'own_corpus_5_100'
size_of_vocab_5_100 = len(model_5_100.wv.key_to_index)
number_of_correct_5_100 = correct_guesses_5_100
number_of_non_random_guess_5_100 = len(questions) - random_guesses_5_100
# Guard the division: if every question word is missing from the trained
# vocabulary, all answers are random guesses and the denominator is 0;
# record 0.0 instead of raising ZeroDivisionError.
accuracy_5_100 = (
    correct_guesses_5_100 / number_of_non_random_guess_5_100
    if number_of_non_random_guess_5_100
    else 0.0
)

write_model_analysis(model_name_5_100, size_of_vocab_5_100, number_of_correct_5_100, number_of_non_random_guess_5_100, accuracy_5_100)

#  Corpus 10 100
create_model_details(file_name_10_100)
correct_guesses_10_100, random_guesses_10_100 = test_model(file_name_10_100, model_10_100.wv, questions, answers, choice1, choice2, choice3, choice4)

model_name_10_100 = 'own_corpus_10_100'
size_of_vocab_10_100 = len(model_10_100.wv.key_to_index)
number_of_correct_10_100 = correct_guesses_10_100
number_of_non_random_guess_10_100 = len(questions) - random_guesses_10_100
# Same zero-denominator guard as for the window-5 model.
accuracy_10_100 = (
    correct_guesses_10_100 / number_of_non_random_guess_10_100
    if number_of_non_random_guess_10_100
    else 0.0
)

write_model_analysis(model_name_10_100, size_of_vocab_10_100, number_of_correct_10_100, number_of_non_random_guess_10_100, accuracy_10_100)


In [None]:
# Different embedding size 300 and 200 (window fixed at 10)
model_10_300 = Word2Vec(flat_list, window=10, vector_size=300, workers=4)
model_10_200 = Word2Vec(flat_list, window=10, vector_size=200, workers=4)

file_name_10_300 = 'own_corpus_10_300.csv'
file_name_10_200 = 'own_corpus_10_200.csv'

# Own corpus embedding size 300 and 200 model guessing and analytics
# Corpus 10 300
create_model_details(file_name_10_300)
correct_guesses_10_300, random_guesses_10_300 = test_model(file_name_10_300, model_10_300.wv, questions, answers, choice1, choice2, choice3, choice4)

model_name_10_300 = 'own_corpus_10_300'
size_of_vocab_10_300 = len(model_10_300.wv.key_to_index)
number_of_correct_10_300 = correct_guesses_10_300
number_of_non_random_guess_10_300 = len(questions) - random_guesses_10_300
# Guard the division: if every question word is missing from the trained
# vocabulary, all answers are random guesses and the denominator is 0;
# record 0.0 instead of raising ZeroDivisionError.
accuracy_10_300 = (
    correct_guesses_10_300 / number_of_non_random_guess_10_300
    if number_of_non_random_guess_10_300
    else 0.0
)

write_model_analysis(model_name_10_300, size_of_vocab_10_300, number_of_correct_10_300, number_of_non_random_guess_10_300, accuracy_10_300)

# Corpus 10 200
create_model_details(file_name_10_200)
correct_guesses_10_200, random_guesses_10_200 = test_model(file_name_10_200, model_10_200.wv, questions, answers, choice1, choice2, choice3, choice4)

model_name_10_200 = 'own_corpus_10_200'
size_of_vocab_10_200 = len(model_10_200.wv.key_to_index)
number_of_correct_10_200 = correct_guesses_10_200
number_of_non_random_guess_10_200 = len(questions) - random_guesses_10_200
# Same zero-denominator guard as for the 300-dimensional model.
accuracy_10_200 = (
    correct_guesses_10_200 / number_of_non_random_guess_10_200
    if number_of_non_random_guess_10_200
    else 0.0
)

write_model_analysis(model_name_10_200, size_of_vocab_10_200, number_of_correct_10_200, number_of_non_random_guess_10_200, accuracy_10_200)