In [1]:
# import stuff and download stuff
from google.colab import drive
drive.mount('/content/drive')
from gensim import models
from gensim.models import KeyedVectors
import nltk.corpus
nltk.download('punkt_tab')
nltk.download('inaugural')
nltk.download('gutenberg')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# get Bible text
text = nltk.corpus.gutenberg.raw(nltk.corpus.gutenberg.fileids()[3])

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

import string
from nltk.tokenize import sent_tokenize, word_tokenize

# split text into phrases
def get_phrases(text):
    words = nltk.tokenize.word_tokenize(text)
    phrases = {}
    current_phrase = []
    for word in words:
        if (word in stop_words or word in string.punctuation):
            if (len(current_phrase) > 1):
                phrases[" ".join(current_phrase)] = "_".join(current_phrase)
                current_phrase = []
        else:
            current_phrase.append(word)

    if (len(current_phrase) > 1):
        phrases[" ".join(current_phrase)] = "_".join(current_phrase)
    return phrases

# replae phrases with vocab
def replace_phrases(phrases_dict, text):
    for phrase in phrases_dict.keys():
        text = text.replace(phrase, phrases_dict[phrase])
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# clean + tokenize data
import regex as re

text = re.sub(r'\d+', '', text)
phrases = get_phrases(text)
text = replace_phrases(phrases, text)
sentences = sent_tokenize(text)
words = [word_tokenize(sentence.lower()) for sentence in sentences]
model = models.Word2Vec(words, min_count=1)

In [5]:
model.train(words, total_examples=model.corpus_count, epochs=400)
model.save("/content/drive/MyDrive/Year 1/Fall Semester/4 Computing in Linguistics/Projects/word2vec_model.model")



In [6]:
question_words = "/content/drive/MyDrive/Year 1/Fall Semester/4 Computing in Linguistics/Projects/question_words.txt"
score, sections = model.wv.evaluate_word_analogies(question_words)
print(score)
# print(sections[-1])
# not very accurate! makes sense as the Bible does not cover a lot of English vocab, so it is bad at general analogies

0.04973474801061008


In [7]:
google_model = KeyedVectors.load_word2vec_format("/content/drive/MyDrive/Year 1/Fall Semester/4 Computing in Linguistics/Projects/GoogleNews-vectors-negative300.bin.gz", binary=True, limit=20000)

In [8]:
words = ["man", "woman", "father", "mother", "life", "faith", "fruit"]

for word in words:
  model_similar_words = model.wv.most_similar(word, topn=10)
  google_similar_words = google_model.most_similar(word, topn=10)
  print(word,"\n    Model:\n\t", model_similar_words, "\n    Google:\n\t", google_similar_words, "\n")

man 
    Model:
	 [('woman', 0.5005379915237427), ('he', 0.45827215909957886), ('prophet', 0.4581497013568878), ('a_man', 0.4545409679412842), ('man_shall', 0.4482245445251465), ('thing', 0.44818395376205444), ('him', 0.43323591351509094), ('murderer', 0.41519659757614136), ('truth_i_perceive', 0.41234090924263), ('man_also', 0.411765992641449)] 
    Google:
	 [('woman', 0.7664012908935547), ('boy', 0.6824871301651001), ('teenager', 0.6586930155754089), ('girl', 0.5921714305877686), ('robber', 0.5585119128227234), ('men', 0.5489763021469116), ('guy', 0.5420035123825073), ('person', 0.5342026352882385), ('gentleman', 0.5337990522384644), ('Man', 0.5316051244735718)] 

woman 
    Model:
	 [('child', 0.5246297121047974), ('man', 0.5005380511283875), ('maid', 0.47472789883613586), ('she', 0.45932668447494507), ('kid', 0.4375181794166565), ('herself', 0.4315888285636902), ('fetched_thence', 0.4273068904876709), ('husband', 0.4155438542366028), ('man_child', 0.40369293093681335), ('girdle', 

In [9]:
words = [["man", "woman"], ["woman", "man"], ["father", "mother"], ["mother", "father"], ["life", "faith"]]

for pair in words:
  model_similar_words = model.wv.most_similar(positive=pair[0], negative=pair[1], topn=10)
  google_similar_words = google_model.most_similar(positive=pair[0], negative=pair[1], topn=10)
  print("Positive:", pair[0], "  Negative:", pair[1], "\n    Model:\n\t", model_similar_words, "\n    Google:\n\t", google_similar_words, "\n")

Positive: man   Negative: woman 
    Model:
	 [('man_speaking', 0.4276359975337982), ('man_deceive', 0.406723290681839), ('private', 0.3974769413471222), ('work_therein', 0.3845498561859131), ('god_accepteth', 0.3683933615684509), ('doubt', 0.3655301332473755), ('man_think', 0.36264100670814514), ('wherefore_i_give', 0.357055127620697), ('more', 0.3472147285938263), ('evil_shall_come_upon', 0.3449440002441406)] 
    Google:
	 [('guy', 0.324423611164093), ('Man', 0.2959253191947937), ('Uncle', 0.29345017671585083), ('beard', 0.2869175970554352), ('mastermind', 0.27936387062072754), ('buddy', 0.27635231614112854), ('younger_brother', 0.2746973931789398), ('guys', 0.2735731899738312), ('genius', 0.27022987604141235), ('lad', 0.2643877863883972)] 

Positive: woman   Negative: man 
    Model:
	 [('yellow_gold', 0.4434618353843689), ('weavest', 0.4376771152019501), ('god_doth_talk', 0.40756234526634216), ('twofold', 0.4067041277885437), ('servant_took_rebekah', 0.4044879376888275), ('hath_wa