In [None]:
!pip install word2vec

In [None]:
!pip install pyjarowinkler

Collecting pyjarowinkler
  Downloading pyjarowinkler-1.8-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8


In [None]:
# imports
import os
import sys
import re
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader
import logging
import gzip
import json
from pyjarowinkler import distance

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Exploring the Data

In [None]:
# reading in the data file
# Project folder on the mounted Drive. The path is relative while Drive is
# mounted at /content/drive, so it resolves only when the working directory
# is /content.
filepath = "drive/MyDrive/Morphology Learning/"
lines = []
with open(os.path.join(filepath, "childes-input.txt")) as f:
  # iterating the handle yields one string per corpus line, newline included
  lines = list(f)

# Pre-Processing the Data

In [None]:
# strip whitespace, newlines, punctuation, set to lowercase
def preprocess_lines(line):
  """Turn one raw corpus line into a list of lowercase word tokens.

  Every character that is neither a word character nor whitespace is deleted
  outright (not replaced by a space), so "don't" becomes "dont". The result is
  then split on runs of whitespace, which also drops the trailing newline.

  Args:
    line: a single line of text.

  Returns:
    A list of token strings; empty when the line holds no word characters.
  """
  without_punctuation = re.sub(r'[^\w\s]', '', line.lower())
  # split() with no argument already ignores leading/trailing whitespace
  return without_punctuation.split()

clean_lines = list(map(preprocess_lines, lines))

In [None]:
# clean_lines[:20]

In [None]:
# get some basic stats on the data
print("Lines: " + str(len(clean_lines)))
# token_count: total number of tokens across all lines
# unique_words: token -> number of occurrences in the corpus
token_count = 0
unique_words = {}
for line in clean_lines:
  token_count += len(line)
  for word in line:
    unique_words[word] = unique_words.get(word, 0) + 1
print("Tokens: " + str(token_count))
print("Unique Tokens: " + str(len(unique_words)))

# dictionary tracking things -> unique_words

Lines: 2631544
Tokens: 11496792
Unique Tokens: 42234


In [None]:
# unique_words

# Selecting Similar Words

In [None]:
# Semantics: learn word embeddings from the cleaned corpus with word2vec
# (any other representation could be substituted here).
# Passing `sentences` to the constructor builds the vocabulary and trains the
# model in one step.
#   sg=1        -> skip-gram rather than CBOW
#   min_count=1 -> keep every token, including those seen only once
# Vector size and epoch count are left at the library defaults (100 dimensions
# and 5 epochs, per the original note); the epoch count can be increased.
# Hyperparameters can be tuned later, and training can be continued by loading
# the saved model and calling train().
# NOTE(review): no seed is passed, so repeated runs may produce different
# vectors -- confirm whether reproducibility matters here.
model = Word2Vec(sentences=clean_lines, sg=1, min_count=1)

In [None]:
# save this model
model.save(os.path.join(filepath, 'base-word2vec-childes.model'))

2021-11-09 23:45:52,206 : INFO : saving Word2Vec object under drive/MyDrive/Morphology Learning/base-word2vec-childes.model, separately None
2021-11-09 23:45:52,208 : INFO : not storing attribute vectors_norm
2021-11-09 23:45:52,210 : INFO : not storing attribute cum_table
2021-11-09 23:45:52,979 : INFO : saved drive/MyDrive/Morphology Learning/base-word2vec-childes.model


In [None]:
# download and load a pretrained model -- 300 dimension word2vec embeddings trained on Google news
# could function as a kind of control??
# google_word2vec_model = gensim.downloader.load('word2vec-google-news-300') 
# fine tune on the sentences from childes (would this even make any difference?)
# google_word2vec_model.train(corpus_iterable=clean_lines)

In [None]:
# load the saved model
model = Word2Vec.load(os.path.join(filepath, 'base-word2vec-childes.model'))

2021-11-09 23:45:53,004 : INFO : loading Word2Vec object from drive/MyDrive/Morphology Learning/base-word2vec-childes.model
2021-11-09 23:45:53,461 : INFO : loading wv recursively from drive/MyDrive/Morphology Learning/base-word2vec-childes.model.wv.* with mmap=None
2021-11-09 23:45:53,463 : INFO : setting ignored attribute vectors_norm to None
2021-11-09 23:45:53,464 : INFO : loading vocabulary recursively from drive/MyDrive/Morphology Learning/base-word2vec-childes.model.vocabulary.* with mmap=None
2021-11-09 23:45:53,472 : INFO : loading trainables recursively from drive/MyDrive/Morphology Learning/base-word2vec-childes.model.trainables.* with mmap=None
2021-11-09 23:45:53,474 : INFO : setting ignored attribute cum_table to None
2021-11-09 23:45:53,476 : INFO : loaded drive/MyDrive/Morphology Learning/base-word2vec-childes.model


In [None]:
# Keep a handle on the model's vector store; all similarity look-ups below go
# through it.
word_vectors = model.wv
# NOTE(review): this writes to the current working directory, whereas the full
# model is saved under `filepath` -- confirm that is intended.
word_vectors.save('childes-word2vec.wordvectors')

2021-11-09 23:45:58,281 : INFO : saving Word2VecKeyedVectors object under childes-word2vec.wordvectors, separately None
2021-11-09 23:45:58,283 : INFO : not storing attribute vectors_norm
2021-11-09 23:45:58,557 : INFO : saved childes-word2vec.wordvectors


In [None]:
# word_vectors['jump']
word_vectors.most_similar('jump', topn=25)

In [None]:
len(model.wv.vocab.keys())

42234

In [None]:
# For every distinct corpus token, collect its 25 nearest neighbours in the
# embedding space: token -> list of (neighbour, similarity score) pairs.
semantic_similarities = {}
# tokens for which the look-up failed because the model has no vector for them
missed = []
# running tally of tokens processed
count = 0
for key in unique_words:
  count += 1
  try:
    semantic_similarities[key] = word_vectors.most_similar(key, topn=25)
  except KeyError:
    # Only an out-of-vocabulary token is an expected failure. Catching just
    # KeyError lets genuine problems (and a manual interrupt of this long
    # loop) surface instead of being recorded as a "miss".
    print("Missed '%s'" % key)
    missed.append(key)

In [None]:
# Write the token -> nearest-neighbour mapping to the project folder as
# indented JSON. JSON has no tuple type, so each (word, score) pair is stored
# as a two-element array.
with open(os.path.join(filepath, 'childes-semantic-similarities.json'), "w") as f:
  json.dump(semantic_similarities, f, indent=4)

In [None]:
semantic_similarities

In [None]:
semantic_similarities["went"]

[('walked', 0.7681492567062378),
 ('came', 0.6991715431213379),
 ('drove', 0.6915683746337891),
 ('took', 0.6682751774787903),
 ('rushed', 0.6550699472427368),
 ('rode', 0.6549469828605652),
 ('ran', 0.6510108709335327),
 ('yesterday', 0.6326640844345093),
 ('traveled', 0.6226305961608887),
 ('go', 0.6135805249214172),
 ('crawled', 0.6061288714408875),
 ('wayland', 0.6043776273727417),
 ('hurried', 0.6028647422790527),
 ('followed', 0.5976724624633789),
 ('fell', 0.5917340517044067),
 ('hiking', 0.586167573928833),
 ('visited', 0.5850282311439514),
 ('wandered', 0.5827513933181763),
 ('chicago', 0.5802407264709473),
 ('sawed', 0.5800257921218872),
 ('spoke', 0.5792049169540405),
 ('saw', 0.5744475722312927),
 ('picadilly', 0.5737274885177612),
 ('jumped', 0.57276451587677),
 ('dashed', 0.5727537870407104)]

In [None]:
# morphology -- Levenshtein distance (should we consider Jaro-Winkler?)
# would only want to do this for words with already similar representations to cut down on the runtime??
# could also redo for all words and see if that outputs anything different... -> totally independent processes...
def levenshtein(token1, token2):
  """Compute the Levenshtein (edit) distance between two sequences.

  Insertions, deletions and substitutions each cost 1. The distance is built
  up in a (len(token1) + 1) x (len(token2) + 1) table whose cell [i, j] holds
  the distance between the first i items of token1 and the first j of token2.

  Args:
    token1: first string.
    token2: second string.

  Returns:
    The edit distance as a numpy float (e.g. 3.0).
  """
  rows, cols = len(token1) + 1, len(token2) + 1
  table = np.zeros((rows, cols))
  # distance from a prefix to the empty string is the prefix length
  table[:, 0] = np.arange(rows)
  table[0, :] = np.arange(cols)
  for i, ch1 in enumerate(token1, start=1):
    for j, ch2 in enumerate(token2, start=1):
      if ch1 == ch2:
        # matching characters cost nothing extra
        table[i, j] = table[i - 1, j - 1]
      else:
        # cheapest of insertion, deletion, substitution, plus one edit
        table[i, j] = 1 + min(table[i, j - 1], table[i - 1, j], table[i - 1, j - 1])
  return table[rows - 1, cols - 1]

def calcDictDistance(word, comp_words, numWords):
    """Return the candidates closest to `word` by Levenshtein distance.

    Args:
        word: reference token.
        comp_words: iterable of candidate tokens. Each is stripped of
            surrounding whitespace before comparison and in the result.
        numWords: maximum number of candidates to return.

    Returns:
        Up to `numWords` candidate tokens ordered by ascending edit distance,
        ties broken alphabetically. The list is shorter than `numWords` when
        fewer candidates are supplied (previously this raised IndexError).
    """
    ranked = []
    for comp_word in comp_words:
        candidate = comp_word.strip()
        # Distances of 10 or more are all treated as 9. The earlier
        # string-keyed sort needed single-digit distances; the cap is retained
        # so rankings for existing inputs are unchanged.
        wordDistance = min(int(levenshtein(word, candidate)), 9)
        # Keep distance and word as a tuple rather than a "d-word" string, so
        # candidates that themselves contain "-" are returned intact.
        ranked.append((wordDistance, candidate))
    ranked.sort()
    return [candidate for _, candidate in ranked[:max(numWords, 0)]]

In [None]:
def calcDictDistanceJaro(word, comp_words, numWords):
  """Return the candidates closest to `word` by Jaro-Winkler similarity.

  The previous version computed the Jaro-Winkler score and then immediately
  overwrote it with the Levenshtein distance, so it returned exactly what
  calcDictDistance returns. This version ranks by the Jaro-Winkler score.

  Args:
    word: reference token.
    comp_words: iterable of candidate tokens. Each is stripped of surrounding
      whitespace before comparison and in the result.
    numWords: maximum number of candidates to return.

  Returns:
    Up to `numWords` candidate tokens ordered from most to least similar,
    ties broken alphabetically. Shorter than `numWords` when fewer candidates
    are supplied.
  """
  ranked = []
  for comp_word in comp_words:
    candidate = comp_word.strip()
    if word and candidate:
      # Despite its name, get_jaro_distance yields a similarity score in
      # [0, 1] where a higher value means the strings are closer.
      similarity = distance.get_jaro_distance(word, candidate, winkler=True, scaling=0.1)
    else:
      # pyjarowinkler rejects empty strings; rank such pairs last
      similarity = 0.0
    # Negate so an ascending sort puts the most similar candidate first while
    # still breaking ties alphabetically. The score stays a float: converting
    # it to int would collapse almost every value to 0.
    ranked.append((-similarity, candidate))
  ranked.sort()
  return [candidate for _, candidate in ranked[:max(numWords, 0)]]

In [None]:
# test the above edit distance code on one example
# semantic_similarities['jump']
similar_words = [x[0] for x in semantic_similarities['jump']]
# print(similar_words)
calcDictDistance("jump", similar_words, 5)

['jumps', 'jumpin', 'hop', 'jumping', 'run']

In [None]:
calcDictDistanceJaro("jump", similar_words, 5)

['jumps', 'jumpin', 'hop', 'jumping', 'run']

In [None]:
# save similarities to file so that we do not need to re-run this repeatedly
# For each token, re-rank its semantic neighbours by surface-form closeness
# and keep the 10 closest, once per string metric:
#   word_form_similarities      -> ranking from calcDictDistance
#   word_form_similarities_jaro -> ranking from calcDictDistanceJaro
word_form_similarities = {}
word_form_similarities_jaro = {}
# running tally of tokens processed
count = 0
for word, neighbours in semantic_similarities.items():
  count += 1
  # drop the similarity scores, keeping only the neighbour words
  similar_words = [entry[0] for entry in neighbours]
  word_form_similarities[word] = calcDictDistance(word, similar_words, 10)
  word_form_similarities_jaro[word] = calcDictDistanceJaro(word, similar_words, 10)

In [None]:
# Write the token -> closest-word-forms mapping produced by calcDictDistance
# to the project folder as indented JSON, so the ranking loop need not be
# re-run.
with open(os.path.join(filepath, 'childes-wordform-similarities.json'), "w") as f:
  json.dump(word_form_similarities, f, indent=4)

In [None]:
# Write the token -> closest-word-forms mapping produced by
# calcDictDistanceJaro to the project folder as indented JSON.
with open(os.path.join(filepath, 'childes-wordform-similarities-jaro.json'), "w") as f:
  json.dump(word_form_similarities_jaro, f, indent=4)

In [None]:
# word_form_similarities

# Applying Some Computational Linguistics Techniques

# Model to Train On Data

In [None]:
# based on the paper sent to me -- used to identify morphologically similar tokens
# model: https://github.com/cbelth/ATP-morphology
