In [7]:
import glob
import os
import pickle
import random
import re
from collections import Counter

import numpy as np
from nltk.corpus import stopwords
from tqdm.auto import tqdm
# English stop words from the NLTK corpus, held in a set for fast membership
# tests; get_all_words() drops every token found here.
stopWords = set(stopwords.words('english'))

## Create vocab and co-occurrence matrix

In [3]:
# Punctuation handling used by clean_text(), grouped by what happens to each character:
#   'sep'    - replaced by a single space (acts as a word separator)
#   'keep'   - kept, but padded with a space on both sides
#   'remove' - deleted outright
PUNCTUATION = {
    'sep'   : u'\u200b' + "/-'´′‘…—−–",
    'keep'  : "&",
    'remove': '?!.,，"#$%\'()*+-/:;<=>@[\\]^_`{|}~“”’™•°'
}


def clean_text(x):
    """Lower-case ``x`` and normalise its punctuation according to PUNCTUATION.

    Args:
        x: the text to clean.

    Returns:
        The lower-cased text with 'sep' characters turned into spaces,
        'keep' characters surrounded by spaces and 'remove' characters dropped.
    """
    # Build one translation table. Groups are inserted from lowest to highest
    # priority so that a character listed in several groups ends up with the
    # action of the group that is applied first: sep, then keep, then remove.
    table = {}
    table.update((ord(ch), None) for ch in PUNCTUATION['remove'])
    table.update((ord(ch), " %s " % ch) for ch in PUNCTUATION['keep'])
    table.update((ord(ch), " ") for ch in PUNCTUATION['sep'])

    return x.lower().translate(table)

In [8]:
def get_all_words(data_source: str) -> list:
    """Collect every non-stop-word token from a set of text files.

    Args:
        data_source: either a directory, which is searched recursively for
            ``*.txt`` files (processed in sorted path order), or a regular
            file that lists one file path per line. Blank lines in such a
            listing file are ignored.

    Returns:
        A flat list of tokens in reading order. Each line is stripped,
        lower-cased, passed through ``clean_text`` and split on whitespace;
        a token is kept when it starts with a word character and is not in
        ``stopWords``.
    """
    if os.path.isdir(data_source):
        listing = sorted(glob.glob('%s/**/*.txt' % data_source, recursive=True))
    else:  # regular file
        with open(data_source) as listing_file:
            stripped = (line.strip() for line in listing_file)
            # An empty entry would otherwise become open("") and raise.
            listing = [path for path in stripped if path]

    # Compiled once; .match() anchors at the start of the token only.
    starts_with_word_char = re.compile(r'[\w]+').match

    all_words = []

    for file in tqdm(listing, desc="Get all words"):
        # TODO: use something smarter (spacy / nltk)
        with open(file, "rt") as text_file:  # closed as soon as the file is consumed
            for line in text_file:
                all_words.extend(
                    w for w in clean_text(line.strip().lower()).split()
                    if starts_with_word_char(w) and w not in stopWords
                )

    return all_words


def create_vocab_counter(words):
    """Count how many times each word occurs.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each token to its number of occurrences.
    """
    # Counter consumes the tqdm-wrapped iterable itself, so the progress bar
    # still advances once per word.
    return Counter(tqdm(words))

# ---

# Corpus location and model parameters
data_source = '../corpus/python-3.7.3-docs-text/'

vocab_size = 20000  # keep at most this many of the most frequent words
window_size = 5     # number of context words counted on EACH side of a word

all_words = get_all_words(data_source)
print("len(all_words) = %d" % len(all_words))

# most_common(n) returns at most n (word, count) pairs, most frequent first.
top_words, top_freqs = zip(*create_vocab_counter(all_words).most_common(vocab_size))

# word -> row/column index in the co-occurrence matrix
word2idx = {w: i for i, w in enumerate(top_words)}

# NOTE(review): uint16 wraps around silently above 65535; a wider dtype would
# be safer but doubles the memory of this vocab_size x vocab_size matrix.
co_occur_mat = np.zeros((vocab_size, vocab_size), dtype=np.uint16)

top_words_set = set(top_words)

# Resolve each token to its matrix index once; None marks a token outside the
# vocabulary. (Index 0 is valid, so always compare against None.)
word_indices = [word2idx.get(w) for w in all_words]
n_words = len(all_words)

for i in tqdm(range(n_words), desc='Constructing co-occurrence matrix'):
    row = word_indices[i]
    if row is None: continue

    # window-search: range() excludes its upper bound, hence the "+ 1" so the
    # window reaches window_size words to the right as well as to the left.
    for j in range(max(i - window_size, 0), min(i + window_size + 1, n_words)):
        col = word_indices[j]
        if i == j or col is None: continue
        co_occur_mat[row, col] += 1
# ---

HBox(children=(IntProgress(value=0, description='Get all words', max=474, style=ProgressStyle(description_widt…


len(all_words) = 785676


HBox(children=(IntProgress(value=0, max=785676), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Constructing co-occurrence matrix', max=785676, style=Progres…




In [15]:
# NOTE(review): absolute, machine-specific path - this cell only runs on the
# original author's machine; consider a path relative to the notebook.
# NOTE: pickle.load can execute arbitrary code, only load files you trust.
with open("/home/alex/Desktop/pydoc-window-5-size-20000.vocab", "rb") as vocab_fh:
    v = pickle.load(vocab_fh)  # handle is closed when the block exits

# Ten most frequent words with their counts (displayed as the cell output)
v.most_common(10)

[('python', 7367),
 ('bpo', 6578),
 ('object', 5628),
 ('module', 5437),
 ('return', 5330),
 ('new', 4751),
 ('function', 4750),
 ('file', 4579),
 ('value', 4395),
 ('class', 4367)]

## Testing

In [None]:
# Pickled pre-trained embeddings (unpickled into `glove_emb` / `orig_glove_emb` below)
pt_emb_file = '../embeddings/glove.840B.300d.txt.pickle'
# Pickled fine-tuned embeddings, indexed by the positions stored in the vocab file
ft_emb_file = '../embeddings/pydoc-glove-fine-tuned-vocab-20000-window-5-iter-5000'
# Pickled word -> index mapping for the fine-tuned embeddings
vocab_file = '../embeddings/pydoc-vocab-20000-window-5.vocab'
# Pickled matrix; presumably the co-occurrence counts built earlier - TODO confirm
mat_file = '../embeddings/pydoc-vocab-20000-window-5.mat'

# Weights applied to the fine-tuned (ft) and pre-trained (pt) vector of a word
# when both exist; see the mixing loop below.
ft_factor = 0.7
pt_factor = 0.3

In [None]:
def _load_pickle(path):
    """Unpickle the file at ``path`` and close the file handle straight away."""
    # NOTE: pickle.load can execute arbitrary code, only load files you trust.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# The pre-trained embeddings are loaded twice on purpose: `glove_emb` is
# overwritten with the mixed vectors below, `orig_glove_emb` stays untouched
# so the two can be compared afterwards.
glove_emb = _load_pickle(pt_emb_file)
orig_glove_emb = _load_pickle(pt_emb_file)

ft_glove_emb_arr = _load_pickle(ft_emb_file)
vocab = _load_pickle(vocab_file)
mat = _load_pickle(mat_file)

# `vocab` maps word -> index into the fine-tuned embedding array
ft_glove_emb = {w: ft_glove_emb_arr[i] for w, i in vocab.items()}

x = 0  # number of fine-tuned words that have no pre-trained vector
for w in tqdm(ft_glove_emb, desc="Mixing embeddings (ft %.2f, pt %.2f)" % (ft_factor, pt_factor)):
    if w not in glove_emb:
        # No pre-trained vector to blend with: take the fine-tuned one as is.
        glove_emb[w] = ft_glove_emb[w]
        x += 1
    else:
        # Weighted sum of the fine-tuned and the pre-trained vector.
        glove_emb[w] = ft_factor * ft_glove_emb[w] + pt_factor * glove_emb[w]


# Fraction of the fine-tuned vocabulary that was missing from the pre-trained embeddings
print(x/len(ft_glove_emb))

In [None]:
def sim(x, y):
    """Return the dot product of two embedding vectors.

    The norm division is commented out, so this is the raw dot product
    rather than the cosine similarity.
    """
    return np.dot(x, y)  # / (np.linalg.norm(x) * np.linalg.norm(y))


w1, w2 = 'return', 'function'

# Mixed embeddings first, then the untouched pre-trained ones for comparison
for embeddings in (glove_emb, orig_glove_emb):
    print(sim(embeddings[w1], embeddings[w2]))

# np.log(mat[vocab[w1], vocab[w2]]) - np.dot(glove_emb[w1], glove_emb[w2])