In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import os
from glob import glob
from tqdm import tqdm
import pickle

## Preprocessing

In [3]:
DATA_DIR = "../data/wiki"

In [None]:
txt_files = glob(os.path.join(f"{DATA_DIR}/", "*.txt"))
lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation)) # punctutation
stop_words.update(["''", "``"]) # extra based on corpus

print(txt_files)

for txt in txt_files:
    with open(txt, "r+") as f:
        proc_lines = []
        lines = f.readlines()
        f.seek(0)
        for line in tqdm(lines, total = len(lines)):
            s_line = sent_tokenize(line)
            for l in s_line:
                proc_lines.append(" ".join([lem.lemmatize(w.lower(), 'v') for w in word_tokenize(l) if w not in stop_words]))
        proc_file = "\n".join(proc_lines)
        f.write(proc_file)

In [5]:
txt_files = glob(os.path.join(f"{DATA_DIR}/", "*.txt"))
print(txt_files)

dictionary = {}

for txt in txt_files:
    with open(txt, "r+") as f:
        lines = f.readlines()
        for line in tqdm(lines, total = len(lines)):
            for w in line.split():
                if dictionary.get(w, -1) == -1:
                    dictionary[w] = len(dictionary)
        f.close()

['../data/wiki/articles2.txt', '../data/wiki/articles3.txt', '../data/wiki/articles1.txt']



  0%|                                                                                                                 | 0/5376181 [00:00<?, ?it/s][A
  1%|▊                                                                                                | 48043/5376181 [00:00<00:11, 480397.24it/s][A
  2%|█▋                                                                                               | 96083/5376181 [00:00<00:11, 471456.65it/s][A
  3%|██▌                                                                                             | 143241/5376181 [00:00<00:11, 467642.59it/s][A
  4%|███▍                                                                                            | 190012/5376181 [00:00<00:11, 466170.90it/s][A
  4%|████▏                                                                                           | 237487/5376181 [00:00<00:10, 469226.63it/s][A
  5%|█████                                                                                         

In [6]:
print(len(dictionary))

3519590


In [2]:
MODEL_DIR = "../model"

In [3]:
with open(f"{MODEL_DIR}/dictionary.pth", "rb") as f:
    dictionary = pickle.load(f)

In [4]:
len(dictionary)

3519590

In [6]:
idx_word = {}
for k, v in tqdm(dictionary.items(), total = len(dictionary)):
    idx_word[v] = k

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3519590/3519590 [00:00<00:00, 4753644.93it/s]


In [7]:
len(idx_word)

3519590

In [8]:
with open(f"{MODEL_DIR}/idx_word.pth", "wb") as f:
    pickle.dump(idx_word,f, pickle.DEFAULT_PROTOCOL)

In [5]:
with open(f"{MODEL_DIR}/lines.pth", "rb") as g:
        proc_lines = pickle.load(g)

In [6]:
len(proc_lines)

5376181

In [7]:
proc_lines[4]

[1, 0, 33, 5, 34, 35, 36, 17, 24]

In [4]:
with open(f"{MODEL_DIR}/proc_train.pth", "rb") as g:
        train = pickle.load(g)

In [6]:
train[2]

[19, 21, 6, 22, 24, 26]

In [12]:
with open(f"{MODEL_DIR}/skipgram_data.pth", "wb") as g:
    pickle.dump(skipgram_data, g)

In [5]:
context_size = 2
cbow_data = []
for sent in tqdm(train, total = len(train), desc="Creating Training Data"):
    if len(sent) < (2*context_size + 1):
        continue
    for i in range(context_size, len(sent) - context_size):
        context = []
        for j in range(i-context_size, i+context_size+1):
            if i == j:
                continue
            context.append(sent[j])
        cbow_data.append([context, sent[i]])

with open(f"{MODEL_DIR}/cbow_data.pth", "wb") as g:
    pickle.dump(cbow_data, g)

Creating Training Data: 100%|██████| 5376181/5376181 [01:07<00:00, 79082.58it/s]


## GloVe

In [2]:
MODEL_DIR = "../model"

In [3]:
with open(f"{MODEL_DIR}/proc_train.pth", "rb") as g:
    data = pickle.load(g)

In [5]:
from scipy.sparse import lil_matrix
cooccurrences = lil_matrix((1684982, 1684982),dtype=np.float64)

In [6]:
context_size = 2
for sent in tqdm(data, total = len(data), desc="Filling COO Matrix"):
    for i in range(len(sent)):
        for j in range(max(i-context_size, 0), min(i+context_size+1, len(sent))):
            if i == j:
                continue
            cooccurrences[sent[i], sent[j]] += (1.0/abs(i-j))

Filling COO Matrix: 100%|███████████| 5376181/5376181 [14:40<00:00, 6105.21it/s]


In [3]:
with open(f"{MODEL_DIR}/coo.pth", "rb") as g:
    # pickle.dump(cooccurrences, g)
    coo = pickle.load(g)

EOFError: Ran out of input