In [1]:
import spacy

# Load the 'en' pipeline with the parser, tagger and NER components disabled.
disabled_components = ['parser', 'tagger', 'ner']
nlp = spacy.load('en', disable=disabled_components)

# Add the 'sentencizer' component to the pipeline; later cells iterate
# `.sents` on the processed documents.
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

In [2]:
import pandas as pd
import pickle
# Read the beer dataset CSV into a DataFrame.
beer_csv_path = '../../beer_data/beer_ds.csv'
ds = pd.read_csv(beer_csv_path)

In [3]:
# The 'review/text' column; every entry is run through the spaCy pipeline
# in the sentence-splitting cell.
texts = ds.loc[:, 'review/text']

In [4]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK's English stop-word list and a WordNet lemmatizer; both are used by
# the token clean-up loop.
stop = stopwords.words('english')
lmtzr = WordNetLemmatizer()

In [5]:
import re

# Collapse every run of whitespace in a review to a single space, run the
# spaCy pipeline on the result, and keep the sentence spans per review.
whitespace_run = re.compile(r"\s+")
sents = []
for i, text in enumerate(texts):
    doc = nlp(whitespace_run.sub(' ', text))
    # `doc.sents` is a one-shot generator.  Materialise it as a list so the
    # cell that consumes `sents` can be re-run without silently producing
    # empty reviews from already-exhausted generators.
    sents.append(list(doc.sents))
    # Progress indicator every 5000 reviews.
    if i % 5000 == 0:
        print(i)

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000


In [6]:
# Turn each review's sentences into lists of cleaned tokens:
# lowercase -> delete every non-alphanumeric character -> drop empty strings
# -> drop stop words -> lemmatize.
non_alnum = re.compile('[^a-zA-Z0-9]')
# `stop` is a list; build a set once so the per-token membership test is a
# hash lookup instead of a linear scan of the whole stop-word list.
stop_set = set(stop)
breaks = []
for i, s in enumerate(sents):
    changed = []
    for g in s:
        # No .strip() needed: whitespace is non-alphanumeric and is already
        # removed by the substitution.
        tokens = [non_alnum.sub('', x.text.lower()) for x in g]
        # Stop words are matched against the punctuation-stripped form,
        # exactly as before.
        tokens = [lmtzr.lemmatize(x) for x in tokens if x and x not in stop_set]
        changed.append(tokens)
    breaks.append(changed)
    # Progress indicator every 5000 reviews.
    if i % 5000 == 0:
        print(i)

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000


In [28]:
# Keep only the entries of `breaks` whose length is between 3 and 100
# inclusive.  Each entry of `breaks` is one review's list of sentences, so
# despite the variable name this is a filter on sentence count.
lowercase_sents = [review for review in breaks if 2 < len(review) <= 100]

In [7]:
from collections import defaultdict
# Count how often each cleaned token occurs across every sentence of every
# review in the training data.
training_data = breaks
vocab = defaultdict(int)
all_tokens = (
    token
    for review in training_data
    for sentence in review
    for token in sentence
)
for token in all_tokens:
    vocab[token] += 1

In [32]:
# Discard tokens containing any digit, rank the remainder by descending
# count (ties keep their original order), and keep the first 10000.
filter_vocab = {}
for token, count in vocab.items():
    if any(ch.isdigit() for ch in token):
        continue
    filter_vocab[token] = count
sorted_vocab = sorted(filter_vocab.items(), key=lambda pair: pair[1], reverse=True)
top10k = sorted_vocab[:10000]
vocab_10k = dict(top10k)

In [34]:
# Token -> integer index.  The first three indices are reserved for special
# entries; the encoding loops use 1 ('<unk>') for out-of-vocabulary tokens
# and 2 ('<qqq>') for tokens containing a digit.
# '[0]' at index 0 is presumably padding -- TODO confirm.
word2idx = {'[0]': 0, '<unk>': 1, '<qqq>': 2}
first_free = len(word2idx)
for offset, word in enumerate(vocab_10k):
    word2idx[word] = first_free + offset
# Next unused index, kept under the same name the original loop left behind.
idx = first_free + len(vocab_10k)

In [35]:
# Encode the corpus as a flat list of sentences, each a list of vocabulary
# indices.
#
# `training_data` is nested (review -> sentence -> token).  Iterating it as
# if every element were a sentence makes `word` a list of tokens, and
# `word in word2idx` then raises TypeError (unhashable list).  Walk the
# sentences inside each review instead.
unk_idx = word2idx['<unk>']      # out-of-vocabulary tokens
digit_idx = word2idx['<qqq>']    # tokens containing at least one digit
idx_train = []
sentence_count = 0
for doc in training_data:
    for line in doc:
        nline = []
        for word in line:
            if any(char.isdigit() for char in word):
                nline.append(digit_idx)
            else:
                nline.append(word2idx.get(word, unk_idx))
        idx_train.append(nline)
        # Progress indicator every 100000 sentences.
        if sentence_count % 100000 == 0:
            print(sentence_count)
        sentence_count += 1

0
100000
200000
300000
400000
500000
600000
700000
800000


In [36]:
# Inverse lookup: integer index -> token.
idx2word = dict((index, token) for token, index in word2idx.items())

In [37]:
# Bundle both vocabulary mappings and the index-encoded corpus in one dict,
# which is pickled further down.
data = dict(word2idx=word2idx, idx2word=idx2word, training=idx_train)

In [38]:
from gensim.models import Word2Vec
# Load the saved gensim embedding model from disk.
embedding_model_path = '../../beer_data/embedding_full.mod'
model = Word2Vec.load(embedding_model_path)

In [39]:
import numpy as np
# Zero-initialised matrix with one 200-wide row per vocabulary entry; the
# rows are filled in by the following cell.
vocab_size = len(data['word2idx'])
embeddings = np.zeros(shape=(vocab_size, 200))

In [45]:
# Fill every vocabulary row of `embeddings`: copy the model's vector when
# the model knows the word, otherwise draw a standard-normal random vector.
#
# A locally seeded generator makes the random rows -- and therefore the
# pickled matrix -- identical on every run, and keeps this cell idempotent.
embedding_rng = np.random.RandomState(0)
# Take the width from the matrix itself rather than repeating the literal.
embedding_dim = embeddings.shape[1]
for word, row in data['word2idx'].items():
    if word in model.vocab:
        embeddings[row, :] = model[word]
    else:
        embeddings[row, :] = embedding_rng.randn(embedding_dim)

In [46]:
# Attach the embedding matrix and persist the whole bundle.
data['embeddings'] = embeddings
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() left the handle to the garbage collector.
with open('data_beer_lemm_90K.p', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

In [48]:
# Sanity check: every word known to the model must have exactly the model's
# vector stored in its row of `embeddings`.
for token, row in data['word2idx'].items():
    if token not in model.vocab:
        continue
    assert all(embeddings[row] == list(model[token]))

In [8]:
# Reload the bundle written earlier by this notebook.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles you produced yourself.
# The context manager closes the file handle deterministically.
with open('data_beer_lemm_90K.p', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [11]:
# Encode the corpus while keeping the review -> sentence -> token nesting:
# tokens containing a digit map to 2, known tokens to their vocabulary
# index, and everything else to 1.
word2idx = data['word2idx']
idx_train = []
for i, doc in enumerate(breaks):
    encoded_doc = [
        [
            2 if any(ch.isdigit() for ch in token) else word2idx.get(token, 1)
            for token in sentence
        ]
        for sentence in doc
    ]
    idx_train.append(encoded_doc)
    # Progress indicator every 100000 reviews.
    if i % 100000 == 0:
        print(i)

0


In [14]:
# Replace the training entry with the per-review nested encoding and save
# the bundle under a separate file name.
data['training'] = idx_train
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() left the handle to the garbage collector.
with open('data_beer_lemm_90K_sep.p', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)