# Next entity recommendation using spaCy noun chunks and an LSTM

Steps:
1. Extract noun chunks using spaCy
2. Filter out the most frequent chunks
3. Cluster dialogues using their chunk vectors and K-means
4. Take the largest cluster and split it into train and test sets
5. Train an LSTM model to generate the next entity (noun chunk)

In [1]:
import numpy as np
import spacy
import torch

from torch.utils.data import DataLoader
from sklearn.cluster import KMeans

from data_tools import Chunker, Vectorizer, SequenceGenerator
from torch_tools import TopicDataset, GModel, train, predict

In [2]:
spacy.prefer_gpu()
# Load the larger "md" model to use spaCy word vectors.
# The package is only downloaded when it is not installed yet, so re-running
# the notebook does not hit the network and reinstall the model every time.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
# instead of assuming CUDA is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data

In [4]:
# One-off download of the dialogue archive into ./data; the cell is kept
# commented out (presumably so the archive is not re-fetched on every run).
# To use it, uncomment the lines below, keeping `%%bash` as the first line.
# %%bash
# """ data """
# wget http://yanran.li/files/ijcnlp_dailydialog.zip
# unzip ijcnlp_dailydialog.zip
# mv ijcnlp_dailydialog data

In [5]:
# Work with the train set only, as we were supposed to filter out topics
# using key words, not topic labels.
#
# The encoding is passed explicitly: the text contains non-ASCII characters
# (e.g. the typographic apostrophe "’"), and the platform default encoding
# is not UTF-8 everywhere.
with open('data/dialogues_text.txt', encoding='utf-8') as file:
    # One list entry per line of the file, trailing newline included.
    dialogues = file.readlines()

In [6]:
# Peek at the first raw line; utterances in it are terminated by `__eou__`.
dialogues[0]

'Can I help you sir , what do you need ? __eou__ I need a packet of cigarettes please . __eou__ Of course sir , no problem . __eou__ Thanks . __eou__\n'

normalizing data

In [7]:
# Build the project's Chunker on top of the loaded spaCy pipeline and its
# default stop-word list, then run it over the first 1000 dialogues only.
chunker = Chunker(
    spacy_model=nlp,
    stop_words=nlp.Defaults.stop_words,
)
chunked_dials = chunker.normalize(dialogues[:1000])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Inspect the chunks produced for a single dialogue (index 90).
chunked_dials[90]

[Ve,
 your computer,
 Ve,
 your personal files,
 ’ m,
 all my important personal documents,
 that computer,
 no laughing matter,
 I ’ m,
 Don ’ t,
 my computer,
 t]

In [9]:
# Take the 10 entries with the highest counts in chunker.counter and hand them
# to chunker.filter_chunks for every dialogue (presumably to drop them — see
# data_tools.Chunker).
most_common = [entry[0] for entry in chunker.counter.most_common(10)]
filtered = [
    chunker.filter_chunks(dial, most_common)
    for dial in chunked_dials
]
# Chunks present in exactly one of the filtered / unfiltered versions of
# dialogue 90.
set(filtered[90]).symmetric_difference(chunked_dials[90])

set()

In [10]:
# Keep only dialogues that have at least two noun chunks (len(doc) > 1),
# i.e. enough for one input chunk plus one target chunk.
# NOTE(review): presumably this matches SequenceGenerator(1) used later in the
# notebook (one previous chunk per prediction) — confirm, and raise the
# threshold if a longer history is used.
filtered = [doc for doc in filtered if len(doc) > 1]

Vectorize

In [11]:
# Vectorize the filtered dialogues with the project's Vectorizer, which is
# constructed with the number of dialogues.
# `vecs` is what KMeans is fitted on further down; `doc2id` is later indexed
# with `.loc` to fetch the dialogues that belong to a cluster.
vectorizer = Vectorizer(len(filtered))
vecs, doc2id = vectorizer.vectorize(filtered)

In [12]:
# Cluster the dialogue vectors into 5 groups.
# n_init is pinned to 10, the default of the scikit-learn version this
# notebook was originally run with (see the estimator repr in the output).
# Newer releases changed that default, which would silently change the
# clustering. random_state is fixed for reproducibility.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(vecs)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [13]:
# Count how many dialogues were assigned to each cluster label.
unique, counts = np.unique(kmeans.labels_, return_counts=True)
topics = {label: size for label, size in zip(unique, counts)}
topics

{0: 99, 1: 173, 2: 222, 3: 134, 4: 276}

In [14]:
# Label of the cluster holding the most dialogues (the first one wins a tie).
id_max = max(topics, key=lambda label: topics[label])
id_max

4

Take the largest cluster to have data for training

In [15]:
# Select the rows of doc2id whose cluster label equals id_max.
# np.nonzero yields a one-element tuple holding the matching positions.
# NOTE(review): those positions are used as index *labels* by .loc, which
# assumes doc2id's index labels coincide with row positions in `vecs` — confirm.
largest_theme = doc2id.loc[np.nonzero(kmeans.labels_ == id_max)]

In [16]:
# Eyeball five random dialogues from the largest cluster.
# No random_state is passed, so a different sample is shown on every run.
largest_theme.sample(5)

Unnamed: 0,doc
1,"[(So, Dick), (some, coffee), (tonight), (Coffe..."
242,"[(your, dressing, room), (Person, A), (the, dr..."
256,"[(these, packages), (the, key), (Just, things)..."
569,"[(What, games), (word, games), (a, game), (bri..."
288,"[(pets), (this, apartment), (No, dogs), (cats)..."


Build X and y for training

In [17]:
# Turn the dialogues of the largest cluster into a frame with `seq` and
# `target` id columns (see the preview below).
# NOTE(review): the argument 1 is presumably the sequence length, i.e. the
# number of previous chunks per target (sgen.seq_len is used later) — confirm.
sgen = SequenceGenerator(1)
seqs = sgen.get_sequences(largest_theme)
seqs.head()

Unnamed: 0,seq,target
0,1132,1619
1,602,1070
2,1070,1483
3,1483,1476
4,1476,312


train test split

In [18]:
# 80/20 split: draw 80% of the rows at random (fixed seed) for training and
# keep every remaining row for testing.
train_df = seqs.sample(frac=0.8, random_state=42)
test_df = seqs.drop(index=train_df.index)

In [19]:
# Sanity check of the split sizes: (rows, columns) for train and test.
train_df.shape, test_df.shape

((1601, 2), (400, 2))

Dataset and DataLoader

In [20]:
# The LSTM is given a hidden state and a cell state of one fixed size.
# To avoid re-initialising them for every batch, all batches are made the
# same size by dropping the leftover examples at the end of each split.
# (An alternative would be to top up the last batch with a few examples
# from validation.)
BSize = 4
trim_train = len(train_df) % BSize
trim_test = len(test_df) % BSize


def _drop_tail(values, n_extra):
    """Return `values` without its last `n_extra` items (unchanged if n_extra is 0)."""
    if n_extra > 0:
        return values[:-n_extra]
    return values


def _make_dataset(x, y):
    """Wrap one split in a TopicDataset using sgen's id/chunk mappings and seq_len."""
    return TopicDataset(
        x=x,
        y=y,
        n_features=300,
        id2chunk=sgen.id2chunk,
        chunk2id=sgen.chunk2id,
        seq_len=sgen.seq_len)


# Inputs and targets of each split, trimmed to a multiple of the batch size.
train_x = _drop_tail(train_df['seq'].values, trim_train)
train_y = _drop_tail(train_df['target'].values, trim_train)
test_x = _drop_tail(test_df['seq'].values, trim_test)
test_y = _drop_tail(test_df['target'].values, trim_test)

train_dataset = _make_dataset(train_x, train_y)
test_dataset = _make_dataset(test_x, test_y)

In [21]:
# First training item: the shape of element 0, then elements 1 and 2.
# In order, these are labelled: vector size, target, prev_seq.
train_dataset[0][0].shape, train_dataset[0][1],train_dataset[0][2]

((300,), 475, 596)

In [22]:
# First test item: the shape of element 0, then elements 1 and 2.
# In order, these are labelled: vector size, target, prev_seq.
test_dataset[0][0].shape, test_dataset[0][1], test_dataset[0][2]

((300,), 1070, 602)

Wrap the train and test datasets in DataLoaders

In [23]:
# Batches of BSize shuffled training examples, collated by the dataset's own
# collate function.
train_loader = DataLoader(
    train_dataset, batch_size=BSize,
    shuffle=True, collate_fn=train_dataset.collate,
    # TODO find out why shuffling requires a generator on gpu
    # probably due to spacy vectors on gpu,
    # but shuffle=False works fine without it for some reason
    # NOTE(review): possibly spacy.prefer_gpu() makes CUDA the default tensor
    # type, so the permutation drawn for shuffling lives on the GPU, while
    # shuffle=False draws no permutation at all — confirm.
    # The generator follows the notebook-wide `device` instead of a
    # hard-coded 'cuda' string.
    generator=torch.Generator(device=device))

# Pull a single batch to check the collated shape; an empty loader raises
# StopIteration here instead of failing later on an undefined `batch`.
batch = next(iter(train_loader))

batch[0].shape

torch.Size([4, 1, 300])

In [24]:
# Same loader set-up as for training, over the test dataset. It is shuffled
# too, so every pass over it yields the batches in a different order.
# The generator follows the notebook-wide `device` instead of a hard-coded
# 'cuda' string.
test_loader = DataLoader(
    test_dataset, batch_size=BSize,
    shuffle=True, collate_fn=test_dataset.collate,
    generator=torch.Generator(device=device))

In [25]:
# Grab a single test batch and show its second element.
# next(iter(...)) raises StopIteration on an empty loader, whereas a
# `for ... break` loop would silently leave `batch` bound to whatever an
# earlier cell stored in it (the training batch) and display stale data.
batch = next(iter(test_loader))

batch[1]

tensor([  96, 1822, 1389, 2028])

## Model

Use an LSTM to generate the next topic

In [26]:
# Build the project's GModel; vocab_size is the number of entries in
# sgen.id2chunk and seq_len is taken from the sequence generator.
model = GModel(vocab_size=len(sgen.id2chunk), seq_len=sgen.seq_len)

In [27]:
# Move the model to the selected device; the returned module is displayed.
model.to(device)

GModel(
  (lstm): LSTM(300, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=2277, bias=True)
)

In [28]:
# Train on train_loader for 10 epochs with lr=0.001.
# NOTE(review): clip_value=1.0 is presumably a gradient-clipping threshold —
# confirm in torch_tools.train.
train(model, train_loader, epochs=10, lr=.001, clip_value=1.0)

Epoch 1:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 5:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 6:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 7:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 8:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 9:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 10:   0%|          | 0/400 [00:00<?, ?it/s]

In [30]:
# Print next-chunk suggestions for test data, mapping ids back to chunk text
# through sgen.id2chunk. test_loader shuffles, so repeated calls show
# different examples.
predict(model, test_loader, sgen.id2chunk)

After --a try--, I suggest discussing --the temptation--
After --a good word--, I suggest discussing --a cheerful atmosphere--
After --How much money--, I suggest discussing --the lap--
After --that case--, I suggest discussing --some drinks--


In [31]:
# Second call on the shuffled test_loader, to look at suggestions for another
# set of test examples.
predict(model, test_loader, sgen.id2chunk)

After --whole gangs--, I suggest discussing --the dog--
After --a ladder--, I suggest discussing --the playground--
After --strangers--, I suggest discussing --a good point--
After --a window seat--, I suggest discussing --school--
