In [136]:
import random

import numpy as np
from keras.preprocessing.sequence import make_sampling_table, skipgrams
from keras.preprocessing.text import Tokenizer
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix,lil_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [137]:
# Load the training split of 20 Newsgroups. Headers, footers and quoted
# replies are removed so the model sees the post bodies rather than metadata.
TNG_train = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers", "quotes"),
)

In [138]:
# Word-level tokenizer; `num_words` caps which word indices are emitted when
# texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=10_000)

In [139]:
# Build the word -> index vocabulary from the raw training posts.
tokenizer.fit_on_texts(texts=TNG_train.data)

In [140]:
# Encode every post as a list of integer word indices (one list per document).
seqs = tokenizer.texts_to_sequences(texts=TNG_train.data)

In [141]:
# Peek at the first ten word indices of the first document.
seqs[0][:10]

[7, 26, 1383, 24, 160, 64, 43, 105, 61, 15]

In [142]:
# Vocabulary size used for the sampling table, the skip-gram generator and the
# (V, V) co-occurrence matrix below.
# NOTE(review): presumably the tokenizer's num_words (10000) plus one for the
# reserved index 0 — confirm before changing either value.
V = 10_001

In [143]:
# Display the vocabulary size chosen above.
V

10001

In [144]:
# Demo on the first document only: generate positive skip-gram pairs
# [centre_word, context_word] within a +/-5 word window.
#
# skipgrams() decides which centre words to drop by drawing from Python's
# global `random` module, so seed it to make the pairs shown below
# reproducible across kernel restarts.
random.seed(42)
data, labels = skipgrams(
    sequence=seqs[0],
    vocabulary_size=V,  # reuse V instead of repeating the literal 10001
    window_size=5,
    negative_samples=0,  # positives only
    # Aggressive subsampling (1e-5) so that only rarer words survive as
    # centre words in this small illustration.
    sampling_table=make_sampling_table(V, sampling_factor=1e-5),
    shuffle=False,  # keep pairs in document order
)

In [145]:
# Show the generated [centre_word, context_word] index pairs for the first document.
data

[[1383, 7],
 [1383, 26],
 [1383, 24],
 [1383, 160],
 [1383, 64],
 [1383, 43],
 [1383, 105],
 [105, 1383],
 [105, 24],
 [105, 160],
 [105, 64],
 [105, 43],
 [105, 61],
 [105, 15],
 [105, 14],
 [105, 312],
 [105, 7],
 [26, 587],
 [26, 1],
 [26, 73],
 [26, 262],
 [26, 11],
 [26, 5],
 [26, 29],
 [26, 1282],
 [26, 2507],
 [26, 312],
 [986, 5],
 [986, 29],
 [986, 1282],
 [986, 2507],
 [986, 312],
 [986, 3],
 [986, 16],
 [986, 30],
 [986, 1],
 [986, 1203],
 [1203, 986],
 [1203, 3],
 [1203, 16],
 [1203, 30],
 [1203, 1],
 [1203, 748],
 [1203, 11],
 [1203, 26],
 [1203, 288],
 [1203, 5],
 [3721, 11],
 [3721, 26],
 [3721, 288],
 [3721, 5],
 [3721, 1],
 [3721, 68],
 [3721, 172],
 [3721, 408],
 [3721, 8],
 [3721, 1107],
 [1107, 3721],
 [1107, 68],
 [1107, 172],
 [1107, 408],
 [1107, 8],
 [1107, 1],
 [1107, 799],
 [1107, 7710],
 [1107, 26],
 [1107, 1604],
 [7710, 408],
 [7710, 8],
 [7710, 1107],
 [7710, 1],
 [7710, 799],
 [7710, 26],
 [7710, 1604],
 [7710, 30],
 [7710, 1],
 [7710, 691],
 [1604, 1107]

In [147]:
# Empty V x V sparse accumulator for word/word co-occurrence counts.
# LIL format is used because it supports cheap element-wise assignment.
counts_matrix = lil_matrix((V, V), dtype=np.float64)

In [148]:
# Concatenate the per-document index lists into one flat 1-D array and show
# its length (total number of tokens kept).
# NOTE(review): this rebinds `seqs` from a list of documents to a flat array,
# so the earlier cells that index `seqs[0]` as a document no longer work if
# they are re-run after this cell.
# NOTE(review): flattening discards document boundaries, so skip-gram windows
# computed on this array will also pair the last words of one post with the
# first words of the next — confirm that this is acceptable.
seqs=np.hstack(seqs)
seqs.shape

(2153559,)

In [149]:
# Cast the flattened array to integers so its values can be used as indices.
# NOTE(review): presumably required because empty documents promote the
# stacked array to a float dtype — TODO confirm.
seqs=seqs.astype(int)

In [175]:
# Generate positive skip-gram pairs [centre_word, context_word] for the whole
# (flattened) corpus with a +/-5 word window.
#
# Subsampling inside skipgrams() draws from Python's global `random` module;
# seed it so the pair counts, and everything derived from them, are
# reproducible across kernel restarts.
random.seed(42)
pairs, labels = skipgrams(
    # .tolist() yields native Python ints. list(seqs) would instead build a
    # list of NumPy scalars, which are noticeably slower to index and compare
    # inside the pure-Python loop that skipgrams() runs over the sequence.
    sequence=seqs.tolist(),
    vocabulary_size=V,
    window_size=5,
    negative_samples=0,  # positives only
    sampling_table=make_sampling_table(V, sampling_factor=1e-3),
    shuffle=False,
)


In [176]:
# Collapse duplicate pairs: `pairs_u` holds each distinct
# [centre_word, context_word] row once, `counts` how often it occurred.
pairs_u, counts = np.unique(np.asarray(pairs), axis=0, return_counts=True)

In [177]:
# Number of distinct word pairs, i.e. non-zero entries the matrix will get.
pairs_u.shape[0]

3954427

In [178]:
# Build the (V, V) co-occurrence matrix in one vectorised step:
# entry [centre_word, context_word] receives that pair's count.
#
# This replaces a Python loop doing ~4M element-wise `+=` updates on a LIL
# matrix. Besides being slow, that loop accumulated into a matrix created in
# another cell, so re-running the cell silently doubled every count.
# Assigning a freshly built matrix makes the cell idempotent.
pair_rows, pair_cols = pairs_u[:, 0], pairs_u[:, 1]
counts_matrix = csr_matrix(
    (counts, (pair_rows, pair_cols)),
    shape=(V, V),
    dtype=np.float64,  # same dtype as the default lil_matrix it replaces
).tolil()  # keep the LIL type so later cells see the same kind of object

3954000

In [179]:
# Spot-check a single entry of the co-occurrence matrix (row 275, column 1).
# NOTE(review): indexing with two scalars already selects one element, so the
# trailing .sum() is presumably a no-op — confirm.
counts_matrix[275,1].sum()

431.0

In [180]:
# Reduce the co-occurrence matrix to 300 dense dimensions per word.
# TruncatedSVD uses a randomized solver by default, so `random_state` is fixed
# to make the embedding (and the neighbour lists derived from it) reproducible.
red = TruncatedSVD(n_components=300, random_state=42)
# The matrix is transposed first: each row handed to the SVD is indexed by the
# second word of a pair, and its columns by the first word.
TNG_cv_red = red.fit_transform(counts_matrix.T)

In [181]:
# Sanity check: one 300-dimensional vector per vocabulary index.
TNG_cv_red.shape

(10001, 300)

In [182]:
from sklearn.neighbors import NearestNeighbors

In [183]:
# Nearest-neighbour index over the word vectors: 20 neighbours per query,
# compared by cosine distance.
neigh = NearestNeighbors(
    n_neighbors=20,
    metric="cosine",
)

In [184]:
# Index every reduced word vector so neighbours can be queried below.
neigh.fit(X=TNG_cv_red)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [185]:
# Look up the integer index the tokenizer assigned to the word "car".
tokenizer.word_index["car"]

312

In [186]:
# Query the 20 nearest neighbours of the vector for "car".
idx_1 = tokenizer.word_index["car"]
# word_index lists every word seen while fitting, not only those kept by
# num_words, so an index may lie beyond the embedded rows. The slice below
# would then be empty, so fail early with a clear message instead.
assert 0 < idx_1 < TNG_cv_red.shape[0], "word is outside the embedded vocabulary"
# Only the neighbour indices are needed. return_distance=False avoids
# unpacking the distances into `_`, which would stop IPython from updating
# its `_` last-output variable for the rest of the session.
neig = neigh.kneighbors(TNG_cv_red[idx_1:idx_1 + 1], return_distance=False)

In [187]:
# Translate the neighbour indices of the single query row back into words,
# printing one word per line (nearest first).
print(*(tokenizer.index_word[word_id] for word_id in neig[0]), sep="\n")

car
bike
my
a
and
mine
still
driving
getting
engine
it
tires
big
just
with
great
around
while
head
for
