In [1]:
from keras.preprocessing.sequence import make_sampling_table, skipgrams
from keras.preprocessing.text import Tokenizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix,lil_matrix
from matplotlib import pyplot as plt
import numpy as np

Using TensorFlow backend.


In [2]:
TNG_train = fetch_20newsgroups(subset="train",remove=('headers', 'footers', 'quotes'))

In [3]:
tokenizer = Tokenizer(num_words=10000)

In [4]:
tokenizer.fit_on_texts(TNG_train.data)

In [5]:
seqs=tokenizer.texts_to_sequences(TNG_train.data)

In [6]:
seqs[0][0:10]

[7, 26, 1383, 24, 160, 64, 43, 105, 61, 15]

In [7]:
V = 10001

In [8]:
V

10001

In [9]:
data, labels = skipgrams(sequence=seqs[0], vocabulary_size=V, window_size=5, negative_samples=0, sampling_table=make_sampling_table(V, sampling_factor=1e-5), shuffle=False)

In [10]:
data

[[1383, 7],
 [1383, 26],
 [1383, 24],
 [1383, 160],
 [1383, 64],
 [1383, 43],
 [1383, 105],
 [262, 312],
 [262, 7],
 [262, 587],
 [262, 1],
 [262, 73],
 [262, 11],
 [262, 26],
 [262, 5],
 [262, 29],
 [262, 1282],
 [2507, 11],
 [2507, 26],
 [2507, 5],
 [2507, 29],
 [2507, 1282],
 [2507, 312],
 [2507, 986],
 [2507, 3],
 [2507, 16],
 [2507, 30],
 [3721, 11],
 [3721, 26],
 [3721, 288],
 [3721, 5],
 [3721, 1],
 [3721, 68],
 [3721, 172],
 [3721, 408],
 [3721, 8],
 [3721, 1107],
 [3069, 38],
 [3069, 5],
 [3069, 657],
 [3069, 236],
 [3069, 959],
 [3069, 179],
 [3069, 4],
 [3069, 2570],
 [3069, 141],
 [3069, 14],
 [14, 3069],
 [14, 179],
 [14, 4],
 [14, 2570],
 [14, 141],
 [14, 312],
 [14, 9],
 [14, 201],
 [14, 533],
 [14, 23],
 [357, 352],
 [357, 13],
 [357, 18],
 [357, 15],
 [357, 14],
 [357, 312],
 [357, 167],
 [357, 101],
 [357, 206],
 [167, 18],
 [167, 15],
 [167, 14],
 [167, 357],
 [167, 312],
 [167, 101],
 [167, 206]]

In [11]:
counts_matrix=lil_matrix((V-1, V-1))

In [12]:
seqs=np.hstack(seqs)
seqs.shape

(2153559,)

In [13]:
seqs=seqs.astype(int)

In [14]:
pairs, labels = skipgrams(sequence=list(seqs), vocabulary_size=V, window_size=5, negative_samples=0, sampling_table=make_sampling_table(V, sampling_factor=1), shuffle=False)


In [15]:
pairs_u, counts = np.unique(pairs,return_counts=True, axis=0)

In [16]:
len(pairs_u)

4127751

In [17]:
for num,(pair,count) in enumerate(zip(pairs_u, counts)):
    if num%1000 ==0:
        print(f'\r{num}', end="")
    counts_matrix[pair[0],pair[1]] += count

4127000

In [18]:
counts_matrix.shape

(10000, 10000)

In [19]:
red = TruncatedSVD(n_components=100)
TNG_cv_red = red.fit_transform(counts_matrix)

In [20]:
TNG_cv_red.shape

(10000, 100)

In [21]:
from sklearn.neighbors import NearestNeighbors

In [22]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")

In [23]:
neigh.fit(TNG_cv_red)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [24]:
tokenizer.word_index["car"]

312

In [25]:
idx_1 = tokenizer.word_index["york"]
_, neig = neigh.kneighbors(TNG_cv_red[idx_1:idx_1+1])

In [26]:
for n in neig[0]:
    print(tokenizer.index_word[n])

york
zealand
washington
florida
1989
london
san
england
francisco
1982
mexico
1992
1988
december
1991
texas
1986
1987
canada
1985


# PPMI

In [27]:
counts_matrix

<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 4127751 stored elements in LInked List format>

In [28]:
PMI = counts_matrix[1:,1:].sum(axis=1)/counts_matrix[1:,1:].sum()

In [29]:
PMI.shape

(9999, 1)

In [30]:
probs = counts_matrix[1:,1:]/counts_matrix[1:,1:].sum()

In [31]:
PMI = probs/np.dot(PMI, PMI.T)

In [32]:
PMI

matrix([[ 1.06097522,  0.        ,  1.1419468 , ...,  0.        ,
          0.        ,  0.12685802],
        [ 0.        , 28.21116313,  0.        , ...,  1.95645818,
          1.52168969,  1.30430545],
        [ 1.1468787 ,  0.        ,  0.87349523, ...,  0.        ,
          0.5069733 ,  0.        ],
        ...,
        [ 0.        ,  1.95645818,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  1.52168969,  0.5069733 , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.12685802,  1.30430545,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [33]:
PMI=np.array(PMI)
PMI

array([[ 1.06097522,  0.        ,  1.1419468 , ...,  0.        ,
         0.        ,  0.12685802],
       [ 0.        , 28.21116313,  0.        , ...,  1.95645818,
         1.52168969,  1.30430545],
       [ 1.1468787 ,  0.        ,  0.87349523, ...,  0.        ,
         0.5069733 ,  0.        ],
       ...,
       [ 0.        ,  1.95645818,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.52168969,  0.5069733 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.12685802,  1.30430545,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [297]:
PMI=PMI*(PMI>1)+1e-8

In [298]:
PMI = np.log(PMI)

In [300]:
PMI=PMI*(PMI>0)+1e-8

In [301]:
PMI

array([[5.89901343e-02, 1.00000000e-08, 1.33537727e-01, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [1.00000000e-08, 3.33971777e+00, 1.00000000e-08, ...,
        6.71135800e-01, 4.19821373e-01, 2.65670695e-01],
       [1.37044101e-01, 1.00000000e-08, 1.00000000e-08, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       ...,
       [1.00000000e-08, 6.71135800e-01, 1.00000000e-08, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [1.00000000e-08, 4.19821373e-01, 1.00000000e-08, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [1.00000000e-08, 2.65670695e-01, 1.00000000e-08, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08]])

In [302]:
red = TruncatedSVD(n_components=300)
TNG_cv_red = red.fit_transform(PMI)

MemoryError: 

In [266]:
TNG_cv_red.shape

(10000, 300)

In [267]:
from sklearn.neighbors import NearestNeighbors

In [268]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")

In [269]:
neigh.fit(TNG_cv_red)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [270]:
tokenizer.word_index["car"]

312

In [271]:
idx_1 = tokenizer.word_index["york"]
_, neig = neigh.kneighbors(TNG_cv_red[idx_1:idx_1+1])

In [272]:
for n in neig[0]:
    print(tokenizer.index_word[n])

york
zealand
jersey
brand
mexico
haven
testament
england
publishers
height
avenue
publisher
technologies
mets
islanders
212
devils
metro
london
washington
