In [13]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load only the `content` column of articles1.csv; all other columns in the file are skipped.
df = pd.read_csv('articles1.csv', usecols=['content'])

In [3]:
# Peek at the first three rows to confirm the text column loaded.
df.head(3)

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,"After the bullet shells get counted, the blood..."
2,"When Walt Disney’s “Bambi” opened in 1942, cri..."


In [4]:
# Vectorizer with default settings, used for the toy corpus in the next cell.
cv = CountVectorizer()

In [5]:
# Fit the vectorizer on a tiny two-document corpus and keep the resulting count matrix.
# NOTE(review): the outputs recorded further down (a (16, 3) transposed shape and
# 16-row singular vectors) cannot come from this two-document corpus -- they look like
# leftovers from an earlier three-document version of this cell. With only two
# documents, svds(..., k=2) is also likely to be rejected, because the default ARPACK
# solver requires k < min(shape). Re-run from a fresh kernel and confirm.
X_train = cv.fit_transform(['one two three', 'one two three four'])

In [6]:
# Dimensions of the count matrix after swapping its two axes.
X_train.transpose().shape

(16, 3)

In [12]:
# Partial SVD (two singular triplets) of the transposed toy matrix, cast to float32
# first. The result is displayed as a tuple of three arrays: left vectors, singular
# values, right vectors.
svds(X_train.T.astype(np.float32), k=2)

(array([[-4.08248276e-01,  9.24347177e-09],
        [ 1.29583695e-08, -6.05044425e-01],
        [-1.09517536e-07, -1.69882208e-01],
        [-4.08248276e-01,  9.24347177e-09],
        [-4.08248276e-01,  9.24347177e-09],
        [-1.09517536e-07, -1.69882208e-01],
        [-4.08248276e-01,  9.24347177e-09],
        [ 6.12379480e-08, -2.17581093e-01],
        [ 6.12379480e-08, -2.17581093e-01],
        [ 6.12379480e-08, -2.17581093e-01],
        [-1.57797118e-07, -5.57345510e-01],
        [ 6.12379480e-08, -2.17581093e-01],
        [-1.09517536e-07, -1.69882208e-01],
        [ 6.12379480e-08, -2.17581093e-01],
        [-4.08248276e-01,  9.24347177e-09],
        [-4.08248276e-01,  9.24347177e-09]], dtype=float32),
 array([2.4494898, 3.6225827], dtype=float32),
 array([[ 1.5000174e-07, -1.0000000e+00, -2.6826208e-07],
        [-7.8820550e-01,  3.3485239e-08, -6.1541235e-01]], dtype=float32))

In [14]:
# Two-component truncated SVD for the toy matrix; random_state is pinned so the
# randomized solver gives repeatable results.
svd = TruncatedSVD(n_components=2, random_state=1)

In [15]:
# Fit on the *transposed* toy matrix, so each row of components_ has one entry per
# column of X_train.T (compare with the right singular vectors printed by svds above).
svd.fit(X_train.T)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5, random_state=1,
       tol=0.0)

In [23]:
# Inspect the fitted components of the toy model.
svd.components_

array([[ 0.78820544, -0.        ,  0.61541221],
       [ 0.        ,  1.        ,  0.        ]])

In [30]:
# Fresh default vectorizer for the full article corpus. This rebinds `cv`, so the
# vectorizer fitted on the toy corpus is no longer reachable under that name.
cv = CountVectorizer()

In [32]:
# Learn the vocabulary from every article and build the count matrix
# (recorded shape further down: 50000 rows x 174211 columns).
vectorized = cv.fit_transform(df['content'])

In [51]:
# 200-component truncated SVD for the full corpus; rebinds `svd`, replacing the toy
# model. random_state is pinned for repeatable results.
svd = TruncatedSVD(n_components=200, random_state=1)

In [52]:
# Fit on the count matrix as-is. Unlike the toy example above, the input is NOT
# transposed here, so components_ ends up with one column per matrix column
# (recorded shape: (200, 174211)).
svd.fit(vectorized)

TruncatedSVD(algorithm='randomized', n_components=200, n_iter=5,
       random_state=1, tol=0.0)

In [63]:
# Confirm the size of the fitted components matrix.
svd.components_.shape

(200, 174211)

In [58]:
# Partial SVD (50 singular triplets) of the transposed full-corpus matrix, cast to
# float32 first. `dd` holds the raw svds result, which the toy call above displayed as
# a tuple of three arrays. In that toy output the singular values were listed
# smallest-first, so do not assume descending order when slicing `dd`.
dd = svds(vectorized.T.astype(np.float32), k=50)

In [74]:
# Sanity check: right-multiplying the count matrix by a dense two-column matrix should
# give one row per input row and two columns. The dense matrix's height is read from
# vectorized.shape[1] rather than hard-coded, so the cell keeps working when the
# vectorizer is re-fit and the vocabulary size changes.
(vectorized @ np.ones((vectorized.shape[1], 2))).shape

(50000, 2)

In [75]:
# Size of the full-corpus count matrix.
vectorized.shape

(50000, 174211)

In [1]:
import pandas as pd
from document_similarity import LatentSemanticAnalysis

# Load only the `content` column of articles1.csv.
df = pd.read_csv('articles1.csv', usecols=['content'])

# Fit the project's LatentSemanticAnalysis model on the first 1000 articles only,
# asking for 300 components.
# NOTE(review): later cells query against 30000 articles while the model has only seen
# 1000 -- presumably deliberate to keep fitting fast, but confirm.
lsa = LatentSemanticAnalysis()
lsa.fit(df.head(1000).content, n_components=300)

In [2]:
# Use the article labelled 28 as the query and rank it against the first 30000
# articles. The recorded result is a pair of lists: row labels and their matching
# scores (presumably similarities -- the query itself comes back first with 1.0).
most_similar = lsa.predict_most_similar(
    df['content'][28],
    df['content'].head(30000),
)

In [3]:
# Show the ranked matches and their scores.
most_similar

([28, 3734, 21695, 7263, 5480, 8753, 4696, 3870, 5022, 8172],
 [1.0,
  0.9587763221152563,
  0.9527564733475511,
  0.9525364446602073,
  0.9516073311011749,
  0.9504881067212242,
  0.9502729535327323,
  0.9494314541841468,
  0.9487381458514401,
  0.9482622209966494])