In [1]:
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups

<li> Downloading the Autos data set 

In [2]:
categories = ['rec.autos']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

<li> Download the stopwords

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/VKY/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
corpus[0]

'From: mad9a@fermi.clas.Virginia.EDU (Michael A. Davis)\nSubject: Slick 50, any good?\nOrganization: University of Virginia\nLines: 9\n\n\n     Chances are that this has been discussed to death already, and\nif so could someone who has kept the discussion mail me or direct me \nto an archive site. Basically,\nI am just wondering if Slick 50 really does all it says that it does.\nAnd also, is there any data to support the claim.  Thanks for any info.\n\nMike Davis\nmad9a@fermi.clas.virginia.edu\n'

<li> Converting all words to lower case in the Corpus

In [5]:
corpus = [word.lower() for word in corpus]

<li> Add useless words into the stopwords list

In [35]:
stopset = set(stopwords.words("english"))
stopset.update(['\n', 'lines', 'from', 'thanks', 'for', 'info', 'and', 'this', 're', 'with', 'only', 'lots', 'room',
                'that', 'subject', 'organization', 'if', 'so', 'it', 'does', 'any', 'even', 'very', 'was', 'around',
                'are', 'me', 'to', 'says', 'could', 'an','death', 'already', 'basically', 'i', 'am', 'just', 'is', '>'
                'there', 'here', 'but', 'waiiiiiit', 'guess', 'think', 'as', 'far', 'actually', 'may', 'little', '>>', 
               'edu', 'also', 'university', '00', '05', 'gmt', 'really', 'like', 'well', '0400', 'say', 'se', 'ca', 
               'one', 'lot', 'com', 'uiuc', 'uoknor', 'writes', 'acs', 'ohio', 'uoknox', 'manta', 'opel', 'would', 
               'gt', 'cso', 'good', 'acs', 'like', 'usa', 'host', 'andrew', 'also', 'one', 'get', 'uokmax', '16', '000',
               'nntp', 'sho', 'cwru', 'oriolefan', 'uxa', '000601', 'still', 'news'])

<li> Vectorizing the Corpus

In [36]:
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))


<li> Creating a matrix for the vectorized corpus

In [37]:
X = vectorizer.fit_transform(corpus)

In [38]:
X[0]

<1x169513 sparse matrix of type '<class 'numpy.float64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [22]:
print(X[0])

  (0, 47178)	0.107328038982
  (0, 97098)	0.107328038982
  (0, 38400)	0.101288573553
  (0, 146397)	0.101288573553
  (0, 46998)	0.101288573553
  (0, 8378)	0.101288573553
  (0, 138469)	0.101288573553
  (0, 167498)	0.101288573553
  (0, 137939)	0.101288573553
  (0, 18746)	0.101288573553
  (0, 50986)	0.101288573553
  (0, 92621)	0.101288573553
  (0, 51457)	0.101288573553
  (0, 83843)	0.101288573553
  (0, 139761)	0.101288573553
  (0, 51419)	0.101288573553
  (0, 36400)	0.101288573553
  (0, 162341)	0.107328038982
  (0, 8441)	0.107328038982
  (0, 138474)	0.107328038982
  (0, 47180)	0.107328038982
  (0, 96763)	0.107328038982
  (0, 162362)	0.101288573553
  (0, 38490)	0.101288573553
  (0, 62682)	0.202577147105
  :	:
  (0, 38489)	0.154631494383
  (0, 62681)	0.202577147105
  (0, 92210)	0.202577147105
  (0, 97088)	0.0680188723273
  (0, 38366)	0.0669912128473
  (0, 146389)	0.0746000394349
  (0, 46950)	0.0697069206041
  (0, 167441)	0.0585144618707
  (0, 137925)	0.0833552126206
  (0, 18738)	0.084924573779

In [39]:
X.shape

(990, 169513)

<li> Decomposing the matrix X 

In [40]:
lsa = TruncatedSVD(n_components=10, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=100,
       random_state=None, tol=0.0)

In [41]:
lsa.components_[0]

array([ 0.00047084,  0.00047084,  0.00047084, ...,  0.00042614,
        0.00060031,  0.00060031])

<li> Deriving concepts from the matrix

In [42]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print (term[0])
    print(" ")

Concept 0:
car
article
cars
engine
callison
know
oil
posting
new
state
 
Concept 1:
article
v6
kadett
craig
eliot
v12
cka52397
v8
engr washington
sfu
 
Concept 2:
posting
speed
car
dealer
000758 53229
virginia
since
new
mid
us
 
Concept 3:
much
oil
new
000710 27408
000337 10096 arc
go
heard
washington
speed
cs
 
Concept 4:
cars
work
000710 27408
000710
year
price
car
small
v12
000337 10096 arc
 
Concept 5:
car
james
000758
engine
ford
probe
much
world
pretty
model
 
Concept 6:
engine
something
000337 10096 arc
autos
back
car
000710 27408
posting
hand
years
 
Concept 7:
engine
right
drive
car
speed
000710
ecn
make
washington
article
 
Concept 8:
car
distribution
000758
s4
tires
people
world
turbo
callison
oil
 
Concept 9:
car
posting
computer
alarm
eliot
insurance
000710
work
people
last
 


<li> Looks like they were talking about cars/engine/oil/speed in some state/dealer/virginia 
with make/tires/distribution/insurance from some people/posting/computer/world