## Latent Symantic Analysis
## Author : Gaurav Khandave
## Version : 1.0
## Date : 09/15/2016

In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [3]:
corpus[1]

u'From: schmke@cco.caltech.edu (Kevin Todd Schmidt)\nSubject: NL OPI through first week+\nOrganization: California Institute of Technology, Pasadena\nLines: 184\nNNTP-Posting-Host: sandman.caltech.edu\n\nHere is the OPI (Offensive Production Index) for all NL players with at\nleast 10 at-bats.\n\nIt is early in the season so there are some high numbers.  Barry Bonds\nfinished last season at 0.795.\n\nI welcome comments and suggestions.\n\nKevin\n\nLeague OPI: 0.410\nLeague BA:  0.252\nLeague SLG: 0.375\nLeague OBA: 0.321\n\nRank Player                 OPI     BA    SLG    OBA\n-----------------------------------------------------\n1    Phi,daulton           1.101  0.333  0.875  0.515\n2    Phi,kruk              1.069  0.429  0.821  0.529\n3    Cub,grace             1.007  0.452  0.742  0.514\n4    Cub,may               0.931  0.389  0.889  0.421\n5    Col,boston            0.888  0.545  0.545  0.545\n6    Pit,bell              0.873  0.429  0.714  0.467\n7    Col,galarraga         0.86

#### There are lots of punctuations and numbers which we dont require. We will remove those by using following code :

In [4]:
import re
# Regex for punctuations and numbers.
punctuation = re.compile(r'[-.*=?!+/,":;()\n<>_|0-9]')
# Removing punctuactions and numbers
corpus = [punctuation.sub("", document) for document in corpus]

#### Now we can see there are lots of email ids in a document of corpus. We need to remove thode unwanted email ids.
<li> We will first split each word in the document of corpus.
<li> If there is any '@' symbol present in the word, the word will me removed.
<li> Using this strategy we will remove unwanted email ids.

In [5]:
filteredString = ""

# Gets each document with index in a corpus
for i,document in enumerate(corpus,start=0):
    # splits document into words
    wordlist = document.split(" ")
    for index,word in enumerate(wordlist,start=0):
        # Check if word has "@" in word, if there then pop the word
        if '@' in word:
            wordlist.pop(index)
            
    # After removing emailIds, join the wordlist to form an entire string
    filteredString = " ".join(wordlist)
    
    # Replace filteredString with the current document in the corpus
    corpus[i]=filteredString

In [6]:
corpus[1]

u'From Kevin Todd SchmidtSubject NL OPI through first weekOrganization California Institute of Technology PasadenaLines NNTPPostingHost sandmancaltecheduHere is the OPI Offensive Production Index for all NL players with atleast  atbatsIt is early in the season so there are some high numbers  Barry Bondsfinished last season at I welcome comments and suggestionsKevinLeague OPI League BA  League SLG League OBA Rank Player                 OPI     BA    SLG    OBA    Phidaulton                     Phikruk                        Cubgrace                       Cubmay                         Colboston                      Pitbell                        Colgalarraga                   StLpena                        StLzeile                      Cinmitchell                   Monlansing                    Pitslaught                    Monvanderwal                  NYMtfernandez                 SnFmartinez                   Houbagwell                    Colhayes                      Coleyoung      

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gauravkhandave/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stopset = set(stopwords.words('english'))

In [9]:
# Add some more stop words to the set
stopset.update(['aa','aaa','ab','bb','hr','[', ']', '{', '}','From','To'])

In [10]:
stopset

{'From',
 'To',
 '[',
 ']',
 u'a',
 'aa',
 'aaa',
 'ab',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'ain',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u'aren',
 u'as',
 u'at',
 'bb',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u'can',
 u'couldn',
 u'd',
 u'did',
 u'didn',
 u'do',
 u'does',
 u'doesn',
 u'doing',
 u'don',
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u'hadn',
 u'has',
 u'hasn',
 u'have',
 u'haven',
 u'having',
 u'he',
 u'her',
 u'here',
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 'hr',
 u'i',
 u'if',
 u'in',
 u'into',
 u'is',
 u'isn',
 u'it',
 u'its',
 u'itself',
 u'just',
 u'll',
 u'm',
 u'ma',
 u'me',
 u'mightn',
 u'more',
 u'most',
 u'mustn',
 u'my',
 u'myself',
 u'needn',
 u'no',
 u'nor',
 u'not',
 u'now',
 u'o',
 u'of',
 u'off',
 u'on',
 u'once',
 u'only',
 u'or',
 u'other',
 u'our',
 u'ours',
 u'ourselves',
 u'out',
 u'over

In [11]:
vectorizer = TfidfVectorizer(stop_words=stopset,use_idf=True,ngram_range=(1,3))

In [12]:
X = vectorizer.fit_transform(corpus)

In [13]:
X[0]

<1x176428 sparse matrix of type '<type 'numpy.float64'>'
	with 199 stored elements in Compressed Sparse Row format>

In [14]:
X

<994x176428 sparse matrix of type '<type 'numpy.float64'>'
	with 302947 stored elements in Compressed Sparse Row format>

In [15]:
print X[0]

  (0, 175468)	0.0784681968211
  (0, 10974)	0.0784681968211
  (0, 128521)	0.0784681968211
  (0, 44546)	0.0784681968211
  (0, 58617)	0.0784681968211
  (0, 115635)	0.0784681968211
  (0, 84396)	0.0784681968211
  (0, 151521)	0.0784681968211
  (0, 46937)	0.0784681968211
  (0, 95665)	0.0784681968211
  (0, 145346)	0.0784681968211
  (0, 166405)	0.0784681968211
  (0, 102113)	0.0784681968211
  (0, 44708)	0.0784681968211
  (0, 168058)	0.0784681968211
  (0, 77380)	0.0784681968211
  (0, 115544)	0.0784681968211
  (0, 125561)	0.0784681968211
  (0, 79794)	0.0784681968211
  (0, 96392)	0.0784681968211
  (0, 41110)	0.0784681968211
  (0, 33136)	0.0784681968211
  (0, 18265)	0.0784681968211
  (0, 44696)	0.0784681968211
  (0, 33140)	0.0784681968211
  :	:
  (0, 138484)	0.0443419306061
  (0, 152605)	0.0514228317175
  (0, 63201)	0.0551760257681
  (0, 173778)	0.0256169679244
  (0, 25584)	0.0443419306061
  (0, 151933)	0.024645272705
  (0, 15188)	0.0580957722552
  (0, 65916)	0.0380502147649
  (0, 48536)	0.074055173

In [16]:
X.shape

(994, 176428)

### Matrix has 994 documents and 176428 'concepts'
#### We will truncate it to find concecpts in the corpus

In [17]:
lsa = TruncatedSVD(n_components=14, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=14, n_iter=100,
       random_state=None, tol=0.0)

In [18]:
lsa.components_

array([[ 0.00056901,  0.00056901,  0.00056901, ...,  0.00047405,
         0.00047405,  0.00047405],
       [ 0.03443523,  0.0052594 , -0.02075006, ..., -0.00052214,
        -0.00052214, -0.00052214],
       [-0.00972563,  0.02271531, -0.07695336, ...,  0.00036821,
         0.00036821,  0.00036821],
       ..., 
       [-0.02288842, -0.0015804 , -0.01100626, ..., -0.00075889,
        -0.00075889, -0.00075889],
       [ 0.00033756, -0.02273201, -0.07288953, ..., -0.00033091,
        -0.00033091, -0.00033091],
       [-0.00349988,  0.02370239, -0.03820116, ..., -0.00029227,
        -0.00029227, -0.00029227]])

In [19]:
terms = vectorizer.get_feature_names()

In [20]:
terms

[u'aaaaarrrrghearlier',
 u'aaaaarrrrghearlier thread',
 u'aaaaarrrrghearlier thread commented',
 u'aaaand',
 u'aaaand improve',
 u'aaaand improve would',
 u'aaabe',
 u'aaabe successful',
 u'aaabe successful majorscurrent',
 u'aaajavy',
 u'aaajavy lopez',
 u'aaajavy lopez proven',
 u'aaamajors',
 u'aaamajors duty',
 u'aaamajors duty main',
 u'aaamost',
 u'aaamost catchers',
 u'aaamost catchers need',
 u'aaano',
 u'aaano maybe',
 u'aaano maybe need',
 u'aaasee',
 u'aaasee catchers',
 u'aaasee catchers need',
 u'aaboston',
 u'aaboston red',
 u'aaboston red sox',
 u'aactually',
 u'aactually dave',
 u'aactually dave stewart',
 u'aafter',
 u'aafter striking',
 u'aafter striking bad',
 u'aardvark',
 u'aardvark spicalalocuscomwarren',
 u'aardvark spicalalocuscomwarren usui',
 u'aaron',
 u'aaron alextrebek',
 u'aaron alextrebek said',
 u'aaron beardsubject',
 u'aaron beardsubject torre',
 u'aaron ernie',
 u'aaron ernie lombardi',
 u'aaron frank',
 u'aaron frank robinson',
 u'aaron henderson',
 

In [21]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:14]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
year
team
would
game
article
games
last
good
lost
baseball
one
think
players
like
 
Concept 1:
jewish
come
vb
lowenstein
jewish baseball
john lowenstein
anyone
stankowitz
able come
sandy koufax
players past
anyone come
except sandy
except sandy koufax
 
Concept 2:
year
well
would
team
aaaand improve
good
win
clutch
baseball
players
writesin
player
see
sabo
 
Concept 3:
runs
run
year
good
baseball
would
morris
got
extra
aaaand improve
play
hits stolen
dcon
best
 
Concept 4:
team
think
win
play
go
wins
going
roger
hit
know
last year
games
yankees
aaabe
 
Concept 5:
lost
good
morris
let
sox
hall
say
back
run
well
take
players
aaaand
pitcher
 
Concept 6:
one
even
year
team
season
university
back
good
last year
anyone
cubs
know
aaaaarrrrghearlier thread commented
pitching
 
Concept 7:
aaaand improve
first
last
games
see
season
hitter
well
hit
really
article
league
day
win
 
Concept 8:
team
really
games
still
last
game
even
make
win
better
aaaand improve
article
least
runs
 
Conce

### After LSA of baseball news group, Following concepts are found :
<li>Last year baseball team lost games
<li>Jewish come to see jewish baseball
<li>Second teams hit much and win games
<li>People could also win pitching first