## LSA Lab

In [2]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [202]:

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [203]:
dataset.target_names

['rec.sport.baseball']

In [7]:
len(corpus)

994

In [271]:
print(corpus[1 ])

From: schmke@cco.caltech.edu (Kevin Todd Schmidt)
Subject: NL OPI through first week+
Organization: California Institute of Technology, Pasadena
Lines: 184
NNTP-Posting-Host: sandman.caltech.edu

Here is the OPI (Offensive Production Index) for all NL players with at
least 10 at-bats.

It is early in the season so there are some high numbers.  Barry Bonds
finished last season at 0.795.

I welcome comments and suggestions.

Kevin

League OPI: 0.410
League BA:  0.252
League SLG: 0.375
League OBA: 0.321

Rank Player                 OPI     BA    SLG    OBA
-----------------------------------------------------
1    Phi,daulton           1.101  0.333  0.875  0.515
2    Phi,kruk              1.069  0.429  0.821  0.529
3    Cub,grace             1.007  0.452  0.742  0.514
4    Cub,may               0.931  0.389  0.889  0.421
5    Col,boston            0.888  0.545  0.545  0.545
6    Pit,bell              0.873  0.429  0.714  0.467
7    Col,galarraga         0.867  0.458  0.708  0.458
8    StL

In [261]:
stopset = set(stopwords.words('english'))
stopset.update(['subject', 'organisation', 'lines', 'article', 'NNTP-Posting-Host','weeks', 'little','early', 'comments',
                'suggestions','last', 'sign','forget','consider','go','keep','keeps','looking','please','>','mail','computer',
                'science','list','include','tonight','charles','pretty','much','rap','something','nothing','think','care'
                'things','perhaps','leave','nichols','thanks','robert','nntp','however','matter','mind','mark','obviously','edu'
                ,'two','first','back','would','re','cs','00','000','com','one','may','like','get','three','writes','maybe','also',
                'year','morris','really','even','people','long','never','see','university','since','11','ca','say','0010','music'
                ,'755','002','let','john','ever','doesn','got','number','david','come','12','said','001 100','lot'
                ,'000th','002251w','001211 18457','na','001','0023','10','003','005','ramsey', '003015','006','beloved'
                '0000ahc', 'udcps3','cps', '0000ahc', 'udcps3','cps', '004746', 'many', 'hr' ,'452','007','dave'
                
                
               ])

###  TF-IDF Vectorizing

#### Bigrams

In [262]:

vectorizer = TfidfVectorizer(stop_words= stopset,use_idf=True, ngram_range=(1, 2))

X = vectorizer.fit_transform(corpus)

In [263]:
X[0]

<1x88110 sparse matrix of type '<type 'numpy.float64'>'
	with 127 stored elements in Compressed Sparse Row format>

In [264]:
print(X[0])

  (0, 9427)	0.0980225542251
  (0, 25448)	0.0980225542251
  (0, 87747)	0.0980225542251
  (0, 68574)	0.0980225542251
  (0, 37441)	0.0980225542251
  (0, 48914)	0.0791743243739
  (0, 32312)	0.0980225542251
  (0, 53728)	0.0980225542251
  (0, 76454)	0.0980225542251
  (0, 84997)	0.0980225542251
  (0, 56236)	0.0775729316316
  (0, 31221)	0.0980225542251
  (0, 49154)	0.0980225542251
  (0, 46160)	0.0980225542251
  (0, 62324)	0.0980225542251
  (0, 67290)	0.0980225542251
  (0, 47280)	0.0980225542251
  (0, 54019)	0.0599077192399
  (0, 77693)	0.0980225542251
  (0, 29553)	0.0980225542251
  (0, 25469)	0.0980225542251
  (0, 17677)	0.0980225542251
  (0, 31216)	0.0980225542251
  (0, 25471)	0.0885984392995
  (0, 32551)	0.0980225542251
  :	:
  (0, 78590)	0.0603260945227
  (0, 39473)	0.0667163220843
  (0, 21552)	0.0522403830207
  (0, 16195)	0.0715657188027
  (0, 40622)	0.0456754705225
  (0, 67414)	0.07366157054
  (0, 57206)	0.14732314108
  (0, 18581)	0.132105929938
  (0, 20203)	0.139500418897
  (0, 8631)	0.0

In [265]:
X.shape

(994, 88110)

### Applying LSA

In [266]:
lsa = TruncatedSVD(n_components=15, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=15, n_iter=100,
       random_state=None, tol=0.0)

In [267]:
lsa.components_[2]

array([ 0.00306077, -0.00450568, -0.01105854, ..., -0.00070911,
        0.000802  ,  0.00065452])

In [268]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
team
game
baseball
games
players
good
organization
runs
hit
time
 
Concept 1:
team
braves
news
could
good
00bjgood leo
distribution
bonds
home
didn
 
Concept 2:
team
know
day
runs
didn
0096a95c
better
base
season
baseball
 
Concept 3:
game
team
games
time
baseball
0062 lafibm
organization
home
season
host
 
Concept 4:
season
play
best
host
won
league
new
player
runs
0096a95c a0cbe0e8
 
Concept 5:
still
ball
better
organization
braves
ll
usa
anyone
toronto
fan
 
Concept 6:
hit
players
time
know
00bjgood leo
teams
runs
big
new
host
 
Concept 7:
posting
game
runs
braves
0096b0f0 c5de05a0
pitching
might
posting host
host
0096b0f0
 
Concept 8:
baseball
players
host
game
fans
could
roger
win
ball
00bjgood leo
 
Concept 9:
team
right
time
best
player
world
second
hit
games
roger
 
Concept 10:
game
braves
pitching
posting host
0062 lafibm
005314
good
play
mets
guys
 
Concept 11:
games
runs
0096b0f0 c5de05a0
ve
ll
know
winning
won
career
pitcher
 
Concept 12:
sox
games
00bjgood
host


#### Trigrams

In [272]:
vectorizer = TfidfVectorizer(stop_words= stopset,use_idf=True, ngram_range=(1, 3))

X = vectorizer.fit_transform(corpus)

In [279]:
lsa = TruncatedSVD(n_components=15, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=15, n_iter=100,
       random_state=None, tol=0.0)

In [280]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
team
game
baseball
players
games
good
jewish
organization
hit
runs
 
Concept 1:
005314
win
time
posting
hit
runs
player
well
braves
games
 
Concept 2:
team
runs
play
braves
0062 lafibm lafayette
organization
gant
pitching
won
sox
 
Concept 3:
league
0096a95c a0cbe0e8
won
williams
braves
player
play
mets
005314 5700
base
 
Concept 4:
host
posting
anyone
distribution
run
rbi
roger
good
could
game
 
Concept 5:
team
well
better
hitter
williams
season
make
005314 5700 mnemosyne
around
michael
 
Concept 6:
good
way
day
point
players
jhu
play
stats
hp
0062 lafibm lafayette
 
Concept 7:
hit
baseball
good
0096a95c
005314
know
fans
time
game
anyone
 
Concept 8:
0062
good
runs
know
0062 lafibm lafayette
reply
games
players
posting host
005314
 
Concept 9:
0096a95c a0cbe0e8
news
organization
michael
good
posting host
could
time
win
jewish
 
Concept 10:
game
play
win
005314
roger
gant
0062 lafibm lafayette
0062 lafibm
best
001211 18457 adobe
 
Concept 11:
runs
good
player
well
players
ny