# Example (Latent Semantic Analysis)

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# load negative reviews 
corpus_neg = pd.read_fwf('Data/MovieSentiment/rt-polarity.neg',header=None,names=['movie_review','sentiment'])
corpus_neg['sentiment'] = 'neg'
print(corpus_neg.shape)
corpus_neg.head()

(5331, 2)


Unnamed: 0,movie_review,sentiment
0,"simplistic , silly and tedious .",neg
1,"it's so laddish and juvenile , only teenage bo...",neg
2,exploitative and largely devoid of the depth o...,neg
3,[garbus] discards the potential for pathologic...,neg
4,a visually flashy but narratively opaque and e...,neg


In [3]:
# load positive reviews 
corpus_pos = pd.read_fwf('Data/MovieSentiment/rt-polarity.pos',header=None,names=['movie_review','sentiment'])
corpus_pos['sentiment'] = 'pos'
print(corpus_pos.shape)
corpus_pos.head()

(5331, 2)


Unnamed: 0,movie_review,sentiment
0,the rock is destined to be the 21st century's ...,pos
1,"the gorgeously elaborate continuation of "" the...",pos
2,effective but too-tepid biopic,pos
3,if you sometimes like to go to the movies to h...,pos
4,"emerges as something rare , an issue movie tha...",pos


In [4]:
# concatenate negative and positive reviews,randomize order and reset index
corpus = pd.concat([corpus_neg,corpus_pos]).sample(frac=1.0,random_state=0).reset_index(drop=True)
corpus.sentiment = corpus.sentiment.astype('category')
print(corpus.shape)
corpus.head()

(10662, 2)


Unnamed: 0,movie_review,sentiment
0,the sentimental cliches mar an otherwise excel...,pos
1,"if you love the music , and i do , its hard to...",pos
2,"though harris is affecting at times , he canno...",neg
3,poignant japanese epic about adolescent anomie...,pos
4,"cantet perfectly captures the hotel lobbies , ...",pos


In [5]:
# print first 3 reviews
for ix in range (3):# concatenate negative and positive reviews,randomize order and reset index
    print(corpus.movie_review[ix])
    print('----------')
    print(corpus.sentiment[ix])
    print()

the sentimental cliches mar an otherwise excellent film . a powerful performance from mel gibson and a brutal 90-minute battle sequence that does everything but issue you a dog-tag and an m-16 .
----------
pos

if you love the music , and i do , its hard to imagine having more fun watching a documentary . . .
----------
pos

though harris is affecting at times , he cannot overcome the sense that pumpkin is a mere plot pawn for two directors with far less endearing disabilities .
----------
neg



In [6]:
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words= 'english')
attributes = vectorizer.fit_transform(corpus.movie_review)
vocabulary = pd.Series(vectorizer.get_feature_names())

In [7]:
attributes.shape

(10662, 18067)

In [8]:
vocabulary.shape

(18067,)

In [9]:
vocabulary.head()

0     00
1    000
2    007
3     10
4    100
dtype: object

In [10]:
print(attributes)

  (0, 31)	0.30171901115514116
  (0, 15819)	0.2889508680939305
  (0, 4660)	0.21861479119282762
  (0, 8574)	0.24044207592555794
  (0, 4658)	0.15915668831837568
  (0, 14101)	0.23082557252793925
  (0, 1413)	0.22767393286434728
  (0, 10158)	0.19514417106843365
  (0, 165)	0.2175097614810294
  (0, 2098)	0.24044207592555794
  (0, 6779)	0.26712358336120023
  (0, 9965)	0.26712358336120023
  (0, 11587)	0.16815914209800692
  (0, 12074)	0.19912117505824092
  (0, 6087)	0.09205747471999999
  (0, 5599)	0.21158797500225573
  (0, 9755)	0.30171901115514116
  (0, 2849)	0.20738305603364804
  (0, 14085)	0.21976001250604862
  (1, 4648)	0.33480127894870537
  (1, 17542)	0.3424970674175355
  (1, 6558)	0.3187507424270541
  (1, 7359)	0.41312287168421535
  (1, 7930)	0.4148622238748484
  (1, 7297)	0.3310581378002287
  :	:
  (10658, 14329)	0.23068997552081705
  (10659, 14547)	0.3720543034366151
  (10659, 2425)	0.3720543034366151
  (10659, 7799)	0.3720543034366151
  (10659, 12098)	0.3720543034366151
  (10659, 17765)	

In [11]:
attributes.indices

array([   31, 15819,  4660, ...,  8403, 16217,  6087], dtype=int32)

In [12]:
attributes.indptr

array([     0,     19,     27, ..., 102985, 102991, 102999], dtype=int32)

In [13]:
attributes.indices[0:19]

array([   31, 15819,  4660,  8574,  4658, 14101,  1413, 10158,   165,
        2098,  6779,  9965, 11587, 12074,  6087,  5599,  9755,  2849,
       14085], dtype=int32)

In [14]:
attributes.indices[19:27]

array([ 4648, 17542,  6558,  7359,  7930,  7297, 10497,  9514],
      dtype=int32)

In [15]:
review = []
words   = [] 
tfidf   = []
ptr  = attributes.indptr
ix   = attributes.indices
data = attributes.data
k = 0
for i in range(len(ptr)-1):
     word_list = vocabulary[ix[ptr[i]:ptr[i+1]]]
     data_list = data[ix[ptr[i]:ptr[i+1]]]
     for j in range(len(word_list)):
            words.append(word_list.iloc[j])
            tfidf.append(data_list[j])
            review.append(k)
     k = k + 1
    
df = pd.DataFrame()
df['review'] = review
df['words']   = words
df['tfidf']  = tfidf
df.head(3)

Unnamed: 0,review,words,tfidf
0,0,16,0.362919
1,0,tag,0.419458
2,0,dog,0.191063


In [16]:
df.head(25)

Unnamed: 0,review,words,tfidf
0,0,16,0.362919
1,0,tag,0.419458
2,0,dog,0.191063
3,0,issue,0.434604
4,0,does,0.21688
5,0,sequence,0.242235
6,0,battle,0.320308
7,0,minute,0.271665
8,0,90,0.318394
9,0,brutal,0.588853


In [17]:
df = df.merge(corpus.sentiment,left_on='review',right_index=True)
df.head(30)

Unnamed: 0,review,words,tfidf,sentiment
0,0,16,0.362919,pos
1,0,tag,0.419458,pos
2,0,dog,0.191063,pos
3,0,issue,0.434604,pos
4,0,does,0.21688,pos
5,0,sequence,0.242235,pos
6,0,battle,0.320308,pos
7,0,minute,0.271665,pos
8,0,90,0.318394,pos
9,0,brutal,0.588853,pos


In [18]:
corpus.movie_review.iloc[1] #article number 1

'if you love the music , and i do , its hard to imagine having more fun watching a documentary . . .'

In [19]:
df[df.review==1]

Unnamed: 0,review,words,tfidf,sentiment
19,1,documentary,0.261074,pos
20,1,watching,0.162054,pos
21,1,fun,0.242207,pos
22,1,having,0.453376,pos
23,1,imagine,0.196357,pos
24,1,hard,0.366753,pos
25,1,music,0.240651,pos
26,1,love,0.255954,pos


In [20]:
word_ratings = df.pivot_table(index = "words",
                             columns = "sentiment",
                             values = "tfidf",
                             aggfunc = "sum")
word_ratings.head()

sentiment,neg,pos
words,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.301719,
0,0.577902,
7,0.43723,
10,3.606631,1.923537
100,1.43241,0.318313


In [21]:
word_ratings_diff = word_ratings.pos - word_ratings.neg
word_ratings_diff.head()

words
00          NaN
000         NaN
007         NaN
10    -1.683095
100   -1.114097
dtype: float64

In [22]:
word_ratings_diff.sort_values(ascending=False).head(10)

words
film            61.148300
best            35.149080
life            29.581342
entertaining    24.146976
look            23.103809
manages         20.403439
performances    20.135124
makes           20.111728
portrait        19.439150
love            18.172733
dtype: float64

In [23]:
word_ratings_diff.sort_values(ascending=True).head(10) #negative review

words
movie     -70.295190
just      -40.400646
bad       -30.029831
minutes   -29.072627
feels     -26.306481
like      -23.436447
script    -22.665584
doesn     -22.021548
dull      -20.736435
action    -19.862206
dtype: float64

In [24]:
attributes.shape

(10662, 18067)

In [25]:
svd = TruncatedSVD(n_components = 30)
attributes_compressed = svd.fit_transform(attributes)
svd.explained_variance_ratio_.sum().round(3)

0.047

In [27]:
attributes_compressed = pd.DataFrame(attributes_compressed)
df = pd.concat([corpus,attributes_compressed], axis=1)
df.head(3)
#negative numbers -> cannot use Naive Bayes

Unnamed: 0,movie_review,sentiment,0,1,2,3,4,5,6,7,...,20,21,22,23,24,25,26,27,28,29
0,the sentimental cliches mar an otherwise excel...,pos,0.077596,0.076318,0.025737,-0.012952,-0.004265,-0.005886,0.006199,-0.003257,...,-0.003226,0.002702,0.008894,-0.018016,-0.036273,0.014371,-0.023031,0.027625,0.008321,-0.008989
1,"if you love the music , and i do , its hard to...",pos,0.095755,0.011204,-0.074454,-2e-05,-0.022782,-0.02253,0.010818,-0.034613,...,-0.128788,0.066406,0.033606,-0.017808,0.144434,-0.000746,-0.082103,0.054126,-0.140468,-0.058364
2,"though harris is affecting at times , he canno...",neg,0.040396,0.009843,-0.016318,0.009309,-0.002316,-0.016082,0.018887,-0.01184,...,0.009748,0.015176,-0.00506,-0.012776,-0.008562,0.014267,-0.00607,-0.021437,-0.010269,0.00447


In [29]:
k = 1
ix = [0,1,2,3,4,5,-5,-4,-3,-2,-1] #top 5 and bottoms 5
df.sort_values(k,ascending=False).iloc[ix,:][['movie_review','sentiment',k]].round(2)

Unnamed: 0,movie_review,sentiment,1
6238,do not see this film .,neg,0.7
5447,a funny film .,pos,0.35
6198,a compelling film .,pos,0.34
1420,the film is full of charm .,pos,0.32
4340,the film is one of the year's best .,pos,0.31
1467,the film is flat .,neg,0.29
3856,this is a very fine movie -- go see it .,pos,-0.3
6180,"i just saw this movie . . . well , it's probab...",pos,-0.31
9862,nothing about this movie works .,neg,-0.31
3183,a very funny movie .,pos,-0.4


In [34]:
components = svd.components_
k = 1
components = pd.Series(components[k,:], index = vocabulary)
components.sort_values(ascending = False)[ix].round(2)

film            0.70
best            0.04
performances    0.03
story           0.03
year            0.03
moving          0.03
doesn          -0.02
funny          -0.03
just           -0.04
bad            -0.07
movie          -0.67
dtype: float64