In [6]:
import numpy as np 
import pandas as pd 

import nltk
import string

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.utils import simple_preprocess

In [7]:
#text preprocessing (same as last week's lab)
reviews = pd.read_csv("imdbReviews.csv")

reviews.head()

Unnamed: 0,Index,URL,Text,Sentiment
0,3617,http://www.imdb.com/title/tt0210075/usercomments,Girlfight follows a project dwelling New York ...,POS
1,3671,http://www.imdb.com/title/tt0337640/usercomments,Hollywood North is an euphemism from the movie...,POS
2,3157,http://www.imdb.com/title/tt0303549/usercomments,That '70s Show is definitely the funniest show...,POS
3,660,http://www.imdb.com/title/tt0716825/usercomments,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,265,http://www.imdb.com/title/tt0182225/usercomments,"A series of random, seemingly insignificant th...",POS


In [8]:
reviews['Sentiment'].value_counts()

Sentiment
POS    1000
NEG    1000
Name: count, dtype: int64

In [10]:
#save the labels and encode them as 1 and 0 for future classification/clustering
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
label = enc.fit_transform(reviews['Sentiment'])
print(label[:10])
print(reviews['Sentiment'][:10])

[1 1 1 1 1 1 1 1 1 1]
0    POS
1    POS
2    POS
3    POS
4    POS
5    POS
6    POS
7    POS
8    POS
9    POS
Name: Sentiment, dtype: object


In [11]:
#change the text column datatype to string
reviews = reviews.astype({'Text':'string'})

In [12]:
#get the review text for preprocessing
text = reviews['Text']

In [13]:
text1 = []

for review in text:
    #print(sentence)
    #remove punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))  
    # remove digits/numbers
    review = review.translate(str.maketrans('', '', string.digits))
    #change to lowercase
    review = review.lower()
    #print(sentence)
    text1.append(review)

In [14]:
text1 = pd.Series(text1)

In [15]:
#remove stop words
    
#Setting English stopwords
stop_words = set(stopwords.words('english'))

text1 = text1.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [16]:
#apply stemming
ps = nltk.PorterStemmer()

text1 = text1.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
text1[:5]

0    girlfight follow project dwell new york high s...
1    hollywood north euphem movi industri went cana...
2    show definit funniest show current tv start wa...
3    minut pure holiday terror okay scari sure funt...
4    seri random seemingli insignific theft sister ...
dtype: object

In [17]:
reviews1 = list(zip(text1, label))

In [18]:
reviewsP = pd.DataFrame (reviews1, columns = ['Review', 'Sentiment'])
reviewsP

Unnamed: 0,Review,Sentiment
0,girlfight follow project dwell new york high s...,1
1,hollywood north euphem movi industri went cana...,1
2,show definit funniest show current tv start wa...,1
3,minut pure holiday terror okay scari sure funt...,1
4,seri random seemingli insignific theft sister ...,1
...,...,...
1995,tourist head ireland school trip learn druid e...,0
1996,two film use scare peopl god event horizon one...,0
1997,ulis literatur teacher arriv coastal town fell...,0
1998,um okay guess get whole shakycam gorillastyl f...,0


In [19]:
data = reviewsP.Review.values.tolist()

In [20]:
data

['girlfight follow project dwell new york high school girl sens futil world amateur box find self esteem purpos much although film box box film much almost smell sweat technic artist good shoot sens honesti realiti girlfight chick flick rocki rather human drama even viewer dont know box abl connect withgirlfight follow project dwell new york high school girl sens futil world amateur box find self esteem purpos much',
 'hollywood north euphem movi industri went canada make movi tax break cheaper cost civil citi like toronto case later vancouv peter obrian director probabl saw lot invad california movi seem right way deal arriv person tri capit econom canada presentedneedless say moon lantern success novel written canadian author turn flight bogota noth origin film great egotist hasbeen michael bayt obsess happen iran offer lead part turn disasterth film seem say mani cook spoil broth seem case ultim product save produc bobbi myer help sandi ryan around make documentari film shot toronto

In [22]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))

words = list(sent_to_words(data))

In [23]:
print(words[:1][0][:30])

['girlfight', 'follow', 'project', 'dwell', 'new', 'york', 'high', 'school', 'girl', 'sens', 'futil', 'world', 'amateur', 'box', 'find', 'self', 'esteem', 'purpos', 'much', 'although', 'film', 'box', 'box', 'film', 'much', 'almost', 'smell', 'sweat', 'technic', 'artist']


In [24]:
import gensim.corpora as corpora                      

# Create Dictionary
id2word = corpora.Dictionary(words)                   

# Create Corpus
texts = words                                        

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]    

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 5), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 2), (12, 1), (13, 2), (14, 2), (15, 1), (16, 2), (17, 2), (18, 2), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 3), (26, 2), (27, 2), (28, 2), (29, 1)]


In [25]:
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

[(0,
  '0.011*"film" + 0.007*"one" + 0.005*"charact" + 0.005*"movi" + 0.004*"time" '
  '+ 0.004*"great" + 0.004*"peopl" + 0.004*"show" + 0.004*"stori" + '
  '0.004*"way"'),
 (1,
  '0.014*"film" + 0.008*"movi" + 0.006*"one" + 0.006*"would" + 0.006*"stori" + '
  '0.005*"good" + 0.005*"like" + 0.005*"time" + 0.004*"make" + 0.004*"peopl"'),
 (2,
  '0.024*"movi" + 0.012*"film" + 0.008*"time" + 0.007*"good" + 0.007*"one" + '
  '0.006*"like" + 0.005*"make" + 0.005*"watch" + 0.004*"stori" + '
  '0.004*"first"'),
 (3,
  '0.012*"film" + 0.010*"movi" + 0.007*"one" + 0.005*"like" + 0.005*"much" + '
  '0.004*"make" + 0.004*"good" + 0.004*"play" + 0.003*"see" + 0.003*"seem"'),
 (4,
  '0.023*"film" + 0.010*"movi" + 0.009*"one" + 0.005*"get" + 0.005*"charact" + '
  '0.004*"like" + 0.004*"dont" + 0.004*"make" + 0.004*"even" + 0.004*"time"'),
 (5,
  '0.022*"movi" + 0.014*"film" + 0.009*"one" + 0.008*"like" + 0.007*"see" + '
  '0.005*"watch" + 0.005*"stori" + 0.005*"realli" + 0.004*"time" + '
  '0.004*"m

In [26]:
pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting numexpr (from pyldavis)
  Downloading numexpr-2.14.1-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting funcy (from pyldavis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 2.6/2.6 MB 16.7 MB/s eta 0:00:00
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Downloading numexpr-2.14.1-cp311-cp311-win_amd64.whl (160 kB)
Installing collected packages: funcy, numexpr, pyldavis

   ------------- -------------------------- 1/3 [numexpr]
   ------------- -------------------------- 1/3 [numexpr]
   -------------------------- ------------- 2/3 [pyldavis]
   -------------------------- ------------- 2/3 [pyldavis]
   -------------------------- ------------- 2/3 [pyldavis]
   -------------------------- ------------- 2/3 [p

In [None]:
import pyLDAvis.gensim_models
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

LDAvis
# by sliding the lambda value, we can see how the term relevance changes. 1 means osme frequent terms that are also can be seem in other topics, while 0 means the terms that are more unique to the topic.