In [33]:
! pip install session_info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
! pip install gensim==3.8.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
! pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [36]:
import nltk
import re
import session_info

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import CoherenceModel
from gensim.models import LsiModel, LdaModel

In [37]:
# Use seaborn's 'talk' plotting context (larger plot elements) for any figures drawn below.
sns.set_context('talk')

In [38]:
# Report the versions of the packages in use, for reproducibility.
session_info.show()

In [39]:
# Fetch the NLTK resources the preprocessing steps below rely on.
nltk.download('stopwords')  # English stop-word list used to build STOP_WORDS
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # presumably needed alongside wordnet by the lemmatizer — TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data

In [40]:
# Corpus CSV; the path is relative, so the file must be in the working directory.
data_file = 'DSP453_ClassCorpus_v1.csv'

In [41]:
# Load the corpus into a DataFrame (the review text lives in the 'Text' column).
class_corpus = pd.read_csv(data_file)

In [42]:
# Summarize columns, non-null counts and dtypes of the raw corpus.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Doc_ID                    100 non-null    int64 
 1   DSI_Title                 100 non-null    object
 2   Text                      100 non-null    object
 3   Submission File Name      100 non-null    object
 4   Student Name              100 non-null    object
 5   Genre of Movie            100 non-null    object
 6   Review Type (pos or neg)  100 non-null    object
 7   Movie Title               100 non-null    object
dtypes: int64(1), object(7)
memory usage: 6.4+ KB


# Preprocessing

In [43]:
# English stop words as a set, so membership tests in remove_stop_words are O(1).
STOP_WORDS = set(stopwords.words('english'))

In [44]:
def remove_punctuation(text):
    '''
    Replace every character that is not an ASCII letter with a space.

    The input is coerced with str() first, so non-string values are accepted.
    Despite the name, digits and whitespace characters are replaced as well.
    '''
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', str(text))

def lower_case(text):
    '''Return ``text`` converted to lower case.'''
    lowered = text.lower()
    return lowered

def remove_tags(text):
    '''
    Replace markup tags in ``text`` with a single space.

    Both literal tags (e.g. ``<br />``) and entity-escaped tags
    (e.g. ``&lt;br /&gt;``) are matched. The previous pattern only matched
    the entity-escaped form, so real HTML tags were never removed.

    Each tag is replaced by a plain space. The previous replacement inserted
    a literal ``&lt;&gt;`` placeholder, which later cleaning steps reduce to
    the spurious words "lt" and "gt".
    '''
    # Non-greedy so that each tag is matched on its own rather than
    # everything between the first opener and the last closer.
    return re.sub(r"</?.*?>|&lt;/?.*?&gt;", " ", text)

def remove_special_chars_and_digits(text):
    '''
    Collapse each run of digits and/or non-word characters into a single space.
    '''
    digits_or_nonword = re.compile("(\\d|\\W)+")
    return digits_or_nonword.sub(" ", text)

def remove_stop_words(tokenized_text):
    '''Return the tokens that are not in STOP_WORDS, preserving their order.'''
    kept = []
    for token in tokenized_text:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept

In [45]:
def normalize(input_text):
    '''
    Normalization involves the following steps:
    1. Remove tags (i.e., HTML tags)
    2. Remove punctuation
    3. Lower case all words
    4. Remove all special characters and digits

    Tags are removed first on purpose: remove_punctuation blanks out every
    non-letter, including the delimiters remove_tags searches for. Running it
    earlier turned the tag step into a no-op and let tag names leak into the
    text as ordinary words.

    Returns the cleaned text as a single string.
    '''
    # str() mirrors the coercion remove_punctuation performs, so non-string
    # inputs are still accepted now that remove_tags sees the value first.
    text = remove_tags(str(input_text))
    text = remove_punctuation(text)
    text = lower_case(text)
    text = remove_special_chars_and_digits(text)

    return text

In [46]:
def tokenize(text):
    '''
    Split ``text`` into tokens with nltk.word_tokenize, then drop the
    stop words from the result.

    Returns the remaining tokens as a list.
    '''
    return remove_stop_words(nltk.word_tokenize(text))

In [47]:
def lemmatize(tokenized_text, lemmatizer=WordNetLemmatizer()):
    '''
    Lemmatize every token in ``tokenized_text`` and return the lemmas as a
    list, in the same order.

    The default lemmatizer is created once, when the function is defined,
    and reused by every call that does not pass its own.
    '''
    lemmas = []
    for word in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [48]:
def join_tokens(lemmatized_tokens):
    '''Concatenate the tokens into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(lemmatized_tokens)

In [49]:
# Run the preprocessing pipeline, keeping every intermediate stage as its own column:
# raw 'Text' -> normalized string -> stop-word-free tokens -> lemmatized tokens -> re-joined string.
class_corpus['normalized_review'] = class_corpus['Text'].apply(normalize)
class_corpus['tokenized_review'] = class_corpus['normalized_review'].apply(tokenize)
class_corpus['lemmatized_tokens'] = class_corpus['tokenized_review'].apply(lemmatize)
class_corpus['lemmatized_text'] = class_corpus['lemmatized_tokens'].apply(join_tokens)

In [50]:
# Confirm the four derived columns were added to the corpus.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Doc_ID                    100 non-null    int64 
 1   DSI_Title                 100 non-null    object
 2   Text                      100 non-null    object
 3   Submission File Name      100 non-null    object
 4   Student Name              100 non-null    object
 5   Genre of Movie            100 non-null    object
 6   Review Type (pos or neg)  100 non-null    object
 7   Movie Title               100 non-null    object
 8   normalized_review         100 non-null    object
 9   tokenized_review          100 non-null    object
 10  lemmatized_tokens         100 non-null    object
 11  lemmatized_text           100 non-null    object
dtypes: int64(1), object(11)
memory usage: 9.5+ KB


# Latent Semantic Analysis (LSA)

In [51]:
# Number of topics for the LSA model fitted below.
# Try 2, 4, 6 and 8 and compare the themes that emerge in the topics.
NUM_TOPICS = 2

In [52]:
# One list of lemmatized tokens per document.
processed_text = class_corpus['lemmatized_tokens'].tolist()

In [53]:
# Vocabulary: maps each unique token in the corpus to an integer id.
dictionary = corpora.Dictionary(processed_text)

In [54]:
# Bag-of-words corpus: one doc2bow result per document, built against `dictionary`.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_text]

In [55]:
# Fit LSA (gensim's LsiModel) with NUM_TOPICS topics on the bag-of-words corpus.
lsamodel = LsiModel(doc_term_matrix,
                    num_topics=NUM_TOPICS,
                    id2word=dictionary,
                    power_iters=100)

In [56]:
# Show the 10 highest-weighted terms of each LSA topic.
lsamodel.print_topics(num_words=10)

[(0,
  '0.373*"movie" + 0.327*"film" + 0.231*"one" + 0.222*"bond" + 0.217*"time" + 0.166*"like" + 0.136*"get" + 0.119*"even" + 0.111*"character" + 0.109*"u"'),
 (1,
  '0.642*"bond" + 0.242*"craig" + 0.222*"time" + 0.214*"die" + 0.157*"james" + -0.134*"holmes" + -0.119*"get" + -0.114*"film" + -0.099*"watson" + -0.097*"u"')]

In [57]:
# Candidate topic counts to compare.
topics = [2, 4, 6, 8]
# Maps '<k> topics' -> c_v coherence of the LSA model fitted with k topics.
coherence_values = {}

for t in topics:
    # Fit into a loop-local name. Rebinding `lsamodel` here would silently
    # replace the NUM_TOPICS model fitted and inspected with print_topics
    # above by the last candidate, so re-running that cell out of order
    # would report different topics.
    candidate_lsa = LsiModel(doc_term_matrix,
                             num_topics=t,
                             id2word=dictionary,
                             power_iters=100)

    coherence_model_lsa = CoherenceModel(model=candidate_lsa,
                                         dictionary=dictionary,
                                         texts=processed_text,
                                         coherence='c_v')

    coherence_lsa = coherence_model_lsa.get_coherence()

    coherence_values[str(t) + ' topics'] = coherence_lsa


In [58]:
# c_v coherence of the LSA model for each candidate topic count.
print(coherence_values)

{'2 topics': 0.5120108356375777, '4 topics': 0.5465698609462359, '6 topics': 0.5534136861822961, '8 topics': 0.512325677049202}


# Latent Dirichlet Allocation (LDA)

The implementation of LDA in gensim needs careful tuning of two hyperparameters: `iterations` and `passes`. The [documentation](https://radimrehurek.com/gensim/models/ldamodel.html) and the suggested [tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html) both recommend using high values for these parameters.

In [59]:
# Number of topics for the LDA model fitted below (rebinds the NUM_TOPICS used for LSA).
# Try 2, 4, 6 and 8 and compare the themes that emerge in the topics.
NUM_TOPICS = 4

In [60]:
# Fit LDA with NUM_TOPICS topics on the bag-of-words corpus.
ldamodel = LdaModel(doc_term_matrix,
                    num_topics=NUM_TOPICS,
                    id2word = dictionary,
                    alpha='auto',  # per gensim docs: learn the document-topic prior from the corpus
                    eta='auto',    # per gensim docs: learn the topic-word prior from the corpus
                    eval_every=1,  # per gensim docs: estimate log perplexity after every update (slows training)
                    iterations=100,
                    passes=20,
                    random_state=20130810)  # fixed seed so the fitted topics are reproducible



In [61]:
# Build the pyLDAvis data for the fitted LDA model.
# sort_topics=False asks pyLDAvis to keep the model's own topic order
# instead of re-sorting topics (per the pyLDAvis docs).
lda_display = gensimvis.prepare(ldamodel, 
                                doc_term_matrix, 
                                dictionary, 
                                sort_topics=False)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [62]:
# Render the interactive topic visualization in the notebook output.
pyLDAvis.display(lda_display)

In [63]:
# Show the 10 highest-weighted terms of each LDA topic.
ldamodel.print_topics(num_words=10)

[(0,
  '0.007*"oblivion" + 0.006*"jack" + 0.005*"cruise" + 0.005*"movie" + 0.005*"film" + 0.004*"earth" + 0.004*"one" + 0.004*"victoria" + 0.004*"kosinski" + 0.003*"harper"'),
 (1,
  '0.010*"movie" + 0.008*"one" + 0.008*"film" + 0.006*"toxic" + 0.005*"avenger" + 0.005*"get" + 0.005*"time" + 0.004*"troma" + 0.003*"way" + 0.003*"u"'),
 (2,
  '0.009*"film" + 0.005*"movie" + 0.005*"like" + 0.005*"bryan" + 0.005*"one" + 0.003*"equilibrium" + 0.003*"kim" + 0.003*"also" + 0.003*"emotion" + 0.003*"bale"'),
 (3,
  '0.011*"movie" + 0.011*"film" + 0.007*"time" + 0.007*"one" + 0.006*"bond" + 0.006*"like" + 0.004*"get" + 0.004*"even" + 0.004*"holmes" + 0.004*"character"')]

In [64]:
# Candidate topic counts to compare.
topics = [2, 4, 6, 8]
# Maps '<k> topics' -> c_v coherence of the LDA model fitted with k topics.
coherence_values = {}

for t in topics:
    # Fit into a loop-local name. Rebinding `ldamodel` here would leave the
    # notebook holding the last candidate instead of the NUM_TOPICS model
    # that the visualization and print_topics cells above inspect.
    # eval_every is left at its default here, unlike the standalone model above.
    candidate_lda = LdaModel(doc_term_matrix,
                             num_topics=t,
                             id2word=dictionary,
                             alpha='auto',
                             eta='auto',
                             iterations=100,
                             passes=20,
                             random_state=20130810)

    coherence_model_lda = CoherenceModel(model=candidate_lda,
                                         dictionary=dictionary,
                                         texts=processed_text,
                                         coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values[str(t) + ' topics'] = coherence_lda



In [65]:
# c_v coherence of the LDA model for each candidate topic count.
print(coherence_values)

{'2 topics': 0.2679107725896157, '4 topics': 0.3413349012727854, '6 topics': 0.3807188793837412, '8 topics': 0.36565980466991854}
