In [1]:
#! pip install gensim==3.8.3

In [6]:
#! pip install pyLDAvis

In [7]:
import nltk
import re

import pyLDAvis
import pyLDAvis.gensim as gensimvis

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import CoherenceModel
from gensim.models import LsiModel, LdaModel

In [8]:
# Use seaborn's 'talk' plotting context for any figures drawn in this notebook.
sns.set_context('talk')

In [9]:
# Fetch the NLTK resources used by the preprocessing cells (quiet=True is passed to every call).
# stopwords: read into STOP_WORDS further down.
nltk.download('stopwords',quiet=True)
# punkt: presumably the tokenizer data behind nltk.word_tokenize -- TODO confirm.
nltk.download('punkt',quiet=True)
# wordnet / omw-1.4: presumably the data WordNetLemmatizer relies on -- TODO confirm.
nltk.download('wordnet',quiet=True)
nltk.download('omw-1.4',quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jensen116/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
def add_movie_descriptor(data: pd.DataFrame, corpus_df: pd.DataFrame) -> None:
    """
    Add a 'Descriptor' column to ``data``, built from the columns of ``corpus_df``.

    Each value has the form ``{Genre}_{Movie Title}_{P|N}_{Doc_ID}``. The flag is
    ``P`` when 'Review Type (pos or neg)' is exactly 'Positive' and ``N`` for any
    other value.

    Parameters
    ----------
    data : DataFrame that receives the new 'Descriptor' column (modified in place).
    corpus_df : DataFrame providing 'Genre of Movie', 'Movie Title',
        'Review Type (pos or neg)' and 'Doc_ID'.

    Returns
    -------
    None
    """
    # 'P' only for an exact 'Positive' match; every other review type maps to 'N'.
    review = np.where(corpus_df['Review Type (pos or neg)'] == 'Positive', 'P', 'N')
    data['Descriptor'] = (
        corpus_df['Genre of Movie'] + '_'
        + corpus_df['Movie Title'] + '_'
        + review + '_'
        + corpus_df['Doc_ID'].astype(str)
    )

def get_corpus_df(path):
    """
    Read the corpus CSV at ``path`` (UTF-8) and return it ready for analysis.

    The returned frame has a 'Descriptor' column (added by
    ``add_movie_descriptor``), is sorted by that column and is indexed by
    'Doc_ID'. 'Doc_ID' is also kept as a regular column, appended last.
    """
    corpus = pd.read_csv(path, encoding="utf-8")
    add_movie_descriptor(corpus, corpus)
    by_doc_id = corpus.sort_values(['Descriptor']).set_index(['Doc_ID'])
    # set_index removed the column; copy the index back so Doc_ID stays available as a column.
    by_doc_id['Doc_ID'] = by_doc_id.index
    return by_doc_id

# Data

In [11]:
# Location of the class corpus CSV (raw file hosted on GitHub).
CORPUS_PATH = (
    'https://raw.githubusercontent.com/djp840/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_QA_20220906.csv'
)
class_corpus = get_corpus_df(CORPUS_PATH)

In [12]:
# Inspect the columns, dtypes and non-null counts of the freshly loaded corpus.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 40 to 199
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DSI_Title                 200 non-null    object
 1   Text                      200 non-null    object
 2   Submission File Name      200 non-null    object
 3   Student Name              200 non-null    object
 4   Genre of Movie            200 non-null    object
 5   Review Type (pos or neg)  200 non-null    object
 6   Movie Title               200 non-null    object
 7   Descriptor                200 non-null    object
 8   Doc_ID                    200 non-null    int64 
dtypes: int64(1), object(8)
memory usage: 15.6+ KB


# Preprocessing

In [13]:
# English stop words from NLTK, kept in a set for the membership tests in remove_stop_words.
STOP_WORDS = set(stopwords.words('english'))

In [14]:
def remove_punctuation(text):
    """
    Return ``str(text)`` with every character outside a-z / A-Z replaced by a space.

    Despite the name, digits and whitespace are replaced too. Each character is
    replaced individually, so runs are not collapsed.
    """
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', str(text))

def lower_case(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """
    Replace every HTML/XML-style tag in ``text`` with the placeholder ``" <> "``.

    A tag is the shortest span that starts with ``<`` (optionally ``</``) and ends
    at the next ``>`` on the same line (``.`` does not match newlines).

    The pattern previously contained the HTML-escaped entities ``&lt;`` / ``&gt;``
    instead of the literal angle brackets, so real tags were never matched.
    """
    return re.sub("</?.*?>", " <> ", text)

def remove_special_chars_and_digits(text):
    """Collapse each run of digits and/or non-word characters in ``text`` into one space."""
    digit_or_nonword_run = re.compile("(\\d|\\W)+")
    return digit_or_nonword_run.sub(" ", text)

def remove_stop_words(tokenized_text):
    """Return, in order, the tokens of ``tokenized_text`` that are not in ``STOP_WORDS``."""
    kept_tokens = []
    for token in tokenized_text:
        if token not in STOP_WORDS:
            kept_tokens.append(token)
    return kept_tokens

In [15]:
def normalize(input_text):
    '''
    Normalize one raw document into lower-case, letters-only text.

    Normalization involves the following steps:
    1. Remove tags (i.e., HTML tags)
    2. Remove punctuation (every non-letter character becomes a space)
    3. Lower case all words
    4. Remove all special characters and digits (runs collapse to one space)

    ``input_text`` is coerced with ``str()`` first, so non-string values
    are accepted. Returns the normalized string.
    '''
    # Tags must be stripped before punctuation: once remove_punctuation has
    # turned every '<' and '>' into a space there is nothing left for the tag
    # pattern to match, and tag names would survive as ordinary words.
    text = remove_tags(str(input_text))
    text = remove_punctuation(text)
    text = lower_case(text)
    text = remove_special_chars_and_digits(text)

    return text

In [16]:
def tokenize(text):
    '''
    Turn ``text`` into a list of tokens:
    1. Split the text into tokens with ``nltk.word_tokenize``
    2. Drop every token that is a stop word
    '''
    return remove_stop_words(nltk.word_tokenize(text))

In [17]:
def lemmatize(tokenized_text, lemmatizer=None):
    '''
    Lemmatize each word in a list of normalized tokens
    (stop words are expected to have been removed already).

    ``lemmatizer`` may be any object with a ``lemmatize(word)`` method. When it
    is omitted (or None) a ``WordNetLemmatizer`` is created at call time, so
    nothing is instantiated when the function is defined.

    Returns a new list with one lemma per input token, in the same order.
    '''
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokenized_text]

In [18]:
def join_tokens(lemmatized_tokens):
    """Concatenate the tokens into a single string, separated by single spaces."""
    separator = ' '
    return separator.join(lemmatized_tokens)

In [19]:
# Run the preprocessing pipeline, storing each stage as a new column of class_corpus.
# Raw review text -> normalized string.
class_corpus['normalized_review'] = class_corpus['Text'].apply(normalize)
# Normalized string -> list of tokens with stop words removed.
class_corpus['tokenized_review'] = class_corpus['normalized_review'].apply(tokenize)
# Token list -> list of lemmas (this column feeds the topic models below).
class_corpus['lemmatized_tokens'] = class_corpus['tokenized_review'].apply(lemmatize)
# Lemma list -> single space-separated string.
class_corpus['lemmatized_text'] = class_corpus['lemmatized_tokens'].apply(join_tokens)

In [20]:
# Confirm the four preprocessing columns were added and contain no nulls.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 40 to 199
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DSI_Title                 200 non-null    object
 1   Text                      200 non-null    object
 2   Submission File Name      200 non-null    object
 3   Student Name              200 non-null    object
 4   Genre of Movie            200 non-null    object
 5   Review Type (pos or neg)  200 non-null    object
 6   Movie Title               200 non-null    object
 7   Descriptor                200 non-null    object
 8   Doc_ID                    200 non-null    int64 
 9   normalized_review         200 non-null    object
 10  tokenized_review          200 non-null    object
 11  lemmatized_tokens         200 non-null    object
 12  lemmatized_text           200 non-null    object
dtypes: int64(1), object(12)
memory usage: 21.9+ KB


# Latent Semantic Analysis (LSA)

In [21]:
# Number of LSA topics. Try 2, 4, 6 and 8 and compare the themes in the resulting topics.
NUM_TOPICS = 2

In [22]:
# One list of lemmatized tokens per review; used below to build the dictionary, the
# bag-of-words corpus and the coherence models.
processed_text = class_corpus['lemmatized_tokens'].tolist()

In [23]:
# Build the gensim Dictionary (the vocabulary) from the tokenized reviews.
dictionary = corpora.Dictionary(processed_text)

In [24]:
# Convert every review into its bag-of-words representation via dictionary.doc2bow.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_text]

In [25]:
# Fit an LSA (LSI) model with NUM_TOPICS topics on the bag-of-words corpus.
# NOTE(review): power_iters=100 is set explicitly here and in the coherence sweep below --
# confirm this value is intended.
lsamodel = LsiModel(doc_term_matrix,
                    num_topics=NUM_TOPICS,
                    id2word=dictionary,
                    power_iters=100)

In [26]:
# Show the 10 most heavily weighted words of each LSA topic.
lsamodel.print_topics(num_words=10)

[(0,
  '0.416*"film" + 0.317*"movie" + 0.230*"one" + 0.172*"like" + 0.171*"time" + 0.132*"scene" + 0.124*"get" + 0.122*"make" + 0.114*"character" + 0.111*"even"'),
 (1,
  '-0.405*"anderton" + 0.300*"movie" + -0.262*"spielberg" + -0.259*"report" + -0.245*"minority" + -0.220*"film" + -0.156*"technology" + -0.148*"precrime" + -0.121*"future" + -0.096*"dick"')]

In [27]:
# Fit one LSA model per candidate topic count and record its c_v coherence score.
# NOTE: this loop rebinds `lsamodel`; after the cell runs it refers to the model fitted
# for the last entry of `topics`, not to the NUM_TOPICS model fitted above.
# NOTE: `topics` and `coherence_values` are reused (and overwritten) by the LDA sweep
# further down.
topics = [2, 4, 6, 8]
coherence_values = {}

for t in topics:
    lsamodel = LsiModel(doc_term_matrix,
                        num_topics=t,
                        id2word=dictionary,
                        power_iters=100)

    # c_v coherence is computed from the tokenized texts, not from the bag-of-words corpus.
    coherence_model_lsa = CoherenceModel(model=lsamodel, 
                                         dictionary=dictionary, 
                                         texts=processed_text, 
                                         coherence='c_v')
    
    coherence_lsa = coherence_model_lsa.get_coherence()

    # Keyed as '<n> topics' so the printed dict is self-describing.
    coherence_values[str(t) + ' topics'] = coherence_lsa 


In [28]:
# c_v coherence of the LSA models, keyed by topic count.
print(coherence_values)

{'2 topics': 0.28665398782892637, '4 topics': 0.2847956138114596, '6 topics': 0.3215720600003867, '8 topics': 0.3674387789524757}


# Latent Dirichlet Allocation (LDA)

The gensim implementation of LDA needs careful tuning of two hyperparameters: `iterations` and `passes`. Both the [documentation](https://radimrehurek.com/gensim/models/ldamodel.html) and the suggested [tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html) recommend using high values for both parameters.

In [29]:
# Number of LDA topics. Try 2, 4, 6 and 8 and compare the themes in the resulting topics.
NUM_TOPICS = 4

In [30]:
# Fit an LDA model with NUM_TOPICS topics; random_state is fixed so the run is repeatable.
# NOTE(review): eval_every=1 is passed here, while the coherence sweep further down omits
# eval_every and so uses the library default -- confirm whether the difference is intended.
ldamodel = LdaModel(doc_term_matrix,
                    num_topics=NUM_TOPICS,
                    id2word = dictionary,
                    alpha='auto',
                    eta='auto',
                    eval_every=1,
                    iterations=100,
                    passes=20,
                    random_state=20130810)

In [31]:
# Prepare the pyLDAvis data for the fitted LDA model.
# sort_topics=False is passed -- presumably to keep pyLDAvis topic order aligned with the
# model's own topic ids; TODO confirm.
lda_display = gensimvis.prepare(ldamodel, 
                                doc_term_matrix, 
                                dictionary, 
                                sort_topics=False)

  default_term_info = default_term_info.sort_values(


In [32]:
# Render the prepared pyLDAvis visualization in the notebook.
pyLDAvis.display(lda_display)

In [33]:
# Show the 10 most heavily weighted words of each LDA topic.
ldamodel.print_topics(num_words=10)

[(0,
  '0.011*"film" + 0.006*"movie" + 0.005*"one" + 0.004*"like" + 0.004*"batman" + 0.004*"elle" + 0.004*"get" + 0.004*"character" + 0.004*"make" + 0.003*"way"'),
 (1,
  '0.011*"bond" + 0.007*"time" + 0.007*"movie" + 0.006*"film" + 0.005*"one" + 0.004*"craig" + 0.004*"die" + 0.003*"get" + 0.003*"preston" + 0.003*"like"'),
 (2,
  '0.009*"film" + 0.009*"movie" + 0.006*"one" + 0.006*"like" + 0.004*"make" + 0.003*"get" + 0.003*"character" + 0.003*"lost" + 0.003*"even" + 0.003*"cruise"'),
 (3,
  '0.011*"film" + 0.009*"movie" + 0.007*"one" + 0.005*"like" + 0.005*"time" + 0.004*"scene" + 0.004*"even" + 0.004*"make" + 0.004*"get" + 0.004*"horror"')]

In [34]:
# Fit one LDA model per candidate topic count and record its c_v coherence score.
# NOTE: this overwrites `topics` and `coherence_values` from the LSA sweep above, and
# rebinds `ldamodel`; after the cell runs it refers to the model fitted for the last
# entry of `topics`, not to the NUM_TOPICS model visualized earlier.
topics = [2, 4, 6, 8]
coherence_values = {}

for t in topics:
    # Same settings as the single LDA fit above, except that eval_every is not passed here.
    ldamodel = LdaModel(doc_term_matrix,
                        num_topics=t,
                        id2word = dictionary,
                        alpha='auto',
                        eta='auto',
                        iterations=100,
                        passes=20,
                        random_state=20130810)

    # c_v coherence is computed from the tokenized texts, not from the bag-of-words corpus.
    coherence_model_lda = CoherenceModel(model=ldamodel, 
                                         dictionary=dictionary, 
                                         texts=processed_text, 
                                         coherence='c_v')
    
    coherence_lda = coherence_model_lda.get_coherence()
    # Keyed as '<n> topics' so the printed dict is self-describing.
    coherence_values[str(t) + ' topics'] = coherence_lda 

In [35]:
# c_v coherence of the LDA models, keyed by topic count.
print(coherence_values)

{'2 topics': 0.24759174785329124, '4 topics': 0.23238133565424585, '6 topics': 0.2762221214715257, '8 topics': 0.27565416248731994}
