# Import data set


In [1]:
import pandas as pd
# Load the raw data set. The path is relative, so 'result.csv' must sit in the
# notebook's working directory. Later cells read its 'Category' and 'Text' columns.
df = pd.read_csv('result.csv')

In [6]:
# Keep only the rows whose 'Category' equals 1 (presumably the positive class,
# going by the variable name -- TODO confirm the label encoding).
# Boolean-mask indexing already returns a new DataFrame, so there is no need
# to create an empty placeholder frame first.
pos_df = df[df['Category'] == 1]

# Preprocess


In [7]:
import gensim
# Pull the 'Text' column out of the filtered frame as a plain Python list
data = pos_df['Text'].tolist()

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess``; one list of tokens is yielded per
    input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        # deacc=True strips accent marks; punctuation is already discarded
        # by the tokenizer itself.
        yield tokenize(str(raw_text), deacc=True)

# Materialize the generator: data_words is indexed and iterated several times
# further down, and a generator could only be consumed once.
data_words = list(sent_to_words(data))

# Sanity check: show the token list of the first document only
print(data_words[:1])

[['but', 'cinderella', 'gets', 'my', 'vote', 'not', 'only', 'for', 'the', 'worst', 'of', 'disneys', 'princess', 'movies', 'but', 'for', 'the', 'worst', 'movie', 'the', 'company', 'made', 'during', 'walts', 'lifetime', 'the', 'music', 'is', 'genuinely', 'pretty', 'and', 'the', 'story', 'deserves', 'to', 'be', 'called', 'classic', 'what', 'fails', 'in', 'this', 'movie', 'are', 'the', 'characters', 'particularly', 'the', 'title', 'character', 'who', 'could', 'only', 'be', 'called', 'the', 'heroine', 'in', 'the', 'loosest', 'sense', 'of', 'the', 'term', 'after', 'brief', 'prologue', 'the', 'audience', 'is', 'introduced', 'to', 'cinderella', 'she', 'is', 'waking', 'up', 'in', 'the', 'morning', 'and', 'singing', 'dream', 'is', 'wish', 'your', 'heart', 'makes', 'this', 'establishes', 'her', 'as', 'an', 'idealist', 'and', 'thus', 'deserving', 'of', 'our', 'sympathy', 'unfortunately', 'the', 'script', 'gives', 'us', 'no', 'clue', 'as', 'to', 'what', 'she', 'is', 'dreaming', 'about', 'freedom', 

# Create bigram and trigram for the model


In [8]:
# Fit the phrase (collocation) detectors. Raising `threshold` makes the
# detector stricter, so fewer token pairs get merged into phrases.
bigram = gensim.models.phrases.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# The trigram detector is fitted on the bigram-transformed corpus, so it can
# join an already merged pair with a third token.
trigram = gensim.models.phrases.Phrases(
    bigram[data_words],
    threshold=100,
)

# Wrap the fitted detectors in Phraser objects, which are faster to apply
# to a document.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document after bigram + trigram merging
print(trigram_mod[bigram_mod[data_words[0]]])

['but', 'cinderella', 'gets', 'my', 'vote', 'not', 'only', 'for', 'the', 'worst', 'of', 'disneys', 'princess', 'movies', 'but', 'for', 'the', 'worst', 'movie', 'the', 'company', 'made', 'during', 'walts', 'lifetime', 'the', 'music', 'is', 'genuinely', 'pretty', 'and', 'the', 'story', 'deserves', 'to', 'be', 'called', 'classic', 'what', 'fails', 'in', 'this', 'movie', 'are', 'the', 'characters', 'particularly', 'the', 'title', 'character', 'who', 'could', 'only', 'be', 'called', 'the', 'heroine', 'in', 'the', 'loosest', 'sense', 'of', 'the', 'term', 'after', 'brief', 'prologue', 'the', 'audience', 'is', 'introduced', 'to', 'cinderella', 'she', 'is', 'waking_up', 'in', 'the', 'morning', 'and', 'singing', 'dream', 'is', 'wish', 'your', 'heart', 'makes', 'this', 'establishes', 'her', 'as', 'an', 'idealist', 'and', 'thus', 'deserving', 'of', 'our', 'sympathy', 'unfortunately', 'the', 'script', 'gives', 'us', 'no', 'clue', 'as', 'to', 'what', 'she', 'is', 'dreaming', 'about', 'freedom', 'fro

# Remove stop words and lemmatize


In [9]:
import nltk
# Fetch the NLTK stop-word corpus into the local nltk_data directory.
# When the package is already present NLTK just reports it as up to date.
nltk.download('stopwords')

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Remove stop words from every document in ``texts``.

    Relies on two notebook globals that must exist by the time the function
    is called: ``stop_words`` (the collection of words to drop) and
    ``simple_preprocess`` (gensim's tokenizer).

    Args:
        texts: iterable of documents. A document that is already a list or
            tuple of tokens is filtered as-is; anything else is converted
            with ``str()`` and tokenized with ``simple_preprocess`` first.

    Returns:
        list of list of str: one filtered token list per input document,
        in input order.
    """
    # Build the lookup once: `stop_words` is a plain list, so testing each
    # token against it directly is a linear scan per token.
    stop_set = set(stop_words)

    filtered_docs = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: re-serialising the list with str() and
            # tokenizing it again would only re-parse its repr.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        filtered_docs.append([word for word in tokens if word not in stop_set])
    return filtered_docs

def make_bigrams(texts):
    """Apply the global ``bigram_mod`` phraser to each tokenized document.

    Returns a list holding one transformed document per input document,
    in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each tokenized document.

    Both phrasers are notebook globals. Returns a list holding one
    transformed document per input document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    The tokens of each document are joined with single spaces and run through
    the global spaCy pipeline ``nlp``, which must be defined before this
    function is called. Tags are compared against spaCy's coarse-grained
    ``token.pos_`` values (see https://spacy.io/api/annotation).

    Args:
        texts: iterable of documents, each a sequence of string tokens.
        allowed_postags: collection of ``pos_`` tags whose tokens are kept.
            The default is an immutable tuple so it cannot be altered by
            accident between calls; lists are still accepted.

    Returns:
        list of list of str: the lemmas of the kept tokens, one list per
        input document, in input order. Empty input gives an empty list.
    """
    allowed = frozenset(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream in batches, which avoids
    # the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from gensim.utils import simple_preprocess
import spacy

# NLTK Stop words
from nltk.corpus import stopwords
# English stop-word list; remove_stopwords() reads this as a global
stop_words = stopwords.words('english')

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# NOTE(review): bigram_mod was fitted on data_words (stop words still present)
# but is applied here to the stop-word-free tokens -- confirm this is intended.
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline with the parser and NER components
# disabled; lemmatization() only reads token.lemma_ and token.pos_.
# Install the model once with: python -m spacy download en_core_web_sm
# lemmatization() reads `nlp` as a global, so it must be assigned before the call below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Sanity check: show the lemmas of the first document only
print(data_lemmatized[:1])

[['get', 'vote', 'bad', 'disney', 'princess', 'movie', 'bad', 'movie', 'company', 'make', 'walt', 'lifetime', 'music', 'genuinely', 'pretty', 'story', 'deserve', 'call', 'classic', 'fail', 'movie', 'character', 'particularly', 'title', 'character', 'call', 'heroine', 'loose', 'sense', 'term', 'audience', 'introduce', 'wake', 'morning', 'singing', 'dream', 'wish', 'heart', 'make', 'establishe', 'idealist', 'thus', 'deserve', 'sympathy', 'unfortunately', 'script', 'give', 'clue', 'dream', 'freedom', 'servant', 'role', 'respect', 'stepfamily', 'talk', 'mouse', 'bird', 'song', 'cut', 'movie', 'present', 'special', 'feature', 'section', 'late', 'dvd', 'cinderella', 'relate', 'wish', 'many', 'work', 'efficiently', 'go', 'girlfriend', 'short', 'cinderella', 'bland', 'character', 'passively', 'accept', 'abuse', 'escape', 'unspoken', 'dream', 'relief', 'assert', 'remind', 'stepmother', 'still', 'member', 'family', 'give', 'permission', 'go', 'ball', 'complete', 'housework', 'find', 'wear', 'tok

# Create a dictionary and a corpus


In [13]:
import gensim.corpora as corpora

# Vocabulary: maps every distinct lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the model is trained on
texts = data_lemmatized

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Build LDA model


In [14]:
# Train the LDA topic model on the bag-of-words corpus
pos_lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,         # number of latent topics to extract
    alpha='auto',          # learn the document-topic prior from the data
    passes=5,              # full passes over the corpus
    chunksize=100,         # documents per training chunk
    update_every=1,        # update the model after every chunk
    random_state=100,      # fixed seed so re-runs give the same topics
    per_word_topics=True,
)

In [16]:
# Persist the trained model to disk; the path is relative to the notebook's working directory
pos_lda_model.save("pos_model_lda.model")

In [18]:
from pprint import pprint

# Print the highest-weighted keywords of each topic. The model above was built
# with num_topics=20, so expect up to 20 entries here, not 10.
pprint(pos_lda_model.print_topics())
# Apply the trained model to the corpus (doc_lda is not used again in the cells shown here)
doc_lda = pos_lda_model[corpus]

[(0,
  '0.062*"deal" + 0.040*"oscar" + 0.038*"crew" + 0.031*"push" + 0.029*"tough" '
  '+ 0.025*"radio" + 0.024*"addition" + 0.023*"sum" + 0.020*"fresh" + '
  '0.018*"silent"'),
 (1,
  '0.049*"war" + 0.030*"western" + 0.024*"bond" + 0.023*"student" + '
  '0.022*"apparently" + 0.022*"human" + 0.019*"message" + 0.019*"documentary" '
  '+ 0.016*"earth" + 0.016*"machine"'),
 (2,
  '0.047*"zombie" + 0.030*"innocent" + 0.019*"highlight" + '
  '0.018*"supporting_cast" + 0.018*"ultimate" + 0.017*"dynamic" + '
  '0.015*"threat" + 0.015*"psychological" + 0.014*"kudo" + 0.012*"fare"'),
 (3,
  '0.101*"season" + 0.025*"condition" + 0.020*"irony" + 0.019*"th" + '
  '0.019*"spoof" + 0.017*"web" + 0.016*"freak" + 0.014*"airplane" + '
  '0.013*"absence" + 0.010*"staple"'),
 (4,
  '0.093*"movie" + 0.026*"get" + 0.026*"see" + 0.020*"watch" + 0.017*"really" '
  '+ 0.017*"go" + 0.016*"people" + 0.015*"make" + 0.014*"know" + 0.013*"love"'),
 (5,
  '0.039*"vampire" + 0.034*"fellow" + 0.027*"truck" + 0.026*"o

# Evaluate the model


In [21]:
from gensim.models.coherencemodel import CoherenceModel

# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. It is a log-scale value (hence negative); perplexity is
# 2 ** (-bound), so a bound closer to zero means a better fit.
per_word_bound = pos_lda_model.log_perplexity(corpus)
print('\nPerplexity: ', per_word_bound)

# Topic coherence (c_v measure), computed against the lemmatized texts
coherence_model_lda = CoherenceModel(
    model=pos_lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.91864646842168

Coherence Score:  0.4254692122408879
