In [8]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [14]:
import spacy
from spacy.lang.en import English


In [26]:
import pandas as pd

In [15]:
parser = English()


In [16]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/moksha/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
from nltk.corpus import wordnet as wn

In [19]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma


In [20]:
from nltk.stem.wordnet import WordNetLemmatizer

In [21]:
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [22]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/moksha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens


In [29]:
df = pd.read_csv('tf_idf_summarized.csv', index_col = 0)


In [30]:
df

Unnamed: 0,movie,summary
0,chicken_run,its plump chickens maintain that strange hapt...
1,the_taste_of_others,succeeds with believable characters and situat...
2,yi_yi,this intimate family portrait peels away layer...
3,memento,i can t remember when a movie has seemed so cl...
4,the_life_and_times_of_hank_greenberg,writer director aviva kempner crafted this lov...
...,...,...
2095,emma_2020,if anyone were to push emma woodhouse into a l...
2096,yes_god_yes,karen maine s script confronts catholicism s b...
2097,blood_on_her_name,allowing the viewer to piece things together o...
2098,sylvies_love,a sublimely versatile and aching performance b...


In [47]:
#import random
text_data = []
for summary in df['summary']:
    tokens = prepare_text_for_lda(summary)
    #if random.random() > .99:
        #print(tokens)
    text_data.append(tokens)

In [72]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [74]:
corpus = [dictionary.doc2bow(text) for text in text_data]


In [75]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [76]:
import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')

In [77]:
topics = ldamodel.print_topics(num_words = 5)
for topic in topics:
    print(topic)

(0, '0.015*"film" + 0.009*"story" + 0.009*"family" + 0.007*"even" + 0.006*"movie"')
(1, '0.013*"film" + 0.006*"make" + 0.006*"movie" + 0.005*"experience" + 0.004*"piece"')
(2, '0.011*"first" + 0.010*"film" + 0.007*"make" + 0.006*"movie" + 0.005*"even"')
(3, '0.012*"make" + 0.007*"documentary" + 0.006*"people" + 0.006*"white" + 0.005*"movie"')
(4, '0.010*"movie" + 0.010*"film" + 0.006*"going" + 0.006*"watching" + 0.006*"wolfwalkers"')
(5, '0.011*"right" + 0.010*"documentary" + 0.010*"camp" + 0.009*"good" + 0.007*"crip"')
(6, '0.015*"film" + 0.010*"like" + 0.008*"movie" + 0.007*"make" + 0.006*"story"')
(7, '0.016*"film" + 0.010*"documentary" + 0.006*"movie" + 0.005*"like" + 0.005*"making"')
(8, '0.013*"film" + 0.009*"drama" + 0.009*"mercy" + 0.008*"maine" + 0.007*"claire"')
(9, '0.014*"boseman" + 0.014*"rainey" + 0.011*"wilson" + 0.010*"fading" + 0.010*"chadwick"')


In [56]:
pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 2.3 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting numpy>=1.20.0
  Downloading numpy-1.21.0-cp38-cp38-macosx_10_9_x86_64.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 1.5 MB/s eta 0:00:01
Collecting pandas>=1.2.0
  Downloading pandas-1.2.5-cp38-cp38-macosx_10_9_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 4.4 MB/s eta 0:00:01     |███████████████████████████▎    | 9.0 MB 11.8 MB/s eta 0:00:01
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25ldone
[?25h  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-no

In [65]:
import pyLDAvis.gensim_models

In [82]:
import pyLDAvis.gensim_models as pyldavis

In [87]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')

In [89]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
lda_display = pyldavis.prepare(lda10, corpus, dictionary)
pyLDAvis.show(lda_display)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [70]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1476fd670>

In [92]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model10.gensim')
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()