# Topic Modeling on Subtitles
https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
pd.options.display.max_columns = 500

## Subtitle extraction

In [2]:
def sub_extract(df):
    subtitles = []
    poms_endpoint = "https://rs.poms.omroep.nl/v1/api/subtitles/"
    for index, row in df.iterrows():
        mid = row['mid']
        r = requests.get(poms_endpoint + mid + "/nl_NL/CAPTION.vtt")
        sub = r.text.encode('ascii','ignore') # encode subtitles
        if not sub[:4] == '404:': # check if there are subtitles
            sub = sub.lower().split('\n\n') # lower and split
            sub = sub[1:] # remove first entry of subtitles 'webvtt'
            sub = [line.split('\n', 2)[-1].replace('\n', ' ') for line in sub] # remove display time info and '\n' in subtitle text
            subtitles.append(sub)
        else: 
            subtitles.append(None)
    df['sub'] = subtitles

In [3]:
# Test data
df = pd.DataFrame(np.array([["POW_04056750", "nos journaal"], 
                            ["AT_2111647", "widm ep6"], 
                            ["VPWON_1297943", "de kennis van nu"],
                            ["VPWON_1297062", "nieuwsuur"],
                            ["VPWON_1283689", "andere tijden 2018-06-02"],
                            ["POW_03746831", "kook mee met max"],
                            ["KN_1702321", "woezel en pip"],
                            ["POW_02990142", "nos journaal"]
                           ]),
                  columns=['mid', 'titel'])

sub_extract(df)
df

Unnamed: 0,mid,titel,sub
0,POW_04056750,nos journaal,"[in 2018 is de economie met 2,5 procent gegroe..."
1,AT_2111647,widm ep6,"[888, ga naar wieisdemol.nl voor extra informa..."
2,VPWON_1297943,de kennis van nu,"[888, je ziet dat data steeds belangrijker wor..."
3,VPWON_1297062,nieuwsuur,"[888, goedenavond. dit is nieuwsuur! verslaafd..."
4,VPWON_1283689,andere tijden 2018-06-02,"[888, over twee weken begint in rusland het we..."
5,POW_03746831,kook mee met max,"[888, in kook mee met max maak ik afwisselend ..."
6,KN_1702321,woezel en pip,"[888, komen jullie spelen?, *woezel en pip, wo..."
7,POW_02990142,nos journaal,"[888, liveprogramma, ondertiteling kan achterl..."


In [4]:
poms = spark.read.parquet("gs://mit-processed-events-prod.npo-data.nl/poms/")
df2 = poms.limit(1000).toPandas()
df2 = df2[(df2['workflow']!='DELETED') & (df2['class']=='Program') & (df2['avType']=='VIDEO') & (df2['type']=='BROADCAST')]
df2 = df2[(df2['sortDate']>=datetime.datetime(2017,1,1))] # since 2018

df2 = df2[['mid', 'title', 'sortDate']].copy()
# I suspect that the items wihout subtitles aren't on npostart (it's mostly regional tv)
sub_extract(df2)
df2 = df2.dropna() # drop rows with no subtitles
df2

Unnamed: 0,mid,title,sortDate,sub
25,AT_2089670,EenVandaag,2018-04-19 16:15:00,"[888, liveprogramma, ondertiteling kan achterl..."
27,VPWON_1264342,Doodgewoon,2017-04-05 17:20:00,"[888, rotterdam, ruim 600.000 inwoners, 176 na..."
56,VPWON_1282881,Metterdaad,2018-05-18 16:10:00,"[888, gladys woont in kenia, in de hoofdstad n..."
147,KN_1697671,De Boeddhistische Blik: Een andere kijk op kunst,2018-02-25 14:50:00,"[888, muziek, op dit moment toert zij door ned..."
162,POW_04053529,NOS Journaal,2019-02-22 05:30:00,"[888, liveprogramma, ondertiteling kan achterl..."
173,POW_03614983,NOS WK-kwalificatie Voetbal 1ste helft,2017-10-10 18:35:00,"[de uitstraling van de johan cruijff arena...,..."
395,POW_03451791,NOS Sportjournaal,2017-04-13 16:45:00,"[888, coach dortmund is boos op uefa., ajax zi..."
435,VPWON_1297779,De kijk van Koolhoven,2018-12-09 21:50:00,"[888, ja, geef maar toe., jij zag 'erotica' en..."
443,POW_03615022,NOS Studio Sport,2017-10-08 14:40:00,"[88., goedemiddag., straks nog jan lammers., m..."
456,POW_03685524,NOS Journaal,2018-12-06 07:29:00,"[888, goedemorgen., nederland moet naar nul sl..."


## Tokenization

In [8]:
#!pip install -U spacy
#!python -m spacy download nl
import spacy
nlp = spacy.load('nl')

#!pip install nltk
from nltk.stem.snowball import DutchStemmer
stemmer = DutchStemmer()

In [12]:
def tokens(df):
    tokens = []
    tokens_extra = []
    for index, row in df.iterrows():
        doc = nlp(u" ".join(row['sub']))
        # POS-tagging, tokenization, stop words, stemming
        tokens.append([(stemmer.stem(w.text)) for w in doc if not ((w.is_stop or w.is_punct) or len(w)<=4) ])
        #tokens.append([(w.text) for w in doc if not ((w.is_stop or w.is_punct) or len(w)<=4) ]) # without stemming
        tokens_extra.append([(w.text, stemmer.stem(w.text), w.pos_) for w in doc if not w.is_stop])
    df['tokens'] = tokens
    df['tokens_extra'] = tokens_extra

In [13]:
tokens(df)
df

Unnamed: 0,mid,titel,sub,tokens,tokens_extra
0,POW_04056750,nos journaal,"[in 2018 is de economie met 2,5 procent gegroe...","[economie, procent, gegroeid, minder, sted, ho...","[(2018, 2018, NUM), (economie, economie, NOUN)..."
1,AT_2111647,widm ep6,"[888, ga naar wieisdemol.nl voor extra informa...","[wieisdemol.nl, extra, informatie, kandidat, a...","[(888, 888, NUM), (ga, ga, VERB), (wieisdemol...."
2,VPWON_1297943,de kennis van nu,"[888, je ziet dat data steeds belangrijker wor...","[sted, belangrijker, wordt, sport, gebruikt, w...","[(888, 888, NUM), (ziet, ziet, VERB), (data, d..."
3,VPWON_1297062,nieuwsuur,"[888, goedenavond. dit is nieuwsuur! verslaafd...","[goedenavond, nieuwsur, verslaafd, zwar, pijns...","[(888, 888, NUM), (goedenavond, goedenavond, V..."
4,VPWON_1283689,andere tijden 2018-06-02,"[888, over twee weken begint in rusland het we...","[wek, begint, rusland, wereldkampioenschap, vo...","[(888, 888, NUM), (twee, twee, NUM), (weken, w..."
5,POW_03746831,kook mee met max,"[888, in kook mee met max maak ik afwisselend ...","[afwissel, sandra, ijsbrandy, gerecht, binn, t...","[(888, 888, NUM), (kook, kok, NOUN), (mee, mee..."
6,KN_1702321,woezel en pip,"[888, komen jullie spelen?, *woezel en pip, wo...","[kom, jullie, spel, woezel, woezel, spel, elka...","[(888, 888, NUM), (komen, kom, VERB), (jullie,..."
7,POW_02990142,nos journaal,"[888, liveprogramma, ondertiteling kan achterl...","[liveprogramma, ondertitel, achterlop, goedemi...","[(888, 888, NUM), (liveprogramma, liveprogramm..."


In [14]:
tokens(df2)
df2

Unnamed: 0,mid,title,sortDate,sub,tokens,tokens_extra
25,AT_2089670,EenVandaag,2018-04-19 16:15:00,"[888, liveprogramma, ondertiteling kan achterl...","[liveprogramma, ondertitel, achterlop, goedena...","[(888, 888, NUM), (liveprogramma, liveprogramm..."
27,VPWON_1264342,Doodgewoon,2017-04-05 17:20:00,"[888, rotterdam, ruim 600.000 inwoners, 176 na...","[rotterdam, 600.000, inwoner, nationaliteit, m...","[(888, 888, NUM), (rotterdam, rotterdam, NOUN)..."
56,VPWON_1282881,Metterdaad,2018-05-18 16:10:00,"[888, gladys woont in kenia, in de hoofdstad n...","[gladys, woont, kenia, hoofdstad, nairobi, eco...","[(888, 888, NUM), (gladys, gladys, NOUN), (woo..."
147,KN_1697671,De Boeddhistische Blik: Een andere kijk op kunst,2018-02-25 14:50:00,"[888, muziek, op dit moment toert zij door ned...","[muziek, moment, toert, nederland, voorstell, ...","[(888, 888, NUM), (muziek, muziek, NOUN), (mom..."
162,POW_04053529,NOS Journaal,2019-02-22 05:30:00,"[888, liveprogramma, ondertiteling kan achterl...","[liveprogramma, ondertitel, achterlop, goedemo...","[(888, 888, NUM), (liveprogramma, liveprogramm..."
173,POW_03614983,NOS WK-kwalificatie Voetbal 1ste helft,2017-10-10 18:35:00,"[de uitstraling van de johan cruijff arena...,...","[uitstral, johan, cruijff, arena, bepaald, fee...","[(uitstraling, uitstral, NOUN), (johan, johan,..."
395,POW_03451791,NOS Sportjournaal,2017-04-13 16:45:00,"[888, coach dortmund is boos op uefa., ajax zi...","[coach, dortmund, kans, schalk, liveprogramma,...","[(888, 888, NUM), (coach, coach, NOUN), (dortm..."
435,VPWON_1297779,De kijk van Koolhoven,2018-12-09 21:50:00,"[888, ja, geef maar toe., jij zag 'erotica' en...","[erotica, dacht, natur, allen, neukscenes, kij...","[(888, 888, NUM), (,, ,, PUNCT), (geef, gef, V..."
443,POW_03615022,NOS Studio Sport,2017-10-08 14:40:00,"[88., goedemiddag., straks nog jan lammers., m...","[goedemiddag, strak, lammer, monolog, parijs, ...","[(88, 88, NUM), (., ., PUNCT), (goedemiddag, g..."
456,POW_03685524,NOS Journaal,2018-12-06 07:29:00,"[888, goedemorgen., nederland moet naar nul sl...","[goedemorg, nederland, slachtoffer, verker, ka...","[(888, 888, NUM), (goedemorgen, goedemorg, NOU..."


## Topic modeling (LDA)

In [15]:
#!pip install gensim
from gensim import corpora
import pickle
import gensim

In [22]:
text_data = df.tokens

# create dictionary & convert to bag-of-words corpus
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# save dict & corpus
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

# Use LDA to find 5 topics
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.006*"stemm" + 0.006*"rechtbank" + 0.005*"volgen" + 0.005*"organisatie"')
(1, u'0.006*"woezel" + 0.006*"forum" + 0.005*"oxycodon" + 0.005*"pijnstiller"')
(2, u'0.015*"groen" + 0.014*"jullie" + 0.014*"opdracht" + 0.010*"link"')
(3, u'0.013*"mens" + 0.010*"argentinie" + 0.008*"natur" + 0.007*"nederland"')
(4, u'0.008*"wordt" + 0.005*"eigen" + 0.004*"stat" + 0.004*"lat"')


In [23]:
text_data2 = df2.tokens

# create dictionary & convert to bag-of-words corpus
dictionary2 = corpora.Dictionary(text_data2)
corpus2 = [dictionary.doc2bow(text) for text in text_data2]

# save dict & corpus
pickle.dump(corpus2, open('corpus.pkl', 'wb'))
dictionary2.save('dictionary2.gensim')

# Use LDA to find 5 topics
NUM_TOPICS = 5
ldamodel2 = gensim.models.ldamodel.LdaModel(corpus2, num_topics = NUM_TOPICS, id2word=dictionary2, passes=15)
ldamodel2.save('model2.gensim')

topics2 = ldamodel2.print_topics(num_words=4)
for topic in topics2:
    print(topic)

(0, u'0.009*"eenrum" + 0.008*"hamer" + 0.005*"behaald" + 0.005*"barca"')
(1, u'0.011*"gelop" + 0.007*"ontzet" + 0.007*"theatermaker" + 0.007*"jongler"')
(2, u'0.004*"familie" + 0.004*"ingooi" + 0.003*"afstand" + 0.003*"terlouw"')
(3, u'0.013*"plekj" + 0.010*"hamer" + 0.007*"helft" + 0.007*"hoofdpijn"')
(4, u'0.010*"daarmee" + 0.010*"hoofdpijn" + 0.009*"plekj" + 0.009*"hamer"')


## Investigate Topics
https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb

In [19]:
#!pip install pyldavis
import pyLDAvis
pyLDAvis.enable_notebook()
import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis.gensim

In [20]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
pyLDAvis.gensim.prepare(ldamodel2, corpus2, dictionary2)