In [1]:
# Work based on https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# Preprocessing based on https://github.com/TegarSU/Topic-Modelling/blob/master/Preprocessing.ipynb
import pandas as pd
import numpy as np
import json
import os
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.models import Phrases
import pyLDAvis
import pyLDAvis.gensim
from random import randint
import ast
import logging
import operator

In [2]:
# Load the cleaned 5-K-core tweets from data/05_out, two directories above this notebook.
tweets_csv_path = os.path.join(os.pardir, os.pardir, 'data', '05_out', '5-KCore-tweet-clean.csv')
df = pd.read_csv(tweets_csv_path)

In [3]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,username,modularity,node_id,clean
0,Mereka itu korban yg kalah di 2014. Inget siap...,b12_7hon,2,0,"['korban', 'kalah', 'loloskan', 'pemilu', 'ren..."
1,Penomena @KPU_RI @KPU_ID tdk kuat scra lembaga...,harun_nugraha,2,1,"['fenomena', 'kuat', 'lembaga', 'laksana', 'ba..."
2,ngapain pemilu ulang broo klu kita 02 sdh mena...,totonaser11,4,3,"['pemilu', 'ulang', 'broo', 'menang', 'jokowi'..."
3,"Kl menerapkan ISO pasti kelar 1 minggu, ngga a...",rohmatjabbar,4,5,"['terap', 'selesai', 'minggu', 'korban', 'romu..."
4,Hayooo @KPU_ID @bawaslu_RI simak baik2 orasi k...,use_will,3,6,"['ayo', 'simak', 'orasi', 'ketua', 'tugas', 'a..."


In [4]:
# Keep only the column holding the pre-tokenised text of each tweet.
clean = df.loc[:, 'clean']

In [5]:
# The token lists are still stored as strings at this point (dtype: object).
clean.head(n=5)

0    ['korban', 'kalah', 'loloskan', 'pemilu', 'ren...
1    ['fenomena', 'kuat', 'lembaga', 'laksana', 'ba...
2    ['pemilu', 'ulang', 'broo', 'menang', 'jokowi'...
3    ['terap', 'selesai', 'minggu', 'korban', 'romu...
4    ['ayo', 'simak', 'orasi', 'ketua', 'tugas', 'a...
Name: clean, dtype: object

In [6]:
# Parse the stringified token lists back into real Python lists. Read from the
# untouched `df['clean']` column (not from `clean` itself) so this cell can be
# re-run safely: feeding already-parsed lists to `ast.literal_eval` would fail.
clean = [ast.literal_eval(raw_tokens) for raw_tokens in df['clean']]

## Create Bigrams

In [7]:
# https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html
# Fit a phrase model on the documents, then append every detected bigram to its
# document while keeping the original unigrams.
bigram = Phrases(clean, min_count=10)
for doc_tokens in clean:
    # A token containing '_' is treated as a bigram produced by the phrase model.
    detected_bigrams = [token for token in bigram[doc_tokens] if '_' in token]
    doc_tokens.extend(detected_bigrams)

In [8]:
# Show the phrase-transformed version of the first ten documents.
phrased_docs = bigram[clean]
for doc_no in range(10):
    print(phrased_docs[doc_no])

['korban', 'kalah', 'loloskan', 'pemilu', 'rentak']
['fenomena', 'kuat', 'lembaga', 'laksana', 'baca', 'munculnya', 'langgar', 'salah', 'orang', 'tinggal', 'gara', 'pemilu', 'aneka', 'ragam', 'heran', 'muncul', 'gugat', 'laksana']
['pemilu', 'ulang', 'broo', 'menang', 'jokowi', 'sidiq', 'tabliq', 'amanah', 'fatonah']
['terap', 'selesai', 'minggu', 'korban', 'romusha', 'rodi', 'proyek', 'pemilu', 'situng', 'kpu', 'system', 'audit', 'trial', 'validasi']
['ayo', 'simak', 'orasi', 'ketua', 'tugas', 'anti', 'curang', 'pemilu']
['daerah', 'ricuh', 'bilang', 'pemilu', 'lancar_aman', 'sukses', 'otak', 'kacau', 'sesat', 'pikir', 'lancar_aman']
['habis', 'pikir', 'pasang', 'badan', 'habis', 'pemilu']
['hormat', 'bangsa', 'indonesia', 'pecah_belah', 'anak_bangsa', 'laku', 'curang', 'pemilu', 'pilpres', 'rakyat', 'bodoh', 'pecah_belah', 'anak_bangsa']
['pemilu', 'buruk', 'buruk', 'dunia']
['kah', 'maksud', 'pemilu', 'jalan_lancar', 'jalan_lancar']


# LDA

In [11]:
# Logging: show INFO-level records, prefixed with timestamp and level name.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

### Build Bag Of Words Corpus Dictionary

In [12]:
# Map every distinct token (unigrams and appended bigrams) to an integer id.
dictionary = corpora.Dictionary(documents=clean)

2019-09-09 16:03:03,545 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-09-09 16:03:03,722 : INFO : adding document #10000 to Dictionary(14673 unique tokens: ['kalah', 'korban', 'loloskan', 'pemilu', 'rentak']...)
2019-09-09 16:03:03,889 : INFO : adding document #20000 to Dictionary(21933 unique tokens: ['kalah', 'korban', 'loloskan', 'pemilu', 'rentak']...)
2019-09-09 16:03:03,899 : INFO : built Dictionary(22199 unique tokens: ['kalah', 'korban', 'loloskan', 'pemilu', 'rentak']...) from 20520 documents (total 276481 corpus positions)


In [13]:
# Preview the first six (token id, token) pairs of the dictionary.
for shown, (token_id, token) in enumerate(dictionary.items()):
    print(token_id, token)
    if shown >= 5:
        break

0 kalah
1 korban
2 loloskan
3 pemilu
4 rentak
5 aneka


## Filter tokens

In [14]:
# Drop rare and overly common tokens. The thresholds are gensim's defaults, spelled
# out here: keep tokens present in at least 5 documents and in at most 50% of all
# documents, capped at the 100000 most frequent tokens.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

2019-09-09 16:03:11,289 : INFO : discarding 17181 tokens: [('loloskan', 2), ('pemilu', 20383), ('aneka', 3), ('munculnya', 2), ('fatonah', 2), ('sidiq', 1), ('tabliq', 1), ('romusha', 1), ('trial', 2), ('orasi', 3)]...
2019-09-09 16:03:11,290 : INFO : keeping 5018 tokens which were in no less than 5 and no more than 10260 (=50.0%) documents
2019-09-09 16:03:11,299 : INFO : resulting dictionary: Dictionary(5018 unique tokens: ['kalah', 'korban', 'rentak', 'baca', 'fenomena']...)


## Build BoW Corpus

In [15]:
# Convert every document into its bag-of-words form: a list of (token id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, clean))

#### Corpus Preview

In [16]:
def bow_corpus_sample(corpus, random_index):
    """Print the bag-of-words entries of a single document.

    Parameters
    ----------
    corpus : indexable collection of documents, each a sequence of
        (token id, count) pairs
    random_index : position in ``corpus`` of the document to print

    Token ids are translated back to words through the notebook-level
    ``dictionary`` object.
    """
    entries = corpus[random_index]
    print('Corpus sample index : ',random_index)
    line_template = "Word {} (\"{}\") appears {} time."
    for token_id, frequency in entries:
        print(line_template.format(token_id, dictionary[token_id], frequency))

In [17]:
# Draw a random document index over the whole corpus. The upper bound follows the
# corpus length rather than a fixed 10000, so every document can be drawn and a
# smaller corpus cannot produce an out-of-range index. `randint` is inclusive on
# both ends, hence the `- 1`.
sample_index = randint(0, len(bow_corpus) - 1)
bow_corpus_sample(bow_corpus, sample_index)

Corpus sample index :  3197
Word 153 ("proses") appears 1 time.
Word 249 ("moga") appears 1 time.
Word 541 ("april") appears 1 time.
Word 1202 ("jaga") appears 2 time.
Word 3865 ("lancarnya") appears 1 time.


## Find model with optimal Number Of Topics
https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

In [18]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in ``range(start, limit, step)``
    and scored with gensim's ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus (bag-of-words documents) the models are trained on
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train on the `corpus` argument, not on a notebook-level global, so the
        # function honours whatever corpus the caller passes in.
        model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=2, workers=2)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Train and score one candidate model for each topic count in range(2, 11, 2).
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=clean,
    limit=11,
    start=2,
    step=2,
)

In [None]:
# Show graph: coherence score against the number of topics tried.
# These values must match the start/limit/step passed to compute_coherence_values.
limit = 11
start = 2
step = 2
x = range(start, limit, step)
# Label the line on the plot call itself; passing the bare string
# ("coherence_values") to plt.legend is not a tuple, and matplotlib would use
# only its first character as the label.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics on BoW corpus")
plt.ylabel("Coherence score")
plt.legend()
plt.show()
# To save the figure, call savefig before plt.show():
# plt.savefig(os.path.join(os.pardir,os.pardir,'data','06_out','50-KCore_model','50-KCore_ldamodel'),dpi=300)

# Feed best number of topics

In [None]:
# Display the raw coherence scores, one per candidate topic count in training order.
coherence_values

In [None]:
# Pick the best model, i.e. the one with the highest score in `coherence_values`.
# With start=2 and step=2, index 3 corresponds to 8 topics.
# lda_model = model_list[3]

In [20]:
# NOTE(review): `lda_model` is only assigned by the "Load Model" cell further down
# (the selection from `model_list` above is commented out). The execution counts show
# this cell was run after the load cell; on a fresh top-to-bottom run it raises NameError.
print(lda_model)

LdaModel(num_terms=5018, num_topics=8, decay=0.5, chunksize=2000)


## Save Best Model

In [None]:
# lda_model.save(os.path.join(os.pardir,os.pardir,'data','06_out','5-KCore_model','ldamodel'))

## Load Model

In [19]:
# Load the previously saved LDA model from data/06_out; large arrays are memory-mapped read-only.
saved_model_path = os.path.join(os.pardir, os.pardir, 'data', '06_out', '5-KCore_model', 'ldamodel')
lda_model = LdaModel.load(saved_model_path, mmap='r')
print(lda_model)

2019-09-09 16:05:53,562 : INFO : loading LdaModel object from ..\..\data\06_out\5-KCore_model\ldamodel
2019-09-09 16:05:53,606 : INFO : loading expElogbeta from ..\..\data\06_out\5-KCore_model\ldamodel.expElogbeta.npy with mmap=r
2019-09-09 16:05:53,693 : INFO : setting ignored attribute id2word to None
2019-09-09 16:05:53,695 : INFO : setting ignored attribute dispatcher to None
2019-09-09 16:05:53,697 : INFO : setting ignored attribute state to None
2019-09-09 16:05:53,699 : INFO : loaded ..\..\data\06_out\5-KCore_model\ldamodel
2019-09-09 16:05:53,701 : INFO : loading LdaState object from ..\..\data\06_out\5-KCore_model\ldamodel.state
2019-09-09 16:05:53,726 : INFO : loaded ..\..\data\06_out\5-KCore_model\ldamodel.state


LdaModel(num_terms=5018, num_topics=8, decay=0.5, chunksize=2000)


In [21]:
# Score the loaded model with c_v coherence against the tokenised texts.
coherence = CoherenceModel(
    model=lda_model,
    texts=clean,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence.get_coherence()
print(coherence_score)

2019-09-09 16:07:39,912 : INFO : using ParallelWordOccurrenceAccumulator(processes=7, batch_size=64) to estimate probabilities from sliding windows
2019-09-09 16:07:47,275 : INFO : 7 accumulators retrieved from output queue
2019-09-09 16:07:47,338 : INFO : accumulated word occurrence stats for 18594 virtual documents


0.28721564785953935


In [22]:
# Show Topics: print every topic (num_topics=-1 selects all) with its top weighted words.
all_topics = lda_model.print_topics(num_topics=-1)
for topic_no, weighted_words in all_topics:
    print('Topic: {} \nWords: {}'.format(topic_no, weighted_words))

2019-09-09 16:08:10,746 : INFO : topic #0 (0.125): 0.020*"undang" + 0.015*"kpps" + 0.015*"suara" + 0.013*"surat" + 0.013*"terima" + 0.012*"kpu" + 0.012*"kasih" + 0.011*"terima_kasih" + 0.011*"presiden" + 0.011*"tugas"
2019-09-09 16:08:10,748 : INFO : topic #1 (0.125): 0.011*"moga" + 0.011*"korban" + 0.011*"kpu" + 0.009*"orang" + 0.009*"suara" + 0.009*"biar" + 0.008*"curang" + 0.008*"kotak" + 0.008*"indonesia" + 0.007*"pilih"
2019-09-09 16:08:10,749 : INFO : topic #2 (0.125): 0.031*"kpu" + 0.018*"curang" + 0.017*"calon" + 0.015*"tps" + 0.014*"pasang" + 0.012*"selenggara" + 0.011*"hasil" + 0.010*"pasang_calon" + 0.009*"orang" + 0.009*"hitung"
2019-09-09 16:08:10,749 : INFO : topic #3 (0.125): 0.036*"kpu" + 0.017*"selenggara" + 0.011*"hasil" + 0.011*"debat" + 0.008*"capres" + 0.008*"daftar" + 0.008*"undang" + 0.007*"main" + 0.007*"rakyat" + 0.007*"ketua"
2019-09-09 16:08:10,750 : INFO : topic #4 (0.125): 0.034*"suara" + 0.030*"pilih" + 0.027*"kpu" + 0.012*"orang" + 0.011*"surat" + 0.011*"

Topic: 0 
Words: 0.020*"undang" + 0.015*"kpps" + 0.015*"suara" + 0.013*"surat" + 0.013*"terima" + 0.012*"kpu" + 0.012*"kasih" + 0.011*"terima_kasih" + 0.011*"presiden" + 0.011*"tugas"
Topic: 1 
Words: 0.011*"moga" + 0.011*"korban" + 0.011*"kpu" + 0.009*"orang" + 0.009*"suara" + 0.009*"biar" + 0.008*"curang" + 0.008*"kotak" + 0.008*"indonesia" + 0.007*"pilih"
Topic: 2 
Words: 0.031*"kpu" + 0.018*"curang" + 0.017*"calon" + 0.015*"tps" + 0.014*"pasang" + 0.012*"selenggara" + 0.011*"hasil" + 0.010*"pasang_calon" + 0.009*"orang" + 0.009*"hitung"
Topic: 3 
Words: 0.036*"kpu" + 0.017*"selenggara" + 0.011*"hasil" + 0.011*"debat" + 0.008*"capres" + 0.008*"daftar" + 0.008*"undang" + 0.007*"main" + 0.007*"rakyat" + 0.007*"ketua"
Topic: 4 
Words: 0.034*"suara" + 0.030*"pilih" + 0.027*"kpu" + 0.012*"orang" + 0.011*"surat" + 0.011*"hak" + 0.010*"kotak" + 0.009*"surat_suara" + 0.007*"ktp" + 0.007*"kotak_suara"
Topic: 5 
Words: 0.042*"curang" + 0.017*"kali" + 0.013*"salah" + 0.012*"misi" + 0.011*"raky

## Visualization

In [23]:
# Render pyLDAvis output inline, then build the interactive topic view.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Classify Document to Topic

In [24]:
# Draw a random document to classify. The upper bound follows the corpus length
# rather than a fixed 10000, so every document can be drawn and a smaller corpus
# cannot produce an out-of-range index. `randint` is inclusive on both ends.
sample_index = randint(0, len(bow_corpus) - 1)
bow_corpus_sample(bow_corpus, sample_index)

Corpus sample index :  3132
Word 33 ("ayo") appears 1 time.
Word 34 ("curang") appears 1 time.
Word 51 ("pasang") appears 1 time.
Word 70 ("bukti") appears 2 time.
Word 97 ("kubu") appears 1 time.
Word 136 ("selenggara") appears 1 time.
Word 141 ("hitung") appears 1 time.
Word 153 ("proses") appears 1 time.
Word 165 ("awasi") appears 1 time.
Word 281 ("hukum") appears 2 time.
Word 301 ("calon") appears 1 time.
Word 308 ("pasang_calon") appears 1 time.
Word 520 ("cek") appears 1 time.
Word 537 ("bijak") appears 1 time.
Word 803 ("transparansi") appears 1 time.
Word 1914 ("proses_hukum") appears 1 time.
Word 2393 ("siapkan") appears 2 time.
Word 3840 ("manfaatkan") appears 1 time.
Word 3841 ("siapkan_bukti") appears 2 time.


In [25]:
# Find the topics of the sampled document, highest probability first.
sample_topics = lda_model[bow_corpus[sample_index]]
ranked_topics = sorted(sample_topics, key=operator.itemgetter(1), reverse=True)
for index, score in ranked_topics:
    topic_words = lda_model.print_topic(index, 10)
    print("\nScore: {}\t \nTopic: {}".format(score, topic_words))


Score: 0.7548775672912598	 
Topic: 0.042*"curang" + 0.017*"kali" + 0.013*"salah" + 0.012*"misi" + 0.011*"rakyat" + 0.010*"visi" + 0.009*"visi_misi" + 0.009*"kpu" + 0.009*"laku" + 0.008*"selenggara"

Score: 0.21383167803287506	 
Topic: 0.031*"kpu" + 0.018*"curang" + 0.017*"calon" + 0.015*"tps" + 0.014*"pasang" + 0.012*"selenggara" + 0.011*"hasil" + 0.010*"pasang_calon" + 0.009*"orang" + 0.009*"hitung"


In [26]:
# Build a list holding the highest-probability topic id of every document.
topic_list = [
    max(lda_model.get_document_topics(doc_bow), key=operator.itemgetter(1))[0]
    for doc_bow in bow_corpus
]

In [27]:
# Attach each document's dominant topic id as a new column, aligned row by row.
df['topic'] = pd.Series(topic_list, index=df.index)

In [28]:
# Check the last ten rows, now including the assigned topic.
df.tail(n=10)

Unnamed: 0,text,username,modularity,node_id,clean,topic
20510,Pertama nih di 2019!! Mumpung masih anget! Apa...,_pemilihpemula,2,12271,"['mumpung', 'hangat', 'pemilu', 'admin', 'kasi...",1
20511,Mending pemilu ditunda smpai jokowi siapp..,abaz009,4,13209,"['mending', 'pemilu', 'tunda', 'jokowi']",3
20512,Maka dari itu aneh bener dah pemilu skrang rus...,koerniad1,4,13210,"['aneh', 'pemilu', 'rusak', 'tatanan', 'demokr...",5
20513,Kok @KPU_ID jadi ngga kredibel gini...?. Semua...,binarmentari_ok,3,4809,"['kredibel', 'perilaku', 'nonton', 'nasional',...",6
20514,Mereka tu dah sadar kalah dalam segala hal dan...,fakirelmu,4,13211,"['sadar', 'kalah', 'proses', 'pemilu', 'menang...",6
20515,Yang namanya sumpah dan janji sdh tak berharga...,uman_2009,3,11498,"['nama', 'sumpah', 'janji', 'harga', 'rezim', ...",7
20516,Mana ada wasit terserah Pemain . Wasit itu mem...,prabow0fans,3,5526,"['wasit', 'serah', 'main', 'wasit', 'pimpin', ...",6
20517,berarti bukan kampanye .. lihat di UU ttg pemi...,pendekarmalu,2,13212,"['arti', 'kampanye', 'lihat', 'undang', 'pemil...",0
20518,Diskusi akhir tahun kemarin dan sempat bahas m...,jamalboegis,3,851,"['diskusi', 'kemarin', 'bahas', 'kotak', 'kard...",4
20519,bener kata ibu prof mantan komisioner @KPU_ID ...,sameercimy_,4,199,"['profesor', 'mantan', 'komisioner', 'lupa', '...",5


In [None]:
# Export to csv
# df.to_csv(os.path.join(os.pardir,os.pardir,'data','06_out','5-KCore-tweet-clean-topic.csv'), index=False)