In [3]:
# Works based on https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# Preprocessing based on https://github.com/TegarSU/Topic-Modelling/blob/master/Preprocessing.ipynb
import pandas as pd
import numpy as np
import json
import os
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt

import spacy
from spacy.lang.id import Indonesian

# Indonesian language object built straight from the language class (no trained
# model is loaded here); later cells call it for tokenization and for `lemma_`.
nlp = Indonesian()  # use directly
# NOTE: `stopwords` is bound to spaCy's own Indonesian STOP_WORDS object, not a
# copy, so the add() below also changes the set the library holds in this kernel.
stopwords = spacy.lang.id.stop_words.STOP_WORDS
stopwords.add("nya")

In [4]:
# Load only the four columns this notebook uses; the path is resolved relative
# to the current working directory.
df = pd.read_csv(
    'output_pemilu_at_kpu.csv',
    usecols=['created_at', 'text', 'username', 'id_tweet'],
)

## Preprocessing

### Cleaning

In [5]:
# Build the normalized text column from the raw `text` column in a single chain,
# so re-running this cell always starts from the untouched input.
#
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently skip all of the cleaning.
df['clean'] = (
    df['text']
    .fillna("")                     # missing text becomes "" so later steps always receive a string
    .str.lower()
    .str.normalize('NFKD')          # decompose accented characters into base letter + combining mark
    .str.replace(r"http\S+|https\S+|pic\.\S+|pictw\S+", " ", regex=True)  # remove http, https and pic. links
    # Retweet prefix. The text is already lowercased at this point, so the
    # pattern must be lowercase too (an uppercase "RT" could never match).
    .str.replace(r"\brt @\w+ ?:", " ", regex=True)
    .str.replace(r"@[A-Za-z0-9_]+", "", regex=True)   # remove @mentions
    # Replace everything that is not an ASCII letter (digits, punctuation, '#',
    # combining marks left by NFKD) with a space.
    .str.replace(r"[^a-zA-Z]", " ", regex=True)
    # TODO: also remove 'cc'
    .str.replace(r"\s\s+", " ", regex=True)           # collapse runs of whitespace left by the steps above
    .str.strip()                                      # drop leading and trailing whitespace
)

In [6]:
# Preview the cleaned text column produced by the Cleaning cell.
df['clean']

0        mereka itu korban yg kalah di inget siapa yg l...
1        penomena tdk kuat scra lembaga atw pelaksanaan...
2        dengerin ini kata satgas anti pemilu culas jgn...
3        ngapain pemilu ulang broo klu kita sdh menang ...
4        i demen bercerita jd setelah pemilu mereka jad...
5        kl menerapkan iso pasti kelar minggu ngga ada ...
6        hayooo simak baik orasi ketua satgas anti cura...
7        hampir semua daerah ricuh kok dibilang pemilu ...
8        ga habis pikir sm pasang badan habis an seolah...
9        pemilu itu pasti ada kecurangan baik dr dan ka...
10       bapak semua yth kalian semua bangsa indonesia ...
11               pemilu terburuk dan paling buruk sedunia 
12       ini kah yg dimaksud pemilu berjalan baik dan l...
13       anda punya bukti jokowi curangi hasil pemilu j...
14       tidak cukup dengan hanya santunan merekapun pa...
15       paham sekali ya pak bagian yang sering terjadi...
16       kata siapa ya pemilu ini aman dan jurdil mohon.

### Tokenize

In [7]:
def df_tokenizer(tweets_df):
    """Split one text value into its token strings.

    As used in this notebook the function is applied element-wise to a text
    column, so `tweets_df` is a single string rather than a DataFrame.

    Returns a list holding the `.text` of every token produced by `nlp`.
    """
    doc = nlp(tweets_df)
    return [piece.text for piece in doc]

In [8]:
# Tokenize each cleaned text; the result is a Series whose values are lists of token strings.
tokenized = df['clean'].apply(df_tokenizer)

In [9]:
# Preview the per-row token lists.
tokenized

0        [mereka, itu, korban, yg, kalah, di, inget, si...
1        [penomena, tdk, kuat, scra, lembaga, atw, pela...
2        [dengerin, ini, kata, satgas, anti, pemilu, cu...
3        [ngapain, pemilu, ulang, broo, klu, kita, sdh,...
4        [i, demen, bercerita, jd, setelah, pemilu, mer...
5        [kl, menerapkan, iso, pasti, kelar, minggu, ng...
6        [hayooo, simak, baik, orasi, ketua, satgas, an...
7        [hampir, semua, daerah, ricuh, kok, dibilang, ...
8        [ga, habis, pikir, sm, pasang, badan, habis, a...
9        [pemilu, itu, pasti, ada, kecurangan, baik, dr...
10       [bapak, semua, yth, kalian, semua, bangsa, ind...
11         [pemilu, terburuk, dan, paling, buruk, sedunia]
12       [ini, kah, yg, dimaksud, pemilu, berjalan, bai...
13       [anda, punya, bukti, jokowi, curangi, hasil, p...
14       [tidak, cukup, dengan, hanya, santunan, mereka...
15       [paham, sekali, ya, pak, bagian, yang, sering,...
16       [kata, siapa, ya, pemilu, ini, aman, dan, jurd.

### Slang Handling

In [25]:
# Most recently parsed slang dictionary, keyed by (path, mtime) so the file is
# read once per version instead of once per row.
_SLANG_CACHE = {}


def _load_slang_words(path):
    """Return the parsed slang dictionary at `path`, re-reading it only when the file's mtime changes."""
    cache_key = (path, os.path.getmtime(path))
    if cache_key not in _SLANG_CACHE:
        _SLANG_CACHE.clear()  # keep a single entry; older file versions are never needed again
        with open(path, 'r', encoding='utf-8') as handle:
            _SLANG_CACHE[cache_key] = json.load(handle)
    return _SLANG_CACHE[cache_key]


def slang(tokenized_):
    """Replace slang variants in a token list and return the tokens joined by single spaces.

    The mapping is read from `slang.json` in the current working directory.
    Each key of that JSON object is the replacement word and its value is the
    collection of variants that should be rewritten to the key.

    Parameters
    ----------
    tokenized_ : iterable of tokens for one document. It is not modified.

    Returns
    -------
    str
        The tokens, with matching variants replaced, joined with " ".
        An empty input yields "".

    Raises
    ------
    OSError
        If `slang.json` cannot be found or read.
    json.JSONDecodeError
        If `slang.json` is not valid JSON.
    """
    slang_word = _load_slang_words(os.path.join(os.getcwd(), 'slang.json'))
    normalized = []
    for token in tokenized_:
        # Walk the entries in file order and keep going after a hit, so a
        # replacement can itself be rewritten by a later entry.
        for key, value in slang_word.items():
            if any(token == v for v in value):
                token = key
        normalized.append(token)
    return " ".join(str(token) for token in normalized)

In [26]:
# Apply slang replacement row by row; each token list comes back as one space-joined string.
slang_cleaned = tokenized.apply(slang)

### Lemmatization

In [27]:
def lemmatize(slang_cleaned_):
    """Run one text through `nlp` and return the `lemma_` of every token as a list.

    NOTE(review): what `lemma_` contains depends on the lemma data available to
    `nlp`, which is created as a bare `Indonesian()` - confirm the installed
    spaCy version supplies lemmas for that setup.
    """
    parsed = nlp(slang_cleaned_)
    return [word.lemma_ for word in parsed]

In [28]:
# Lemmatize each slang-normalized string; the result is a Series of lemma lists.
lemmatized = slang_cleaned.apply(lemmatize)

### Stopword Removal

In [29]:
def stopword_removal(lemmatized_):
    """Return a new list with the items of `lemmatized_` that are not in `stopwords`.

    Order and duplicates of the remaining items are kept.
    """
    return [word for word in lemmatized_ if word not in stopwords]

In [30]:
# Drop stop words from every lemma list.
clean = lemmatized.apply(stopword_removal)

In [31]:
# Overwrites the `clean` column: it held the cleaned text strings from the
# Cleaning step and from here on holds the stop-word-filtered token lists.
df['clean'] = clean

In [32]:
# Preview the final token lists (after slang handling, lemmatization and stop-word removal).
clean

0         [korban, kalah, inget, loloskan, pemilu, rentak]
1        [penomena, kuat, lembaga, laksana, baca, dr, m...
2        [dengar, satgas, anti, pemilu, culas, cundang,...
3        [pemilu, ulang, broo, menang, spy, jokowi, sid...
4        [suka, cerita, pemilu, pinteran, dikit, moga, ...
5        [terap, iso, selesai, minggu, korban, romusha,...
6        [hayooo, simak, orasi, ketua, satgas, anti, cu...
7        [daerah, ricuh, dibilang, pemilu, lancar, aman...
8         [habis, pikir, pasang, badan, habis, an, pemilu]
9        [pemilu, curang, dr, pemilu, selenggarakan, ma...
10       [hormat, bangsa, indonesia, sih, pecah, belah,...
11                        [pemilu, terburuk, buruk, dunia]
12                    [kah, maksud, pemilu, jalan, lancar]
13          [bukti, jokowi, curangi, hasil, pemilu, nuduh]
14       [santun, merekapun, pemilu, jujur, adil, jujur...
15                       [paham, kali, ya, curang, pemilu]
16       [ya, pemilu, aman, jujur, adil, mohon, diseles.

### Export to csv