# Preparing Data

In [None]:
import pandas as pd 
import numpy as np
import nltk

In [None]:
# Load the raw tweets; the preview below shows the columns
# 'Unnamed: 0', 'date', 'username' and 'tweet'.
dataSB = pd.read_csv('tweets_1.csv') # file location

dataSB.head()

Unnamed: 0,date,username,tweet
0,2022-04-01 12:11:12,hooray4trees,the flu is a respiratory virus that disappears...
1,2022-04-01 05:22:28,PureShakti,disgraceful amp cowardly drmoore who has ab...
2,2022-04-01 03:32:45,ozjohnd,do we have a genuine incidence of long covid ...
3,2022-04-01 03:10:22,Variablefrog,woke the kids today with the news that schools...
4,2022-04-01 00:47:21,MsJKinderGators,i would be so grateful for any support helpin...


In [None]:
# ------ Case Folding --------
# Lower-case every tweet with pandas' vectorised Series.str.lower().
# The 'tweet' column is overwritten in place, so the original casing is lost.

dataSB['tweet'] = dataSB['tweet'].str.lower()

print('Case Folding Result : \n')
print(dataSB['tweet'].head(5))

Case Folding Result : 

0    the flu is a respiratory virus that disappears...
1    disgraceful   amp  cowardly drmoore who has ab...
2     do we have a genuine incidence of long covid ...
3    woke the kids today with the news that schools...
4     i would be so grateful for any support helpin...
Name: tweet, dtype: object


# Tokenizing

Menghapus karakter-karakter yang tidak penting dari text

In [None]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [None]:
# ------ Tokenizing ---------

# word_tokenize() needs the Punkt sentence models. Recent NLTK releases look
# them up under 'punkt_tab', older ones under 'punkt', so request both.
# nltk.download() only reports an error (and returns False) for an id the
# installed NLTK index does not know, so the extra call is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

def remove_tweet_special(text):
    """Strip tweet-specific noise from one tweet.

    Removes literal escape residue, non-ASCII characters, @mentions,
    #hashtags and links, and collapses whitespace runs to single spaces.

    Args:
        text: The tweet text (str).

    Returns:
        The cleaned text.
    """
    # remove literal (escaped) tab, new line and unicode markers, then any leftover backslash
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # remove non-ASCII (emoticons, Chinese characters, etc.); each one becomes '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mentions, hashtags and complete links.
    # Raw string: \w, \/ and \S are regex escapes, not (invalid) Python string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # remove incomplete URLs (a bare scheme with nothing after it)
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean every tweet in place (mentions, hashtags, links, non-ASCII).
dataSB['tweet'] = dataSB['tweet'].apply(remove_tweet_special)

#strip digits
def remove_number(text):
    """Return ``text`` with every run of decimal digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Drop digits from every tweet in place.
dataSB['tweet'] = dataSB['tweet'].apply(remove_number)

#strip ASCII punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

# Drop punctuation from every tweet in place.
dataSB['tweet'] = dataSB['tweet'].apply(remove_punctuation)

#trim leading & trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` without leading or trailing whitespace."""
    trimmed = text.strip()
    return trimmed

# Trim the ends of every tweet in place.
dataSB['tweet'] = dataSB['tweet'].apply(remove_whitespace_LT)

#collapse multiple whitespace into a single space
def remove_whitespace_multiple(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    Args:
        text: Input string.

    Returns:
        The string with tabs, newlines and repeated spaces collapsed to ' '.
    """
    # Raw string so \s reaches the regex engine instead of being an invalid
    # Python string escape (which warns on current Python versions).
    return re.sub(r'\s+', ' ', text)

# Collapse whitespace runs in every tweet in place.
dataSB['tweet'] = dataSB['tweet'].apply(remove_whitespace_multiple)

# drop stand-alone single letters
def remove_singl_char(text):
    """Delete every ASCII letter that forms a word on its own.

    Only the letter is removed; the whitespace around it is left as is.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Drop stand-alone single letters from every tweet in place.
dataSB['tweet'] = dataSB['tweet'].apply(remove_singl_char)

# NLTK word tokenize 
def word_tokenize_wrapper(text):
    """Tokenize one cleaned tweet with NLTK's ``word_tokenize``.

    Exists so the tokenizer can be passed to ``Series.apply``.
    """
    return word_tokenize(text)

# Store the token list of every cleaned tweet in a new 'tweet_tokens' column.
dataSB['tweet_tokens'] = dataSB['tweet'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(dataSB['tweet_tokens'].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Tokenizing Result : 

0    [the, flu, is, respiratory, virus, that, disap...
1    [disgraceful, amp, cowardly, drmoore, who, has...
2    [do, we, have, genuine, incidence, of, long, c...
3    [woke, the, kids, today, with, the, news, that...
4    [would, be, so, grateful, for, any, support, h...
Name: tweet_tokens, dtype: object


In [None]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from ``text``.

    Called below with one tweet's token list, so the counts are per token.
    """
    return FreqDist(text)

# One frequency distribution per tweet, kept in 'tweet_tokens_fdist'.
dataSB['tweet_tokens_fdist'] = dataSB['tweet_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
# most_common() turns each distribution into (token, count) pairs, most frequent first
print(dataSB['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(it, 4), (respiratory, 3), (is, 2), (virus, 2...
1    [(covid, 3), (amp, 2), (from, 2), (this, 2), (...
2    [(of, 5), (the, 4), (do, 2), (in, 2), (yet, 2)...
3    [(the, 4), (in, 2), (to, 2), (woke, 1), (kids,...
4    [(to, 2), (would, 1), (be, 1), (so, 1), (grate...
Name: tweet_tokens_fdist, dtype: object


# Stopwords
Membersihkan teks dari kata-kata yang termasuk ke dalam stopwords

In [None]:
# Fetch the NLTK stopword corpus read by stopwords.words() in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# Indonesian stopword list shipped with NLTK.
# NOTE(review): the tweets previewed in this notebook are mostly English and the
# printed result still contains 'the', 'is', 'that' - confirm Indonesian is intended.
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manually add stopwords  ----------------------------------
# append additional (mostly informal Indonesian) stopwords; 'rt' is listed twice,
# which is harmless once the list becomes a set below
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah', 'bisnis', 'pandemi', 'indonesia'])

# convert the list to a set (not a dictionary) for fast membership tests
list_stopwords = set(list_stopwords)

#drop stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    ``list_stopwords`` is read from the notebook namespace at call time;
    token order and duplicates are preserved.
    """
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Tokens without stopwords go to 'tweet_tokens_WSW' (WSW = without stopwords, presumably).
dataSB['tweet_tokens_WSW'] = dataSB['tweet_tokens'].apply(stopwords_removal) 

print(dataSB['tweet_tokens_WSW'].head())
# Dump the whole working frame (index included) to CSV.
# NOTE(review): the normalisation cell reads this same file name back as its
# word-replacement table - confirm that is intended.
dataSB.to_csv('normalisaasi_tweets.csv')

0    [the, flu, is, respiratory, virus, that, disap...
1    [disgraceful, cowardly, drmoore, who, has, abs...
2    [do, we, have, genuine, incidence, of, long, c...
3    [woke, the, kids, today, with, the, news, that...
4    [would, be, so, grateful, for, any, support, h...
Name: tweet_tokens_WSW, dtype: object


# Normalisasi
Mengganti kata-kata tertentu dengan kata lain yang lebih tepat

In [None]:
# NOTE(review): 'normalisaasi_tweets.csv' is the file the stopword cell writes with
# dataSB.to_csv(), so its first two columns are row numbers rather than
# (informal word, replacement) pairs and no token will ever match. Point this at
# the real normalisation lexicon if one exists.
normalizad_word = pd.read_csv('normalisaasi_tweets.csv') # file location

# Map column 0 -> column 1, keeping the first replacement seen for each key.
# Columns are selected by position with .iloc; `row[0]` on a label-indexed row
# relies on the deprecated positional fallback of Series.__getitem__.
normalizad_word_dict = {}
for source_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(source_term, replacement)

def normalized_term(document):
    """Replace each token that has an entry in ``normalizad_word_dict``.

    Args:
        document: Iterable of tokens.

    Returns:
        A new list of the same length; tokens without an entry are kept as is.
    """
    return [normalizad_word_dict.get(term, term) for term in document]

dataSB['tweet_normalized'] = dataSB['tweet_tokens_WSW'].apply(normalized_term)

dataSB['tweet_normalized'].head(10)

0    [the, flu, is, respiratory, virus, that, disap...
1    [disgraceful, cowardly, drmoore, who, has, abs...
2    [do, we, have, genuine, incidence, of, long, c...
3    [woke, the, kids, today, with, the, news, that...
4    [would, be, so, grateful, for, any, support, h...
5    [pembelajaran, jarak, pjj, terpaksa, digelar, ...
6    [more, than, two, years, into, the, covid, pan...
7    [ty, for, your, leadership, constitutional, am...
8    [the, covid, pandemic, will, come, to, an, end...
9    [we, have, experienced, learning, loss, from, ...
Name: tweet_normalized, dtype: object

In [None]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create the Sastrawi stemmer once; it is reused for every term
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem a single term
def stemmed_wrapper(term):
    """Return the stem of ``term`` as produced by the shared Sastrawi ``stemmer``."""
    return stemmer.stem(term)

# Collect every distinct token so each one is stemmed only once (next cell).
# Values start as a ' ' placeholder and are overwritten with the stem later.
term_dict = {}

# NOTE: the loop variable `document` stays bound to the last tweet's token list
# after this loop; a later cell appends to it and format_topics_sentences uses it
# as a default argument.
for document in dataSB['tweet_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
# number of distinct tokens
print(len(term_dict))

# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize
  
# ps = PorterStemmer()

# # stemmed
# def stemmed_wrapper(term):
#     return ps.stem(term)

# term_dict = {}

# for document in dataSB['tweet_normalized']:
#     for term in document:
#         if term not in term_dict:
#             term_dict[term] = ' '


1921


In [None]:
# Fill term_dict with token -> stem; only values change, so iterating the dict is safe.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    
    # to inspect the result, uncomment the line below
    # print(term,":" ,term_dict[term])

In [None]:
# map every token of a document to its precomputed stem
def get_stemmed_term(document):
    """Look up the stem of each token of ``document`` in ``term_dict``.

    Raises ``KeyError`` for a token that was never added to ``term_dict``.
    """
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

# Stemmed token lists go to 'tweet_tokens_stemmed'; swifter's apply shows a progress bar.
dataSB['tweet_tokens_stemmed'] = dataSB['tweet_normalized'].swifter.apply(get_stemmed_term)

print(dataSB['tweet_tokens_stemmed'])

Pandas Apply:   0%|          | 0/251 [00:00<?, ?it/s]

0      [the, flu, is, respiratory, virus, that, disap...
1      [disgraceful, cowardly, drmoore, who, has, abs...
2      [do, we, have, genuine, incidence, of, long, c...
3      [woke, the, kids, today, with, the, news, that...
4      [would, be, so, grateful, for, any, support, h...
                             ...                        
246    [learning, loss, will, be, lasting, legacy, of...
247    [you, throw, out, all, these, links, at, me, t...
248    [learning, loss, will, be, lasting, legacy, of...
249    [learning, loss, will, be, lasting, legacy, of...
250    [these, are, the, ppl, who, will, run, india, ...
Name: tweet_tokens_stemmed, Length: 251, dtype: object


In [None]:
#stopwords #2

from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# Second stopword pass: stopwords.words() is called without a language here.
# NOTE(review): that appears to return the stopwords of every bundled language -
# English function words ('the', 'is', ...) are gone from the printed result - confirm.
# This rebinds the `list_stopwords` built in the first stopword cell.
list_stopwords = stopwords.words()


# ---------------------------- manually add stopwords  ----------------------------------
# append additional stopwords
list_stopwords.extend(["tmrw", "tan", "ton", "pt", "komentar", "juta", "unit", "menang", "artikel", 
                       "smartphone", "tagar", "sedia", "kaskus", "seksi", "kl", 'bs'])

# convert the list to a set (not a dictionary) for fast membership tests
list_stopwords = set(list_stopwords)

#drop stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in the current ``list_stopwords``.

    The set is read from the notebook namespace at call time; order and
    duplicates of the remaining tokens are preserved.
    """
    return list(filter(lambda word: word not in list_stopwords, words))

# Final cleaned tokens: stemmed tokens minus the second stopword set.
dataSB['tweet_tokens_stemmed2'] = dataSB['tweet_tokens_stemmed'].apply(stopwords_removal) 

print(dataSB['tweet_tokens_stemmed2'].head())

0    [flu, respiratory, virus, disappears, complete...
1    [disgraceful, cowardly, drmoore, absconded, du...
2    [genuine, incidence, long, covid, children, ye...
3    [woke, kids, today, news, schools, new, zealan...
4    [would, grateful, support, helping, start, sch...
Name: tweet_tokens_stemmed2, dtype: object


In [None]:
# Collect one token list per tweet into a fresh Python list.
# The previous loop appended to `document`, which was only the variable left over
# from the `for document in dataSB['tweet_normalized']` loop, i.e. the last tweet's
# token list. That silently grew that DataFrame cell and depended on hidden state.
# 'tweet_tokens_WSW' is the column positional index 6 referred to, given the
# column order built up in the cells above.
document = dataSB['tweet_tokens_WSW'].tolist()

document[0:5]

['these', 'are', 'the', 'ppl', 'who']

In [None]:
# Corpus fed to gensim: the fully cleaned token lists, one per tweet.
doc_clean = dataSB['tweet_tokens_stemmed2']
doc_clean

0      [flu, respiratory, virus, disappears, complete...
1      [disgraceful, cowardly, drmoore, absconded, du...
2      [genuine, incidence, long, covid, children, ye...
3      [woke, kids, today, news, schools, new, zealan...
4      [would, grateful, support, helping, start, sch...
                             ...                        
246    [learning, loss, lasting, legacy, covid, safet...
247    [throw, links, trying, justify, position, with...
248    [learning, loss, lasting, legacy, covid, safet...
249    [learning, loss, lasting, legacy, covid, safet...
250    [ppl, run, india, get, elected, imagine, misse...
Name: tweet_tokens_stemmed2, Length: 251, dtype: object

# LDA model using gensim
Proses topic modeling dengan LDA gensim

In [None]:
import gensim
from gensim import corpora

# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(doc_clean)
print(dictionary)

# Bag-of-words form of each tweet, built with dictionary.doc2bow().
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Dictionary(1787 unique tokens: ['brain', 'completely', 'component', 'covid', 'damage']...)


In [None]:
# Alias for gensim's LDA model class (instantiated in the next cell)
Lda = gensim.models.ldamodel.LdaModel

total_topics = 3 # number of topics to extract
number_words = 10 # number of words shown per topic

In [None]:
# Train the LDA model on the document-term matrix.
# random_state fixes gensim's random initialisation so that Restart & Run All
# reproduces the same topics; without it every run yields different ones.
lda_model = Lda(doc_term_matrix, num_topics=total_topics, id2word = dictionary, passes=50, random_state=42)

lda_model.show_topics(num_topics=total_topics, num_words=number_words)

[(0,
  '0.049*"learning" + 0.045*"covid" + 0.043*"loss" + 0.012*"school" + 0.011*"children" + 0.009*"parents" + 0.008*"caused" + 0.007*"kids" + 0.007*"students" + 0.007*"pandemic"'),
 (1,
  '0.041*"learning" + 0.037*"covid" + 0.035*"loss" + 0.013*"education" + 0.012*"school" + 0.006*"pandemic" + 0.005*"schools" + 0.005*"students" + 0.004*"million" + 0.004*"teachers"'),
 (2,
  '0.042*"learning" + 0.042*"covid" + 0.040*"loss" + 0.013*"schools" + 0.011*"school" + 0.010*"students" + 0.009*"years" + 0.008*"kids" + 0.007*"pandemic" + 0.006*"new"')]

In [None]:
# Word Count of Topic Keywords
# For every keyword of every topic: its weight in the topic and how often it
# occurs in the whole cleaned corpus.

from collections import Counter
# formatted=False returns (topic_id, [(word, weight), ...]) pairs, as unpacked below
topics = lda_model.show_topics(formatted=False)
# all tokens of all tweets in one flat list
data_flat = [w for w_list in doc_clean for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df_imp_wcount = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count']) 
print(df_imp_wcount)

         word  topic_id  importance  word_count
0    learning         0    0.048676         291
1       covid         0    0.044855         274
2        loss         0    0.042979         263
3      school         0    0.011532          76
4    children         0    0.010738          39
5     parents         0    0.008598          23
6      caused         0    0.008342          22
7        kids         0    0.006923          37
8    students         0    0.006817          50
9    pandemic         0    0.006665          43
10   learning         1    0.040697         291
11      covid         1    0.036571         274
12       loss         1    0.035143         263
13  education         1    0.012757          43
14     school         1    0.012303          76
15   pandemic         1    0.005667          43
16    schools         1    0.005439          43
17   students         1    0.005107          50
18    million         1    0.004040           9
19   teachers         1    0.004022     

In [None]:
# from google.colab import drive
# drive.mount('drive')

In [None]:
#simpan ke google drive
# df_imp_wcount.to_csv('df_imp_wcount.csv')
# !cp df_imp_wcount.csv "drive/My Drive/Colab Notebooks/LDA Indonesia/"

In [None]:
#jika simpan ke local drive
#filedisimpan='df_imp_wcount.xlsx'
#df_imp_wcount.to_excel(filedisimpan, index = False, header=True)

In [None]:
#Dominant topic and its percentage contribution in each document
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Tabulate the dominant topic of every document.

    Args:
        ldamodel: Trained topic model. ``ldamodel[corpus]`` must yield, per
            document, a list of ``(topic_id, probability)`` pairs (or a tuple
            whose first item is that list when ``ldamodel.per_word_topics`` is
            true), and ``ldamodel.show_topic(topic_id)`` must return
            ``(word, weight)`` pairs.
        corpus: Bag-of-words corpus. When omitted, the notebook-level
            ``doc_term_matrix`` is used, looked up at call time.
        texts: Per-document texts appended as the last column. When omitted,
            the notebook-level ``document`` is used, looked up at call time.

    Returns:
        DataFrame with the columns ``Dominant_Topic`` (int topic id),
        ``Perc_Contribution`` (probability rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated topic words), followed by one
        column holding ``texts``. Rows are in corpus order.

    Raises:
        ValueError: If ``ldamodel`` is not given.
    """
    if ldamodel is None:
        raise ValueError("format_topics_sentences() requires a trained ldamodel")
    # Resolve the notebook-level defaults lazily instead of freezing them at
    # definition time, so a rebuilt corpus is picked up and the cell can be
    # defined before those variables exist.
    if corpus is None:
        corpus = doc_term_matrix
    if texts is None:
        texts = document

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # keep one record per document so rows stay aligned with `texts`
            records.append((None, None, None))
            continue
        # dominant topic = highest probability (first one wins on ties)
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append() was removed in pandas 2.0 and
    # growing a frame row by row is quadratic anyway.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output, aligned by position
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
# Dominant topic per tweet, with the cleaned tokens attached as the text column.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=doc_clean)

# Format: expose the row index as a document number and give the columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.to_csv('dominant_topic.csv')
print(df_dominant_topic.head(10))

   Document_No  Dominant_Topic  Topic_Perc_Contrib  \
0            0             2.0              0.9689   
1            1             0.0              0.9719   
2            2             2.0              0.9726   
3            3             2.0              0.9616   
4            4             0.0              0.9688   
5            5             1.0              0.9591   
6            6             2.0              0.9610   
7            7             2.0              0.9701   
8            8             0.0              0.9354   
9            9             2.0              0.9631   

                                            Keywords  \
0  learning, covid, loss, schools, school, studen...   
1  learning, covid, loss, school, children, paren...   
2  learning, covid, loss, schools, school, studen...   
3  learning, covid, loss, schools, school, studen...   
4  learning, covid, loss, school, children, paren...   
5  learning, covid, loss, education, school, pand...   
6  learning, 

In [None]:
# Print the whole dominant-topic table (pandas truncates the middle rows).
print(df_dominant_topic)



     Document_No  Dominant_Topic  Topic_Perc_Contrib  \
0              0             2.0              0.9689   
1              1             0.0              0.9719   
2              2             2.0              0.9726   
3              3             2.0              0.9616   
4              4             0.0              0.9688   
..           ...             ...                 ...   
246          246             1.0              0.8944   
247          247             1.0              0.9698   
248          248             1.0              0.8943   
249          249             1.0              0.8944   
250          250             1.0              0.9744   

                                              Keywords  \
0    learning, covid, loss, schools, school, studen...   
1    learning, covid, loss, school, children, paren...   
2    learning, covid, loss, schools, school, studen...   
3    learning, covid, loss, schools, school, studen...   
4    learning, covid, loss, school, c

In [None]:
#simpan ke google drive
# df_dominant_topic.to_csv('df_dominant_topic.csv')
# !cp df_dominant_topic.csv "drive/My Drive/Colab Notebooks/LDA Indonesia/"

In [None]:
# jika simpan ke local drive
# filedisimpan='df_dominant_topic.xlsx'
# df_dominant_topic.to_excel(filedisimpan, index = False, header=True)

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [None]:
import os
# Relative path (current working directory) of the pickled pyLDAvis data,
# suffixed with the number of topics.
LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(total_topics))

In [None]:
# Bag-of-words corpus for pyLDAvis; built the same way as doc_term_matrix above.
corpus = [dictionary.doc2bow(text) for text in doc_clean]

In [None]:
# this step may take a while
# Prepare the visualisation data and cache it on disk as a pickle.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

In [None]:
# load the pre-prepared pyLDAvis data from disk
# (this is the file written by the previous cell; pickle.load must only be used
# on files you created yourself)
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

In [None]:
# Write the interactive visualisation next to the notebook. The previous path
# started with '/', i.e. the filesystem root, which is usually not writable
# outside Colab and did not match the relative path used for the pickle.
pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_' + str(total_topics) + '.html')

In [None]:
# this step may take a while
# Last expression of the cell: renders the interactive pyLDAvis view inline.
LDAvis_prepared