In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.colors as mcolors
import ast
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

from pprint import pprint
from tqdm import tqdm


from collections import Counter
from translate import Translator
from pysentimiento import create_analyzer
from geneticalgorithm2 import geneticalgorithm2 as ga
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities
from gensim.models import LdaMulticore, CoherenceModel
from gensim.utils import ClippedCorpus
from nltk.tokenize import word_tokenize
from yellowbrick.cluster.elbow import KElbowVisualizer

import warnings
warnings.filterwarnings("ignore")

In [None]:
unstemmed = pd.read_csv('./data/unstemmed_REVISI.csv')
stemmed = pd.read_csv('./data/stemmed_REVISI.csv')
ndata = pd.read_csv('./data/data_non_duplicate_tweets_REVISI.csv')

In [None]:
# def add_hashtag(hashtags):
#     return ["#" + hashtag for hashtag in ast.literal_eval(hashtags)]

def text_with_hashtag(texts, hashtags):
    """Append each row's hashtags to its text, separated by one space.

    Parameters
    ----------
    texts : pandas.Series
        Tweet texts (anything that supports ``+`` with a Series also works).
    hashtags : iterable of str
        One Python-literal list per row, e.g. ``"['tag1', 'tag2']"``; each is
        parsed with ``ast.literal_eval`` and joined with spaces.

    Returns
    -------
    pandas.Series
        ``text + " " + "tag1 tag2"`` for every row.
    """
    joined = [" ".join(ast.literal_eval(raw)) for raw in hashtags]

    # Re-use the index of `texts` so the addition lines up row by row even
    # when `texts` does not carry a default 0..n-1 index (e.g. after dropna).
    index = None
    if isinstance(texts, pd.Series) and len(texts) == len(joined):
        index = texts.index

    return texts + " " + pd.Series(joined, index=index)

In [None]:
# Each `hashtags` cell holds a stringified Python list (e.g. "['a', 'b']");
# parse it and flatten it into one space-separated string per tweet.
for frame in (stemmed, unstemmed):
    frame['hashtags_joined'] = frame.hashtags.apply(
        lambda raw: " ".join(ast.literal_eval(raw))
    )

In [None]:
# Remove rows whose `text` is missing (modifies `unstemmed` in place).
# NOTE(review): only `unstemmed` is filtered here, while the `views` table
# pairs `unstemmed` and `stemmed` columns purely by position — confirm both
# frames still have the same rows in the same order after this step.
unstemmed.dropna(subset=['text'], inplace=True)

In [None]:
len(stemmed), len(unstemmed)

In [None]:
# Working table with one row per tweet. Columns are taken as plain lists, so
# rows from `unstemmed` and `stemmed` are matched purely by position.
views = pd.DataFrame({
    'unstemmed_text': unstemmed.text.tolist(),
    'stemmed_text': stemmed.text.tolist(),
    'created_at': unstemmed.created_at.tolist(),
    'hashtags': stemmed.hashtags.tolist(),
    'users': unstemmed['users.username'].tolist(),
})

##### Sentiment Analysis

In [None]:
# Start the sentiment table from an independent copy of the tweet texts.
sentimentdf = ndata[['text']].copy()

In [None]:
# https://chatbotslife.com/indonesian-twitter-sentiment-analysis-using-pretrained-neural-network-transformer-bert-97ca96a4aa60
pretrained_id = "mdhugol/indonesia-bert-sentiment-classification"
label_id = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_id)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_id)
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# Classify every tweet and keep the top prediction's readable label and score.
statuses, scores = [], []
for st in tqdm(sentimentdf.text):
    top_prediction = sentiment_analysis(st)[0]
    statuses.append(label_id[top_prediction['label']])
    scores.append(top_prediction['score'])

In [None]:
sentimentdf['sentiment'] = statuses
sentimentdf['sentiment_scores'] = scores

In [None]:
sentimentdf.sentiment.value_counts()

##### Emotion Analysis

In [None]:
# Emotion classifier for Indonesian text. `pipeline` is already imported in
# the notebook's import cell, so it is not re-imported here.
pretrained_name = "StevenLimcorn/indonesian-roberta-base-emotion-classifier"
nlp = pipeline(
    "sentiment-analysis",
    model=pretrained_name,
    tokenizer=pretrained_name
)

In [None]:
# Predict the dominant emotion of every tweet.
emotions = []
emotions_scores = []
for st in tqdm(sentimentdf.text):
    # Run the model once per tweet and read both fields from the same
    # prediction (calling the pipeline twice doubles the inference time).
    prediction = nlp(st)[0]
    emotion = prediction['label']
    score = prediction['score']
    emotions_scores.append(score)
    emotions.append(emotion)

In [None]:
sentimentdf['emotions'] = emotions
sentimentdf['emotions_scores'] = emotions_scores

In [None]:
sentimentdf.emotions.value_counts()

In [None]:
sentimentdf.head(3)

In [None]:
sentimentdf.to_csv('sentimentdf.csv')

In [None]:
sentimentdf = pd.read_csv('sentimentdf.csv')

In [None]:
# Drop the index column written by `to_csv`. `errors='ignore'` keeps this cell
# safe to re-run once the column is already gone.
sentimentdf.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
sentimentdf.emotions.value_counts()

In [None]:
sentimentdf.sentiment.value_counts()

In [None]:
sentimentdf.head()

//

In [None]:
views.head()

# TF-IDF

In [None]:
def generate_tfIdf(array):
    """Build TF-IDF representations of `array` and rank terms by total weight.

    The vocabulary of both vectorizers is capped at ``len(array)`` features.

    Returns a 5-tuple:
        tfs       -- sparse matrix from ``TfidfVectorizer.fit_transform``
        terms     -- feature names of the fitted ``TfidfVectorizer``
        tfidf     -- the fitted ``TfidfVectorizer`` itself
        tfidf_mat -- dense matrix: L1-normalised term counts times IDF
        ranking   -- DataFrame ['term', 'rank'], where rank is the rounded
                     column sum of `tfidf_mat`, sorted descending
    """
    vocab_cap = len(array)

    # Term frequencies: raw counts scaled so that each document row sums to 1.
    raw_counts = CountVectorizer(max_features=vocab_cap).fit_transform(array)
    normalized_TF_vector = normalize(raw_counts, norm='l1', axis=1)

    # Inverse document frequencies come from a separately fitted vectorizer.
    # NOTE(review): this relies on both vectorizers producing the same column
    # order — they are configured with the same `max_features`; confirm.
    tfidf = TfidfVectorizer(max_features=vocab_cap, smooth_idf=False)
    tfs = tfidf.fit_transform(array)

    # Multiply TF by IDF to obtain the TF-IDF matrix.
    tfidf_mat = normalized_TF_vector.multiply(tfidf.idf_).toarray()
    terms = tfidf.get_feature_names_out()

    # Total TF-IDF weight of each term across all documents, paired with the term.
    sums = tfidf_mat.sum(axis=0)
    data = list(zip(terms, np.round(sums)))

    ranking = (
        pd.DataFrame(data, columns=['term', 'rank'])
        .sort_values('rank', ascending=False)
        .reset_index(drop=True)
    )

    return tfs, terms, tfidf, tfidf_mat, ranking

In [None]:
unstemmed_text_tfs, unstemmed_text_terms, unstemmed_text_tfidf, unstemmed_text_tfidf_mat, unstemmed_text_ranking = generate_tfIdf(list(views.unstemmed_text))

In [None]:
stemmed_text_tfs, stemmed_text_terms, stemmed_text_tfidf, stemmed_text_tfidf_mat, stemmed_text_ranking = generate_tfIdf(list(views.stemmed_text))

In [None]:
hashtag_tfs, hashtag_terms, hashtag_tfidf, hashtag_tfidf_mat, hashtag_ranking = generate_tfIdf(list(views.hashtags))

In [None]:
stemmed_text_dist = 1 - cosine_similarity(stemmed_text_tfidf_mat)

In [None]:
unstemmed_text_dist = 1 - cosine_similarity(unstemmed_text_tfidf_mat)

In [None]:
hastag_dist = 1 - cosine_similarity(hashtag_tfidf_mat)

##### TF-IDF for Hashtags

In [None]:
# small sample

# TODO https://smyachenkov.com/posts/categorizing-instagram-tags-with-k-means/

##### Hashtags with the most appearances

In [None]:
hashtag_ranking[:50]

# Hashtags K-Means Clustering

In [None]:
# Use the elbow method to choose the number of clusters

In [None]:
# Elbow plot over the candidate cluster counts given by k=(2,15), computed on
# the dense hashtag TF-IDF matrix.
# NOTE(review): this rebinds `model`, which earlier holds the BERT sentiment
# classifier — re-running the sentiment pipeline cell after this one would
# pick up the KMeans estimator instead.
# NOTE(review): n_init=1 and max_iter=5 keep the fit fast but may stop before
# convergence — confirm this is intentional.
model = KMeans(random_state=42, n_init=1, init='k-means++', verbose=True, max_iter=5)
visualizer = KElbowVisualizer(model, k=(2,15))

visualizer.fit(hashtag_tfs.toarray())        # Fit the data to the visualizer
visualizer.show()

In [None]:
# Cluster the hashtag TF-IDF vectors into a fixed number of groups.
num_cluster_hashtag = 3
km_hashtag = KMeans(num_cluster_hashtag, random_state=123, n_init=1, init='k-means++', verbose=True, max_iter=5)
km_hashtag.fit(hashtag_tfs)
# Cluster assignment of every training row; the following cells read
# `km_hashtag.labels_` rather than `y_km`.
y_km = km_hashtag.predict(hashtag_tfs)

In [None]:
hashtag_clusters = km_hashtag.labels_.tolist()

In [None]:
views['hashtag_clusters'] = hashtag_clusters

In [None]:
hashtag_feature_name = hashtag_tfidf.get_feature_names_out()
hashtag_top_features = 30
# Feature indices of every centroid, ordered from heaviest to lightest weight.
hashtag_ordered_centroid = km_hashtag.cluster_centers_.argsort()[:,::-1]

cluster_ids = range(num_cluster_hashtag)

# 1-based cluster labels as strings.
hashtag_clusters = [str(cluster_id + 1) for cluster_id in cluster_ids]

# Top-weighted terms of each centroid.
hashtag_key_features = [
    [hashtag_feature_name[index]
     for index in hashtag_ordered_centroid[cluster_id, :hashtag_top_features]]
    for cluster_id in cluster_ids
]

# Raw hashtag strings of the tweets assigned to each cluster.
final_hashtags = [
    views[views['hashtag_clusters']==cluster_id]['hashtags'].values.tolist()
    for cluster_id in cluster_ids
]

In [None]:
# Number of tweets that landed in each cluster.
final_hashtags_count = [len(ht) for ht in final_hashtags]

# Repeat each cluster label / key-feature list once per tweet in that cluster
# so the flat lists below line up row by row.
final_hashtags_clusters = [
    cluster
    for cluster, size in zip(hashtag_clusters, final_hashtags_count)
    for _ in range(size)
]
final_hashtags_key_features = [
    key
    for key, size in zip(hashtag_key_features, final_hashtags_count)
    for _ in range(size)
]

# All hashtag strings, cluster after cluster.
final_hashtags_1 = [ht for hashtag in final_hashtags for ht in hashtag]

In [None]:
print("Tweets count each cluster: \n")
for i in range(len(hashtag_clusters)):
    print(f"Cluster {i+1}: {final_hashtags_count[i]}") 

In [None]:
results = pd.DataFrame([final_hashtags_clusters, final_hashtags_key_features, final_hashtags_1]).T

In [None]:
results.columns = ['cluster', 'key_features', 'hashtag']

In [None]:
results.key_features.apply(str).unique()

In [None]:
# One keyword list per cluster, copied straight from `hashtag_key_features`.
# This avoids the str -> unique -> literal_eval round trip, which would merge
# clusters with identical keywords and drop empty clusters (shifting the
# cluster numbers printed below). Copies keep later edits from touching the
# original lists.
results_2 = [list(features) for features in hashtag_key_features]

In [None]:
# results_2[1].pop(1)
# results_2[3].pop(0)
# results_2[3].pop(1)

In [None]:
for i, c in enumerate([" ".join(joined) for joined in results_2]):
    print(f"Cluster {i+1}: {c}")

# Text Topic Modelling using LDA

In [None]:
tmdata = pd.read_csv('./data/nddataTopicModelling.csv')

In [None]:
tmdata.dropna(subset=['text'], inplace=True)

In [None]:
tokenized_text = [word_tokenize(text) for text in tmdata.text]

In [None]:
# Build a Gensim dictionary (token <-> id mapping) from the tokenized tweets
dictionary = corpora.Dictionary(tokenized_text)

# Filter extreme tokens (similar to the min/max df step used when creating the
# tf-idf matrix): no_below=1 and no_above=0.8 are the thresholds passed here
dictionary.filter_extremes(no_below=1, no_above=0.8)

# Convert every tokenized document to its bag-of-words form via the dictionary
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

##### Baseline model

In [None]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=5, 
                         random_state=100,
                         chunksize=100,
                         passes=10,
                         per_word_topics=True)

In [None]:
# Print the keywords of every topic in the baseline model (num_topics=5)
pprint(lda_model.print_topics())
# Apply the model to the corpus; `doc_lda` is not referenced again in the cells shown here
doc_lda = lda_model[corpus]

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_text, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

##### Hyperparameter tuning
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0


In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaMulticore``
    dictionary : Gensim dictionary used as ``id2word`` and by the coherence model
    k : number of topics
    a : value passed as the LDA ``alpha``
    b : value passed as the LDA ``eta``
    texts : tokenized documents used for the coherence computation. Defaults
        to the notebook-level ``tokenized_text`` so existing calls keep working.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` with ``coherence='c_v'``.
    """
    if texts is None:
        # Backward-compatible fallback to the global tokenized corpus.
        texts = tokenized_text

    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=k,
                             random_state=100,
                             chunksize=100,
                             passes=10,
                             alpha=a,
                             eta=b)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# grid = {}
# grid['Validation_Set'] = {}

# # Topics range
# min_topics = 2
# max_topics = 11
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)

# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')

# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')

# # Validation sets
# num_of_docs = len(corpus)
# corpus_sets = [ClippedCorpus(corpus, int(num_of_docs*0.75)), 
#                corpus]

# corpus_title = ['75% Corpus', '100% Corpus']

# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }

# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterare through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary, 
#                                                   k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)

In [None]:
    # pd.DataFrame(model_results).to_csv('./data/lda_tuning_results.csv', index=False)
    # pbar.close()

In [None]:
df_topics = pd.read_csv('./data/lda_tuning_results.csv')

In [None]:
# For every distinct topic count keep the first row of the tuning results
# (in order of appearance) and read its coherence value.
first_row_per_topic = df_topics.drop_duplicates(subset='Topics')
coh_values = list(first_row_per_topic.Coherence.values)
df_num_topics = first_row_per_topic.Topics.tolist()

In [None]:
fig, ax = plt.subplots()
ax.plot(df_num_topics, coh_values)

# The x-axis holds the candidate topic counts (`df_num_topics`), not alpha.
ax.set(xlabel='Number of Topics', ylabel='Coherence',
       title='Topic Coherence: Determining optimal number of topics')
ax.grid()

# fig.savefig("topic_coherence.png")
plt.show()

In [None]:
max(df_topics[df_topics.Topics == 4].Coherence)

In [None]:
df_topics[df_topics.Topics == 4] #alpha = 0.01, beta=0.61
#alpha = 0.0909, beta= 0.61

##### Final model

In [None]:
# Final LDA model: 4 topics with alpha=0.01 and eta (beta)=0.61.
# NOTE(review): these values appear to be read off the tuning results table
# shown above — confirm they still match it after any re-tuning.
num_topics=4
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics, 
                         random_state=100,
                         chunksize=100,
                         passes=10,
                         alpha=0.01,
                         eta=0.61)

In [None]:
# topics_matrix
topics_matrix = lda_model.show_topics(formatted=False, num_words=30)
topics_matrix = np.array(topics_matrix, dtype=object)

topic_words = topics_matrix[:,:]

In [None]:
pprint(lda_model.print_topics())

In [None]:
for i in topic_words:
    print([str(word) for word in i])
    print()

##### Dominant topic and its percentage contribution in each document

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Return the dominant topic of every document alongside the document itself.

    Parameters
    ----------
    ldamodel : trained LDA model supporting ``ldamodel[corpus]``,
        ``ldamodel.per_word_topics`` and ``ldamodel.show_topic(topic_id)``.
    corpus : bag-of-words corpus to run through the model.
    texts : sequence of documents, in the same order as `corpus`; appended
        as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals), ``Topic_Keywords`` (comma-separated words of the dominant
        topic) and a final column ``0`` holding `texts`.
    """
    records = []
    keywords_by_topic = {}  # topic id -> joined keyword string, computed once

    # Find the main topic of each document
    for row_list in tqdm(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder so the rows stay aligned with `texts`.
            records.append((np.nan, np.nan, ""))
            continue

        # Highest-probability topic; on ties the first one listed wins.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)

        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row
    # (`DataFrame.append` no longer exists in pandas 2.x).
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, tokenized_text)

In [None]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(20)

##### The most representative sentence for each topic

In [None]:
# Let long text columns show up to 100 characters
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every dominant topic keep the single row with the highest contribution
top_row_per_topic = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]
sent_topics_sorteddf_mallet = pd.concat(top_row_per_topic, axis=0)

# Renumber the rows and give the columns presentation names
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet

##### Frequency Distribution of Word Counts in Documents

In [None]:
# Length of every document in the Text column (a token list per document)
doc_lens = [len(d) for d in df_dominant_topic.Text]

# Histogram of document lengths
plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, bins = 20, color='navy')

# Summary statistics printed inside the plot: (y position, label, value)
summary_stats = [
    (10000, "Mean   : ", np.mean(doc_lens)),
    (9000, "Median : ", np.median(doc_lens)),
    (8000, "Stdev   : ", np.std(doc_lens)),
    (7000, "1%ile    : ", np.quantile(doc_lens, q=0.01)),
    (6000, "99%ile  : ", np.quantile(doc_lens, q=0.99)),
]
for y_pos, label, value in summary_stats:
    plt.text(30, y_pos, label + str(round(value)))

plt.gca().set(xlim=(0, 40), ylabel='Number of Documents', xlabel='Document Word Count')
plt.tick_params(size=16)
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

##### Word Counts of Topic Keywords

In [None]:
temptopics = lda_model.show_topics(formatted=False, num_words=30)

In [None]:
# Hand-picked keywords (ten per topic) taken from the top-30 (word, weight)
# list of each topic in `temptopics`; topic1 and topic2 each include one
# literal (word, weight) pair.
# NOTE(review): the positional indices and literal weights presumably belong
# to one specific training run — confirm them whenever the model is retrained.
topic1 = [temptopics[0][1][0], temptopics[0][1][1], ('uuciptakerja', 0.008051984), temptopics[0][1][3], 
         temptopics[0][1][4], temptopics[0][1][6], temptopics[0][1][7], temptopics[0][1][8], temptopics[0][1][9], temptopics[0][1][13]]
topic2 = [temptopics[1][1][0], temptopics[1][1][2], ('phk', 0.0132331895), temptopics[1][1][7], temptopics[1][1][8], temptopics[1][1][11],
         temptopics[1][1][14], temptopics[1][1][16], temptopics[1][1][20], temptopics[1][1][22]]
topic3 = [temptopics[2][1][0], temptopics[2][1][1], temptopics[2][1][2], temptopics[2][1][4], temptopics[2][1][5], temptopics[2][1][6],
         temptopics[2][1][7], temptopics[2][1][8], temptopics[2][1][14], temptopics[2][1][19]]
topic4 = [temptopics[3][1][0], temptopics[3][1][1], temptopics[3][1][4], temptopics[3][1][6], temptopics[3][1][7], temptopics[3][1][9],
         temptopics[3][1][10], temptopics[3][1][13], temptopics[3][1][17], temptopics[3][1][28]]

In [None]:
# Replace the raw model output with the curated keyword lists, as
# (topic_id, keywords) pairs numbered from 0.
temptopics = list(enumerate([topic1, topic2, topic3, topic4]))

In [None]:
# Curated (topic_id, [(word, weight), ...]) pairs used for plotting
topics = list(enumerate([topic1, topic2, topic3, topic4]))

# Corpus-wide frequency of every token
data_flat = [w for w_list in tokenized_text for w in w_list]
counter = Counter(data_flat)

# One row per (topic, keyword): weight within the topic and corpus frequency
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# One panel per topic: wide translucent bars show the word count (left axis),
# narrow solid bars show the keyword weight (twin right axis).
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    topic_rows = df.loc[df.topic_id==i, :]
    panel_color = cols[i]

    ax.bar(x='word', height="word_count", data=topic_rows, color=panel_color, width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=panel_color, width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=panel_color)
    ax.set_title('Topic: ' + str(i+1), color=panel_color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()

##### pyLDAvis

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_data_filepath = os.path.join('./results/ldavis_tuned_'+str(num_topics))

# Make sure the output folder exists; otherwise the pickle and HTML writes
# below fail on a fresh checkout.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

# Preparing the visualization is a bit time consuming; set the condition to
# False to skip it and reuse the data already stored on disk.
if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# NOTE: pickle.load can execute arbitrary code — only load files written by
# this notebook, never a pickle obtained from an untrusted source.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_tuned_'+ str(num_topics) +'.html')

LDAvis_prepared