In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.colors as mcolors
import ast
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

from pprint import pprint
from tqdm import tqdm


from collections import Counter
from translate import Translator
from pysentimiento import create_analyzer
from geneticalgorithm2 import geneticalgorithm2 as ga
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities
from gensim.models import LdaMulticore, CoherenceModel
from gensim.utils import ClippedCorpus
from nltk.tokenize import word_tokenize
from yellowbrick.cluster.elbow import KElbowVisualizer

import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so this also hides deprecation and convergence warnings).
warnings.filterwarnings("ignore")

In [None]:
# Load the three input tables in one pass; the tuple order fixes which file
# ends up in which variable.
unstemmed, stemmed, ndata = map(
    pd.read_csv,
    (
        './data/unstemmed_REVISI.csv',
        './data/stemmed_REVISI.csv',
        './data/data_non_duplicate_tweets_REVISI.csv',
    ),
)

In [None]:
# def add_hashtag(hashtags):
#     return ["#" + hashtag for hashtag in ast.literal_eval(hashtags)]

def text_with_hashtag(texts, hashtags):
    """Append each row's hashtags to its text, separated by a single space.

    Parameters
    ----------
    texts : pandas.Series
        The text of every row.
    hashtags : iterable of str
        One Python-literal string per row (e.g. ``"['a', 'b']"``) that
        evaluates to an iterable of tag strings. Only the order of the items
        matters; any index carried by ``hashtags`` is ignored.

    Returns
    -------
    pandas.Series
        ``text + " " + "tag1 tag2 ..."`` for every row, carrying the index of
        ``texts``.

    Raises
    ------
    ValueError
        If ``texts`` and ``hashtags`` do not have the same number of rows.
    """
    joined = [" ".join(ast.literal_eval(raw)) for raw in hashtags]

    # Pair rows by position: give the joined tags the same index as `texts`.
    # A fresh 0..n-1 index would be aligned by label during the addition and
    # produce NaN for every row of a filtered (e.g. post-dropna) Series.
    joined_series = pd.Series(joined, index=getattr(texts, "index", None), dtype=object)

    return texts + " " + joined_series

In [None]:
# Every `hashtags` cell is parsed as a Python literal and its items are
# flattened into one space-separated string per row.
stemmed['hashtags_joined'] = stemmed['hashtags'].map(
    lambda raw: " ".join(ast.literal_eval(raw))
)
unstemmed['hashtags_joined'] = unstemmed['hashtags'].map(
    lambda raw: " ".join(ast.literal_eval(raw))
)

In [None]:
# Discard rows whose text is missing. NOTE(review): `stemmed` is not filtered
# here, so both frames are presumably already row-aligned - confirm.
unstemmed = unstemmed.dropna(subset=['text'])

In [None]:
# Row counts of both frames; they are combined column-by-column below.
stemmed.shape[0], unstemmed.shape[0]

In [None]:
# Working table for the analysis. Every column is handed over as a plain
# list, so rows are matched by position and the source indexes are ignored.
views = pd.DataFrame({
    'unstemmed_text': unstemmed.text.tolist(),
    'stemmed_text': stemmed.text.tolist(),
    'created_at': unstemmed.created_at.tolist(),
    'hashtags': stemmed.hashtags.tolist(),
    'users': unstemmed['users.username'].tolist(),
})

# TF-IDF

In [None]:
def generate_tfIdf(array):
    """Build TF-IDF representations and a term ranking for a list of documents.

    Parameters
    ----------
    array : sequence of str
        The documents. Its length is also used as the vocabulary cap
        (``max_features``) of both vectorizers.

    Returns
    -------
    tuple
        ``(tfs, terms, tfidf, tfidf_mat, ranking)`` where

        * ``tfs`` is the result of ``TfidfVectorizer.fit_transform(array)``,
        * ``terms`` are the feature names of that vectorizer,
        * ``tfidf`` is the fitted ``TfidfVectorizer`` itself,
        * ``tfidf_mat`` is a dense array: L1-normalised term counts times IDF,
        * ``ranking`` is a DataFrame with columns ``term`` and ``rank`` (the
          rounded column sums of ``tfidf_mat``), sorted by descending rank.
    """
    vocab_cap = len(array)

    # Raw term counts per document, L1-normalised row by row.
    counter = CountVectorizer(max_features=vocab_cap)
    relative_tf = normalize(counter.fit_transform(array), norm='l1', axis=1)

    # A second vectorizer supplies the (unsmoothed) IDF weights and the vocabulary.
    # NOTE(review): the product below relies on both vectorizers producing the
    # same columns in the same order - they are configured with the same cap.
    tfidf = TfidfVectorizer(max_features=vocab_cap, smooth_idf=False)
    tfs = tfidf.fit_transform(array)
    terms = tfidf.get_feature_names_out()

    # Multiply TF by IDF to obtain the dense TF-IDF matrix.
    tfidf_mat = relative_tf.multiply(tfidf.idf_).toarray()

    # Total weight of every term across all documents, rounded to whole numbers.
    column_totals = tfidf_mat.sum(axis=0)
    ranking = pd.DataFrame(
        [(term, np.round(column_totals[col])) for col, term in enumerate(terms)],
        columns=['term', 'rank'],
    )
    ranking = ranking.sort_values('rank', ascending=False).reset_index(drop=True)

    return tfs, terms, tfidf, tfidf_mat, ranking

In [None]:
# TF-IDF artefacts for the unstemmed tweet text.
(
    unstemmed_text_tfs,
    unstemmed_text_terms,
    unstemmed_text_tfidf,
    unstemmed_text_tfidf_mat,
    unstemmed_text_ranking,
) = generate_tfIdf(views['unstemmed_text'].tolist())

In [None]:
# TF-IDF artefacts for the stemmed tweet text.
(
    stemmed_text_tfs,
    stemmed_text_terms,
    stemmed_text_tfidf,
    stemmed_text_tfidf_mat,
    stemmed_text_ranking,
) = generate_tfIdf(views['stemmed_text'].tolist())

In [None]:
# TF-IDF artefacts for the hashtags column (still in its string form here).
(
    hashtag_tfs,
    hashtag_terms,
    hashtag_tfidf,
    hashtag_tfidf_mat,
    hashtag_ranking,
) = generate_tfIdf(views['hashtags'].tolist())

In [None]:
# Pairwise cosine distance (1 - cosine similarity) between the rows of the
# dense stemmed-text TF-IDF matrix; the result is square, one row/column per document.
stemmed_text_dist = 1 - cosine_similarity(stemmed_text_tfidf_mat)

In [None]:
# Pairwise cosine distance (1 - cosine similarity) between the rows of the
# dense unstemmed-text TF-IDF matrix; the result is square, one row/column per document.
unstemmed_text_dist = 1 - cosine_similarity(unstemmed_text_tfidf_mat)

In [None]:
# Pairwise cosine distance (1 - cosine similarity) between the rows of the
# dense hashtag TF-IDF matrix.
# NOTE(review): the name is spelled `hastag_dist` (missing "h"); left as is
# because cells outside this view may reference it.
hastag_dist = 1 - cosine_similarity(hashtag_tfidf_mat)

##### TF-IDF for Hashtags

In [None]:
# small sample

# TODO https://smyachenkov.com/posts/categorizing-instagram-tags-with-k-means/

##### Hashtags with the most appearances

In [None]:
# The 50 highest-ranked hashtag terms (the ranking is already sorted descending).
hashtag_ranking.head(50)

# Hashtags K-Means Clustering

In [None]:
# elbow method to define cluster

In [None]:
# Independent copy of the hashtag TF-IDF matrix for clustering, plus a dense
# DataFrame of the same values (one column per term) for the validation below.
tfs2 = hashtag_tfs.copy()
valData = pd.DataFrame(hashtag_tfs.toarray())

In [None]:
# Three clusters, one k-means++ initialisation, at most five iterations, fixed seed.
kmeans_model = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=1,
    max_iter=5,
    random_state=123,
    verbose=True,
)
kmeans_model.fit(tfs2)

# Store the cluster id of every row next to its features.
labels = kmeans_model.labels_.tolist()
valData['cluster'] = labels

# Column labels become strings (the feature columns were integer positions).
valData.columns = valData.columns.astype(str)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 5% of the rows; the cluster label is the target, everything else a feature.
X_train, X_test, y_train, y_test = train_test_split(
    valData.drop(columns="cluster"),
    valData["cluster"],
    test_size=0.05,
    random_state=47,
)

In [None]:
# Number of training rows per cluster label.
y_train.value_counts()

In [None]:
from sklearn.neighbors import NearestNeighbors

# Sanity check of the K-Means labels: every held-out row is compared with the
# cluster of its nearest neighbour in feature space.

# "id" is added to valData further down, so exclude it here as well;
# otherwise re-running this cell would feed the row counter into the distances.
valDataFeature = valData.drop(columns=["cluster", "id"], errors="ignore")

neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')
neigh.fit(valDataFeature)

# Positional row id, kept on valData for any later cell that reads it.
valData["id"] = list(range(valData.shape[0]))

# One batched query for all held-out rows. Column 0 of the result is normally
# the query row itself (distance 0), so column 1 is its closest *other* row.
# NOTE(review): with exact duplicate rows column 1 may be the query row itself.
neighbour_positions = neigh.kneighbors(
    valDataFeature.loc[X_test.index], return_distance=False
)
indexList = neighbour_positions[:, 1].tolist()

# kneighbors reports positions in the fitted data, so look clusters up by position.
P = valData["cluster"].to_numpy()[neighbour_positions[:, 1]].tolist()

# Pair each neighbour's cluster with the cluster of the SAME held-out row.
# Zipping against the full `labels` list would pair the neighbours with the
# first len(P) rows of the dataset instead of the sampled test rows.
test_clusters = valData.loc[X_test.index, "cluster"].tolist()
results = pd.DataFrame(list(zip(test_clusters, P)),
               columns =['Predicted_S', 'True_P'])

In [None]:
from sklearn import metrics
import seaborn as sns

In [None]:
# Share of rows whose two cluster columns agree.
metrics.accuracy_score(
    y_true=results["True_P"].to_numpy(),
    y_pred=results["Predicted_S"].to_numpy(),
)

In [None]:
# Frequency of each cluster label in the Predicted_S column.
results['Predicted_S'].value_counts()

In [None]:
# Pull both label columns out once and reuse them for the report and the matrix.
true_clusters = results["True_P"].to_numpy()
predicted_clusters = results["Predicted_S"].to_numpy()

print(metrics.classification_report(true_clusters, predicted_clusters))
cm = metrics.confusion_matrix(true_clusters, predicted_clusters)

# Annotated heatmap of the confusion matrix.
plt.figure(figsize=(14, 10))
sns.heatmap(cm, annot=True)

In [None]:
num_cluster_hashtag = 3

# Same settings as the validation model above: one k-means++ initialisation,
# at most five iterations, fixed seed.
km_hashtag = KMeans(
    n_clusters=num_cluster_hashtag,
    init='k-means++',
    n_init=1,
    max_iter=5,
    random_state=123,
    verbose=True,
).fit(hashtag_tfs)

# Cluster assignment of every training row, obtained through predict().
y_km = km_hashtag.predict(hashtag_tfs)

In [None]:
# Cluster id of every row that km_hashtag was fitted on, as a plain Python list.
# NOTE(review): a later cell rebinds `hashtag_clusters` to a list of
# cluster-number strings, so this cell must be re-run before the next one
# whenever the notebook is executed out of order.
hashtag_clusters = km_hashtag.labels_.tolist()

In [None]:
# Attach the cluster id to each row of `views`; this relies on the rows being in
# the same order as the documents given to the hashtag vectorizer (list(views.hashtags)).
views['hashtag_clusters'] = hashtag_clusters

In [None]:
hashtag_feature_name = hashtag_tfidf.get_feature_names_out()
hashtag_top_features = 30

# For every centroid: column indices ordered from largest to smallest weight.
hashtag_ordered_centroid = km_hashtag.cluster_centers_.argsort()[:, ::-1]

# Displayed cluster numbers start at 1.
hashtag_clusters = [str(cluster + 1) for cluster in range(num_cluster_hashtag)]

# The terms with the highest centroid weight in each cluster.
hashtag_key_features = [
    [
        hashtag_feature_name[index]
        for index in hashtag_ordered_centroid[cluster, :hashtag_top_features]
    ]
    for cluster in range(num_cluster_hashtag)
]

# The raw hashtag entries of the rows assigned to each cluster.
final_hashtags = [
    views[views['hashtag_clusters'] == cluster]['hashtags'].values.tolist()
    for cluster in range(num_cluster_hashtag)
]

In [None]:
# Number of rows assigned to each cluster.
final_hashtags_count = [len(ht) for ht in final_hashtags]

# Repeat each cluster number once per row of that cluster.
final_hashtags_clusters = [
    cluster_number
    for cluster_number, size in zip(hashtag_clusters, final_hashtags_count)
    for _ in range(size)
]

# Repeat each cluster's key-feature list once per row of that cluster.
final_hashtags_key_features = [
    key_terms
    for key_terms, size in zip(hashtag_key_features, final_hashtags_count)
    for _ in range(size)
]

# Flatten the per-cluster hashtag lists, keeping cluster order.
final_hashtags_1 = [entry for group in final_hashtags for entry in group]

In [None]:
# Report how many rows landed in each cluster (numbered from 1).
print("Tweets count each cluster: \n")
for position, cluster_size in enumerate(final_hashtags_count):
    print(f"Cluster {position+1}: {cluster_size}")

In [None]:
# Stack the three parallel lists as rows, then transpose so each becomes a column.
# NOTE(review): this rebinds `results`, which earlier held the nearest-neighbour
# validation table; the metric cells above cannot be re-run after this point.
results = pd.DataFrame([final_hashtags_clusters, final_hashtags_key_features, final_hashtags_1]).T

In [None]:
# Name the columns in the order the lists were stacked: cluster number,
# key-feature list, raw hashtag entry.
results.columns = ['cluster', 'key_features', 'hashtag']

In [None]:
# Distinct key-feature lists, compared through their string form
# (presumably because list objects cannot be hashed by unique()).
results.key_features.apply(str).unique()

In [None]:
# Parse the distinct key-feature strings back into Python lists.
results_2 = list(
    map(ast.literal_eval, results.key_features.apply(str).unique().tolist())
)

In [None]:
# results_2[1].pop(1)
# results_2[3].pop(0)
# results_2[3].pop(1)

In [None]:
# One line per distinct key-feature list, numbered from 1 in order of appearance.
for cluster_number, key_terms in enumerate(results_2, start=1):
    print(f"Cluster {cluster_number}: {' '.join(key_terms)}")