# Prepare the library

In [1]:
import pandas as pd
import numpy as np
import re
import glob
from unicodedata import normalize
import xgboost as xgb

from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Load dataset

In [4]:
csv_data = glob.glob("./dataset" + "/*.xlsx")
csv_list = (pd.read_excel(i, usecols=['Tweet', 'Topic']) for i in csv_data)
data = pd.concat(csv_list, ignore_index=True)

In [5]:
data.Topic.value_counts()

booster               1000
vaksin                1000
omicron               1000
protokol kesehatan    1000
test covid            1000
ppkm_kemendagri       1000
Name: Topic, dtype: int64

In [None]:
#data.to_csv("/content/drive/MyDrive/kemkes_dataset/all_tweet_dataset_kemkes.csv")

# Cleansing tweet

In [6]:
%%time
#remove emoji
def clean_emot(tweet):
  emoji = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002500-\U00002BEF"  # chinese char
                        u"\U00002702-\U000027B0"
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        u"\U0001f926-\U0001f937"
                        u"\U00010000-\U0010ffff"
                        u"\u2640-\u2642"
                        u"\u2600-\u2B55"
                        u"\u200d"
                        u"\u23cf"
                        u"\u23e9"
                        u"\u231a"
                        u"\ufe0f"  # dingbats
                        u"\u3030"
                        "]+", flags=re.UNICODE)
  tweet = re.sub(emoji,'', tweet)
  return tweet

def clean_text(tweet):
  #remove non ascii
  tweet = re.sub(r'[^\x00-\x7f]', ' ', tweet)
  #remove punctuation
  tweet = re.sub('[^a-zA-Z]', ' ', tweet)
  #remove digit + special char
  tweet = re.sub('(\\d|\\W)+', ' ', tweet)
   
  #Clean links if any
  link_pattern1 = re.compile(r'http(s)?\:\/\/[a-zA-Z.\/0-9#]+')
  tweet         = re.sub(link_pattern1, '', tweet)

  #Clean links if any - 2
  link_pattern2 = re.compile(r'(www)\.[a-zA-Z0-9\/\\.]+\.(com|co.id|net)?')
  tweet         = re.sub(link_pattern2, '', tweet)    

  #Space
  #Multiple Space
  tweet = re.sub(r'(\s)\1', r'\1', tweet)

  #Space at the beginning of sentences
  tweet = tweet.lstrip()
  tweet = tweet.rstrip()
  tweet = tweet.strip()
  tweet = tweet.lower()
  return tweet

def clean_stopwords(tweet):
  txt_file = open('./dataset/combined_stop_words.txt', 'r')
  stopword = txt_file.read()
  stopwords = stopword.split('\n')
  txt_file.close()

  #tokenize text
  word = word_tokenize(tweet)
  clean_words = [w for w in word if w not in stopwords]
  clean_words = []

  for w in word:
    if w not in stopword:
      clean_words.append(w)
  return ' '.join(clean_words)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 12.9 µs


In [10]:
data['Tweet'] = data['Tweet'].astype("str")

In [11]:
%%time
data['Tweet'] = data['Tweet'].apply(lambda x: clean_emot(x))
data['Tweet'] = data['Tweet'].apply(lambda x: clean_text(x))
data['Tweet'] = data['Tweet'].apply(lambda x: clean_stopwords(x))

CPU times: user 6.3 s, sys: 650 ms, total: 6.95 s
Wall time: 15.4 s


In [12]:
data['Tweet']

0                        vaksin booster sakit banget gasi
1       bgr salty tp ngerasa booster gampang sakit bad...
2                                        gyudouye booster
3          firmacil iyaa mah gapunya mood booster pribadi
4          mudik pelabuhan sungai duku diwajibkan booster
                              ...                        
5995    ketua dewan perwakilan rakyat dpr puan maharan...
5996    ketua dpr puan maharani pemerintah membuktikan...
5997    puan pembuktian meyakinkan publik ragu menggun...
5998    dapet whatsapp temen australia indonesia pagi ...
5999    nrlnnn selamat pagi mohon maaf terbalaskan pad...
Name: Tweet, Length: 6000, dtype: object

# Feature Extraction text using tf-idf vectorizer

In [13]:
list(data['Tweet'])

['vaksin booster sakit banget gasi',
 'bgr salty tp ngerasa booster gampang sakit badan kek imunnya gampang lemah perasaan ngalamin',
 'gyudouye booster',
 'firmacil iyaa mah gapunya mood booster pribadi',
 'mudik pelabuhan sungai duku diwajibkan booster',
 'bilang maaf menghindar kalo paksaan videokan sahaja simple that berlakukan putusan paksaan vaksin nyali ngela',
 'syiar kapolres situbondo akbp dr andi sinjaya memghimbau takmir masjid jamaah masyarakat sukseskan program juta vaksinasi booster dukung mudik sehat divisihumaspolri humaspoldajatim',
 'kai punten kereta jarak vaksin booster thank you',
 'funisphysics selamat malam kak penumpang vaksin booster jarak wajib rapid test antigen pcr informasi lengkap syarat ketentuan perjalanan silakan cek tautan trims',
 'abis booster sakit',
 'arkian widi senengnyaa mudik sabar nih syarat booster',
 'cuyfess klo udh vaksin booster kyknya nder tpi klo sinovac pake pcr cmiiw',
 'chimoowy maaciii kak mood booster kahh',
 'dpt booster menggigi

In [28]:
def extract_text(tweet_train, tweet_test):
  vec = TfidfVectorizer(analyzer="word", ngram_range=(1,1))
  vec.fit(tweet_train)
  x = vec.transform(tweet_train).toarray()
  y = vec.transform(tweet_test).toarray()
  print(x.shape)
  print(y.shape)
  return x, y

#apply
list_tweet_train = list(data['Tweet'])

# create data test
data_test = ['abis vaksin gue langsung meriang, gue gamau di vaksin lagi']
list_tweet_test = list(data_test)

# Simple similarity sentence using cosine similarity

In [29]:
%%time
train_matrix, test_matrix = extract_text(list_tweet_train, list_tweet_test)
similarity= cosine_similarity(test_matrix, train_matrix)

(6000, 9990)
(1, 9990)
CPU times: user 648 ms, sys: 33.7 ms, total: 682 ms
Wall time: 686 ms


In [30]:
%%time
index_ = similarity.reshape(-1)
# finding index of 5 tweets that similar with data test
result = sorted(range(len(index_)), key=lambda x: index_[x])[-5:]

# print top 5 sentence that similar to data test
for i in result:
  print(data['Tweet'][i])

gue besok antigen
abis booster pusing bgt otw meriang lg
gue sampe capek ceritainnya astra gue karantina gue lantai malem malemnya gue kebangun denger grusak grusukin sampah gue cek paginya sampahnya rapi berserakan sedikitpun
gue blm vaksin booster
gamau swab
CPU times: user 10.6 ms, sys: 897 µs, total: 11.5 ms
Wall time: 10.7 ms
