In [1]:
!nvidia-smi

Tue May 30 11:35:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.94       Driver Version: 516.94       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 45%   32C    P8    N/A /  75W |   1221MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Import Libary
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from matplotlib.ticker import MaxNLocator

import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from wordcloud import WordCloud

import gensim
from gensim import models
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases, CoherenceModel
import gensim.corpora as corpora

from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [3]:
# Create the pandas DataFrame
df = pd.DataFrame([ 
                   # ['Cukup bagus dan Tempatnya tertata rapi Karena dibuatkan Tempat untuk berfoto'],
                   ['lokasi curug indah sejuk baik kondisi jalan sempit becek hujan'],
                   ['saran hujan jalan curug licin becek ojek antar curug'],
                   ['tempat bagus akses curug jalan tanah hujan anjur'],
                   # ['Overall is good, cuma Lokasinya jauh.. Fasilitasnya masih harus dilengkapi lagi..'],
                   # ['Jalan menuju kesini sekarang sudah manusiawi. Jangan ragu buat bawa motor atau mobil'],
                   # ['Sangat bagus sekali alam air terjunnya, cuma sayang jarak dari tempat saya sangat jauh sekali kurang lebih 185 km'],
                   # ['Capenya perjalanan terbayarkan dg keindahan dan ke eksotisannya'],
                   # ['Sangat luar biasa ini tempat, Asri, sejuk dan wah dah. Bersama istri tercinta jalan2 ke sini.'],
                   # ['Tempatnya bagus, alam banget, buat camping juga cocok ada sungainya'],
                   # ['Perjuangan banget pokonya buat smpe ke curugnya, tp semua terbayarkan 😊 …'],
                   # ['curug nya banyak bisa ambil beberapa foto deh mantap'],
                   # ['Tempatnya bagus, cuman kurang petunjuk lokasi wisatanya'],
                  ], columns=['text'])

In [4]:
df['tokens'] = df['text'].apply(lambda x: word_tokenize(x))

In [5]:
texts = df['tokens']

In [6]:
# Initiate Corpus
id2word = corpora.Dictionary(texts)
# id2word.filter_extremes(no_below=5, no_above=0.2)

# texts = df['tokens']
# corpus = [id2word.doc2bow(text) for text in texts]

In [7]:
# BoW Corpus
corpus = [id2word.doc2bow(text) for text in texts]

In [8]:
# TF-IDF Corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [11]:
corpus_tfidf[0]

[(0, 0.40369167389095173),
 (1, 0.1489905855640844),
 (4, 0.40369167389095173),
 (6, 0.40369167389095173),
 (7, 0.40369167389095173),
 (8, 0.40369167389095173),
 (9, 0.40369167389095173)]

In [10]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(1, 1), (2, 2), (3, 1), (5, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(2, 1), (3, 1), (5, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]

In [11]:
coherence_values = 0

model = LdaModel( #LdaModel
            corpus=corpus, id2word=id2word, num_topics=2, random_state=42, iterations=1, alpha=1, eta=1
            # chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, 
            # decay=0.5, offset=1.0, eval_every=10, gamma_threshold=0.001
            )

coherence_model = CoherenceModel(
                    model=model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_values = coherence_model.get_coherence()

In [12]:
print(coherence_values)

0.2799258811857321


In [13]:
x = model.show_topics(num_topics=2, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

In [14]:
# Below Code Prints Topics and Words
for topic, words in topics_words:
    print(f'Topic {topic}: {words}')

Topic 0: ['curug', 'hujan', 'jalan', 'becek', 'ojek', 'saran', 'licin', 'baik', 'indah', 'tanah']
Topic 1: ['curug', 'jalan', 'hujan', 'becek', 'sempit', 'bagus', 'kondisi', 'akses', 'antar', 'anjur']


In [15]:
# Below Code Prints Topics and Words with probability
for idx, topic in model.print_topics(-1):
    print(f'Topic: {idx}, Word: {topic}')

Topic: 0, Word: 0.098*"curug" + 0.080*"hujan" + 0.076*"jalan" + 0.059*"becek" + 0.049*"ojek" + 0.048*"saran" + 0.047*"licin" + 0.047*"baik" + 0.047*"indah" + 0.046*"tanah"
Topic: 1, Word: 0.087*"curug" + 0.078*"jalan" + 0.073*"hujan" + 0.064*"becek" + 0.049*"sempit" + 0.048*"bagus" + 0.048*"kondisi" + 0.048*"akses" + 0.048*"antar" + 0.048*"anjur"


In [None]:
D = list(df['tokens'])

In [None]:
from random import randint

D_topics = []
for i in range(len(D)):
    d_topic = []
    for j in range(len(D[i])):
        d_topic.append(randint(1,2))
    D_topics.append(d_topic)

In [None]:
D

In [None]:
D_topics

In [None]:
w_all = []
for i in range(len(D)):
    for j in range(len(D[i])):
        w_all.append(D[i][j])

w_all = list(dict.fromkeys(w_all))

In [None]:
# Create the pandas DataFrame
prob_word_each_topic = pd.DataFrame([[word] for word in w_all], columns=['text'])

In [None]:
topic_all = []
for word_idx in range(len(prob_word_each_topic)):
    topic_1 = 0
    topic_2 = 0
    # topic_3 = 0
    for doc in range(len(D)):
        for word_in_doc in range(len(D[doc])):
            if prob_word_each_topic['text'][word_idx] == D[doc][word_in_doc]:
                if D_topics[doc][word_in_doc] == 1:
                    topic_1 += 1
                elif D_topics[doc][word_in_doc] == 2:
                    topic_2 += 1
                # elif D_topics[doc][word_in_doc] == 3:
                #     topic_3 += 1
    topic_all.append([topic_1, topic_2])

In [None]:
# prob_word_each_topic_2 = prob_word_each_topic.append(topic_all, columns=['topic_1', 'topic_2'])

In [None]:
topic_all

In [None]:
topic_1_sum = 0
topic_2_sum = 0
for value_topic in topic_all:
    topic_1_sum += value_topic[0]
    topic_2_sum += value_topic[1]

In [None]:
topic_1_sum, topic_2_sum

In [None]:
array1 = np.array([[word] for word in w_all])
array2 = np.array(topic_all)
joined = np.concatenate((array1, array2), axis=1)

prob_word_each_topic = pd.DataFrame(joined, columns=['text', 'topic_1', 'topic_2'])

In [None]:
prob_word_each_topic

In [None]:
prob_topic_each_word = []
for d in D_topics:
    topic_1 = 0
    topic_2 = 0
    for topic_value_idx in range(len(d)):
        if d[topic_value_idx] == 1:
            topic_1 += 1
        elif d[topic_value_idx] == 2:
            topic_2 += 1
    prob_topic_each_word.append([topic_1, topic_2])

In [None]:
prob_topic_each_word = pd.DataFrame(prob_topic_each_word, columns=['topic_1', 'topic_2'])
prob_topic_each_word

In [None]:
rawdocs = [
    'akses mudah tempat indah pisan seperti curug',
    'ada fasilitas outbound paintball pegawai cukup ramah',
    'tempat enak buat hiking harga makanan cukup terjangkau',
    'tempat bagus cocok buat healing keluarga banyak spot foto',
    'bagus untuk camping dan melihat sunrise tarif relatif murah',
]

In [None]:
docs = [doc.split(' ') for doc in rawdocs]

In [None]:
# unique words
from itertools import chain

vocabs = list(dict.fromkeys(chain.from_iterable(docs)))

In [None]:
# replace words in documents with wordIDs
docs_idx = []
for i in range(len(docs)):
    doc_idx = []
    for j in range(len(docs[i])):
        for vocab_idx in range(len(vocabs)):
            if vocabs[vocab_idx] == docs[i][j]:
                doc_idx.append(vocab_idx)
    docs_idx.append(doc_idx)

In [None]:
docs_idx

# TFIDF

In [14]:
import pandas as pd
import numpy as np

In [15]:
corpus = [
    'lokasi curug indah sejuk baik kondisi jalan sempit becek hujan',
    'saran hujan jalan curug licin becek ojek antar curug',
    'tempat bagus akses curug jalan tanah hujan anjur',
]

In [16]:
words_set = set()

for doc in  corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 19
The words in the corpus: 
 {'antar', 'akses', 'sejuk', 'curug', 'bagus', 'saran', 'ojek', 'anjur', 'hujan', 'sempit', 'tanah', 'indah', 'jalan', 'licin', 'tempat', 'becek', 'lokasi', 'kondisi', 'baik'}


In [18]:
n_docs = len(corpus)         #·Number of documents in the corpus
n_words_set = len(words_set) #·Number of unique words in the 

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)
# print(df_tf)
# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
df_tf

Unnamed: 0,antar,akses,sejuk,curug,bagus,saran,ojek,anjur,hujan,sempit,tanah,indah,jalan,licin,tempat,becek,lokasi,kondisi,baik
0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.0,0.1,0.1,0.1,0.1
1,0.111111,0.0,0.0,0.222222,0.0,0.111111,0.111111,0.0,0.111111,0.0,0.0,0.0,0.111111,0.111111,0.0,0.111111,0.0,0.0,0.0
2,0.0,0.125,0.0,0.125,0.125,0.0,0.0,0.125,0.125,0.0,0.125,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0


In [21]:
print("IDF of: ")

idf = {}

for w in words_set:
    k = 0    # number of documents in the corpus that contain this word
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k)
    
    print(f'{w:>15}: {idf[w]:>10}' )

IDF of: 
        kondisi: 0.47712125471966244
           baik: 0.47712125471966244
          jalan:        0.0
          becek: 0.17609125905568124
         sempit: 0.47712125471966244
          anjur: 0.47712125471966244
          akses: 0.47712125471966244
          licin: 0.47712125471966244
          curug:        0.0
          antar: 0.47712125471966244
           ojek: 0.47712125471966244
          indah: 0.47712125471966244
          sejuk: 0.47712125471966244
         tempat: 0.47712125471966244
          hujan:        0.0
         lokasi: 0.47712125471966244
          saran: 0.47712125471966244
          tanah: 0.47712125471966244
          bagus: 0.47712125471966244


In [22]:
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]
        
df_tf_idf

Unnamed: 0,kondisi,baik,jalan,becek,sempit,anjur,akses,licin,curug,antar,ojek,indah,sejuk,tempat,hujan,lokasi,saran,tanah,bagus
0,0.047712,0.047712,0.0,0.017609,0.047712,0.0,0.0,0.0,0.0,0.0,0.0,0.047712,0.047712,0.0,0.0,0.047712,0.0,0.0,0.0
1,0.0,0.0,0.0,0.019566,0.0,0.0,0.0,0.053013,0.0,0.053013,0.053013,0.0,0.0,0.0,0.0,0.0,0.053013,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.05964,0.05964,0.0,0.0,0.0,0.0,0.0,0.0,0.05964,0.0,0.0,0.0,0.05964,0.05964
