In [1]:
!nvidia-smi

Wed Jun 21 07:18:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.94       Driver Version: 516.94       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 45%   31C    P8    N/A /  75W |    680MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Import Libary
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from matplotlib.ticker import MaxNLocator

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from wordcloud import WordCloud

import gensim
from gensim import models
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases, CoherenceModel
import gensim.corpora as corpora

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Load Dataset

In [3]:
# Load Dataset
df = pd.read_csv("../dataset/gmaps_review_dataset_v2.csv")

In [4]:
# Show first 5 rows
df.head()

Unnamed: 0,id,PIC,datetime,status,scrapped_at,location,rating,text,aksesibilitas,fasilitas (akomodasi dan amenitas),aktivitas (atraksi dan aktivitas)
0,47d0cdd937754bd6b860f89b2bab1dbb,Fakhri,2022-05-15 11:58:43,Valid,2023-02-15 11:58:43,Curug Malela,4,Akses jalannya waktu itu masih sulit di jangka...,2,0,0
1,4804acd6c05e4f89b098e2ca35019419,Fakhri,2022-08-15 11:58:43,Valid,2023-02-15 11:58:43,Curug Malela,5,"Perjalanan yg bnr"" bikin Syahduu ,, dr Tempat ...",0,0,1
2,3eae265bf32a45eca31765a4145bc030,Fakhri,2022-03-15 11:58:43,Valid,2023-02-15 11:58:43,Curug Malela,5,"Minggu 13 februari 2022 ,\n\ngas santai pakai ...",1,2,1
3,61037dbdb7b14045be49d4494e95cf05,Fakhri,2022-05-15 11:58:44,Valid,2023-02-15 11:58:44,Curug Malela,5,7 mei 2022\nTouring bari mudik\nMntap perjalan...,1,0,1
4,a2c9e817e2b949c6880f971f43a11d2f,Fakhri,2022-08-15 11:58:44,Valid,2023-02-15 11:58:44,Curug Malela,5,Perjalanan touring motor dari bekasi melewati ...,0,0,1


In [5]:
# Rename column
df = df.rename(columns={'date status':'status'})

In [6]:
# Drop column
df = df.drop(columns=['PIC', 'scrapped_at', 'rating', 'aksesibilitas', 'fasilitas (akomodasi dan amenitas)', 'aktivitas (atraksi dan aktivitas)'])

In [7]:
# Show lenght and type of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13587 entries, 0 to 13586
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        13587 non-null  object
 1   datetime  13587 non-null  object
 2   status    13587 non-null  object
 3   location  13587 non-null  object
 4   text      13587 non-null  object
dtypes: object(5)
memory usage: 530.9+ KB


In [8]:
# Convert column type to datetime type
df['datetime'] = df['datetime'].apply(pd.to_datetime)

# Preprocessing

## Data Filtering

In [9]:
print(f'Length data before processing: {len(df)}')

Length data before processing: 13587


In [10]:
mask = (df['datetime'] >= '2019-1-1')
df = df.loc[mask]

In [11]:
df = df[df['location'] != 'Situ Lembang Dano']

In [12]:
df = df[df['status'] != 'Invalid']

In [13]:
df = df.drop_duplicates(subset=['text'])

In [14]:
print(f'Length data after processing: {len(df)}')

Length data after processing: 11647


## Data Cleaning

In [15]:
def remove_destination_name(text):
    keywords = list(map(lambda x: x.lower(), df['location'].unique()))
    keywords += [
        'curug tilu', 'kebun begonia', 'gunung putri', 'curug layung', 
        'curug pelangi', 'curug cimahi', 'curug sawer', 'sirtwo island',
        'floating market', 'tangkuban parahu', 'tangkuban perahu', 'orchid forest', 
        'ciwangun indah camp', 'cic', 'kawah ratu', 'situ ciburuy', 'curug malela',
        'kawah domas',
    ]
    for keyword in keywords:
        if keyword in text:
            text = re.sub(keyword, ' ', text)
    return text

In [16]:
def get_translate_text(text):
    keyword = ['(Diterjemahkan oleh Google)', '(Asli)']
    if keyword[0] in text and keyword[1] in text:
        text = text[text.find(keyword[0]) + len(keyword[0]):text.rfind(keyword[1])]
    elif keyword[0] in text and keyword[1] not in text:
        text = text[text.find(keyword[0]) + len(keyword[0]):]
    return text 

In [17]:
def clean_text(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r' ', text) # no emoji
    text = text.lower() #to lowercase
    text = re.sub('\n', ' ', text) #remove \n
    text = re.sub(r'[^\w\s]', ' ', text) #remove punctuation
    text = re.sub("\d+", "", text) #remove number
    text = re.sub(' +', ' ', text) #remove multiple whitespace
    
    if re.search(r'([a-zA-Z])\1{2,}', text): # Spell
        text = re.sub(r'([a-zA-Z])\1{2,}','\\1', text)
        
    return text

In [18]:
df['text_preprocess'] = df['text'].apply(remove_destination_name)
df['text_preprocess'] = df['text_preprocess'].apply(get_translate_text)
df['text_preprocess'] = df['text_preprocess'].apply(clean_text)

## Tokenizing

In [19]:
df['tokens'] = df['text_preprocess'].apply(lambda x: word_tokenize(x))

## Stopwords removal

In [20]:
# ''' Stopword by Sastrawi '''

# stop_words = StopWordRemoverFactory().get_stop_words()
# extend_stopword = [
#     'pas', 'yg', 'tp', 'rb', 'km', 'tp', 'dg', 'dr',
#     'bagu','bagus','bagusnya',
#     'banyaknya','banyakny','banyak',
#     'jlan','jalan',
#     'masuk','tempat',
#     'atas','sana','karna','sama','saya','sudah','udah','mungkin', 'tuji','jadi','saja','sini','kalo','kalau',
#     'untuk','semua','buat','bisa','cuma','sangat','buat','unutk','paling','cukup','naik','ribu','sekali',
#     'benar','kesini','turun','sera','sayangny','anjur','mending','dapat', 'punya','mana','sekitar',
#     'dapet','coba','bukan','sebut','sampe','tuju','hingga','beberapa','banget',
#     'utk','gak','adl','dll','coba','klo','nya','pas','per','bnr','udh','lua','aja','bgt','lalu','krn','mau',
# ]

# stop_words += extend_stopword

In [21]:
# ''' Stopword by NLTK '''

# stop_words = stopwords.words('indonesian')
# # stop_words = sorted(list(set(stopwords.words('indonesian')) - set(['jauh'])))
# # stop_words.extend([
# #     'pas', 'yg', 'tp', 'rb', 'km', 'tp', 'dg', 'dr', 'nya',
# #     'bagu','bagus','bagusnya',
# #     'banyaknya','banyakny','banyak',
# #     'jlan','jalan',
# #     'masuk','tempat',
# #     'atas','sana','karna','sama','saya','sudah','udah','mungkin', 'tuji','jadi','saja','sini','kalo','kalau',
# #     'untuk','semua','buat','bisa','cuma','sangat','buat','unutk','paling','cukup','naik','ribu','sekali',
# #     'benar','kesini','turun','sera','sayangny','anjur','mending','dapat', 'punya','mana','sekitar',
# #     'dapet','coba','bukan','sebut','sampe','tuju','hingga','beberapa','banget',
# #     'utk','gak','adl','dll','coba','klo','nya','pas','per','bnr','udh','lua','aja','bgt','lalu','krn','mau',
# # ])

In [22]:
# df['tokens'] = df['tokens'].apply(lambda x: [w for w in x if not w in stop_words])

## Stemming

In [23]:
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
 
# df['tokens'] = df['tokens'].apply(lambda x: stemmer.stem(' '.join(x)).split(' '))

## Additional Step

In [24]:
def reduction_character_less_than_n(text):
    new_text = []
    char_less_than_threshold = 2
    length_word = len(text)
    for word_i in range(length_word):
        if len(text[word_i]) > char_less_than_threshold:
            new_text += [text[word_i]]
    
    return new_text

In [25]:
df['tokens'] = df['tokens'].apply(reduction_character_less_than_n)

In [26]:
df['len_char'] = df['text_preprocess'].apply(lambda x: len(x))
df['len_word'] = df['text_preprocess'].apply(lambda x: len(x.split(' ')))

In [27]:
df = df.drop_duplicates(subset=['text_preprocess'])

In [28]:
len(df)

11555

In [29]:
df['len_token'] = df['tokens'].apply(lambda x: len(x))

In [30]:
df = df[df['len_token'] > 2]
df = df[df['len_token'] <= 300]

In [31]:
print(f'Length data after processing: {len(df)}')

Length data after processing: 11064


# Export Dataset

In [32]:
df.head()

Unnamed: 0,id,datetime,status,location,text,text_preprocess,tokens,len_char,len_word,len_token
0,47d0cdd937754bd6b860f89b2bab1dbb,2022-05-15 11:58:43,Valid,Curug Malela,Akses jalannya waktu itu masih sulit di jangka...,akses jalannya waktu itu masih sulit di jangka...,"[akses, jalannya, waktu, itu, masih, sulit, ja...",84,13,12
1,4804acd6c05e4f89b098e2ca35019419,2022-08-15 11:58:43,Valid,Curug Malela,"Perjalanan yg bnr"" bikin Syahduu ,, dr Tempat ...",perjalanan yg bnr bikin syahduu dr tempat park...,"[perjalanan, bnr, bikin, syahduu, tempat, park...",110,19,15
3,61037dbdb7b14045be49d4494e95cf05,2022-05-15 11:58:44,Valid,Curug Malela,7 mei 2022\nTouring bari mudik\nMntap perjalan...,mei touring bari mudik mntap perjalanan sungg...,"[mei, touring, bari, mudik, mntap, perjalanan,...",162,25,20
4,a2c9e817e2b949c6880f971f43a11d2f,2022-08-15 11:58:44,Valid,Curug Malela,Perjalanan touring motor dari bekasi melewati ...,perjalanan touring motor dari bekasi melewati ...,"[perjalanan, touring, motor, dari, bekasi, mel...",152,23,22
5,772bd029c2ce450b94b6fc6f524fbc05,2022-09-15 11:58:44,Valid,Curug Malela,Nyampe 17:30 lnjut kebawah jam 18:00 mantap su...,nyampe lnjut kebawah jam mantap suasana safar,"[nyampe, lnjut, kebawah, jam, mantap, suasana,...",46,8,7


In [33]:
# Drop column
df = df.drop(columns=['datetime', 'status', 'location', 'text'])

In [34]:
df.to_csv('../dataset/dataset_preprocess_without_stopwords_stemming.csv', index=False)