In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
# Step 1: read the scraped Google reviews JSON file into memory
with open("../data/google_reviews.json", mode="r", encoding="utf-8") as reviews_file:
    data = json.loads(reviews_file.read())

In [4]:
# Build a DataFrame from the parsed JSON payload and preview its first rows
df_data = pd.DataFrame(data)
df_data.head()

Unnamed: 0,review_id,user_name,rating,date,review_text
0,Ci9DQUlRQUNvZENodHljRjlvT205MVoySk1jbE5uVEVaeV...,Gemilang Cahaya kencana,2 stars,3 months ago,Kursi dan beberapa permainan perlu diberikan p...
1,Ci9DQUlRQUNvZENodHljRjlvT2tSSU9ESXphVlJWWm5jMm...,Ahmad Yusuf,5 stars,2 weeks ago,Tempatnya sangat ramai ketika di hari libur se...
2,Ci9DQUlRQUNvZENodHljRjlvT21zNVYyNHRUREEyVnpGZm...,andik atmaja,5 stars,3 weeks ago,Alun-alun Lumajang dengan fasilitas yang relat...
3,Ci9DQUlRQUNvZENodHljRjlvT21SRGQzRXhaa1UwVlU1Ul...,Dian ella,5 stars,a week ago,"Alun alun lumajang,dengan semua fasilitas yang..."
4,Ci9DQUlRQUNvZENodHljRjlvT2xoclJrRnJOMHRuWVRsVW...,Silfi Silf,4 stars,5 months ago,"First time nyobain ke Alun alun lumajang Gara""..."


In [5]:
# Show rows that repeat an earlier (user_name, review_text) pair; an empty frame means no duplicates
df_data[df_data.duplicated(subset=['user_name', 'review_text'], keep='first')]

Unnamed: 0,review_id,user_name,rating,date,review_text


In [6]:
# Remove the reviewer names from the working frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df_data = df_data.drop(columns=['user_name'], errors='ignore')

In [7]:
# Inspect column dtypes and non-null counts of the working frame
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    4908 non-null   object
 1   rating       4908 non-null   object
 2   date         4908 non-null   object
 3   review_text  4908 non-null   object
dtypes: object(4)
memory usage: 153.5+ KB


In [8]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import re

def parse_relative_date(text, now=None):
    """Convert a relative date label such as "3 months ago" into an absolute datetime.

    The label is searched (not just matched at its start) for
    "<a|an|N> <unit>", so prefixed labels like "Edited 2 years ago" are
    handled too. Supported units: year, month, week, day, hour, minute, second.

    Parameters
    ----------
    text : str
        Relative date label. Matching is case-insensitive.
    now : datetime, optional
        Reference instant the offset is subtracted from. Defaults to
        ``datetime.now()`` evaluated at call time. Pass one fixed value
        (ideally the scrape time) when converting a whole column so every row
        shares the same reference and the result is reproducible.

    Returns
    -------
    datetime
        ``now`` minus the parsed offset. If ``text`` is not a string or no
        known pattern is found, ``now`` itself is returned (best-effort
        fallback kept from the original behaviour).
    """
    if now is None:
        now = datetime.now()

    # Missing / non-string values cannot be parsed; treat them like any other
    # unrecognised label instead of crashing on .lower().
    if not isinstance(text, str):
        return now

    match = re.search(
        r'\b(an?|\d+)\s+(year|month|week|day|hour|minute|second)s?\b',
        text.lower(),
    )
    if match is None:
        # fallback: return now when the format is not recognised
        return now

    count_token, unit = match.groups()
    n = 1 if count_token in ('a', 'an') else int(count_token)

    # Plural keywords (years=, months=, ...) make DateOffset a relative,
    # calendar-aware shift (e.g. Mar 31 minus one month clamps to end of Feb).
    shifted = now - pd.DateOffset(**{unit + 's': n})
    # Hand back a plain datetime, matching the original return type.
    return shifted.to_pydatetime()


In [9]:
# Resolve every relative date label to an approximate absolute timestamp
df_data['date_parsed'] = df_data['date'].map(parse_relative_date)
# Keep the calendar year of that timestamp as its own column
df_data['year'] = df_data['date_parsed'].dt.year

In [10]:
# Turn labels such as "5 stars" into the integer 5.
df_data['rating'] = (
    df_data['rating']
    .astype(str)                          # lets the cell be re-run once the column is already int
    .str.extract(r'(\d+)', expand=False)  # take the digits; expand=False yields a Series, not a 1-column frame
    .astype(int)
)

In [11]:
# Re-check dtypes after converting rating and adding the date_parsed / year columns
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    4908 non-null   object        
 1   rating       4908 non-null   int64         
 2   date         4908 non-null   object        
 3   review_text  4908 non-null   object        
 4   date_parsed  4908 non-null   datetime64[ns]
 5   year         4908 non-null   int32         
dtypes: datetime64[ns](1), int32(1), int64(1), object(3)
memory usage: 211.0+ KB


In [12]:
# Summary statistics for the numeric and datetime columns
df_data.describe()

Unnamed: 0,rating,date_parsed,year
count,4908.0,4908,4908.0
mean,4.611247,2021-09-05 02:26:03.773991936,2021.610636
min,1.0,2014-01-18 08:03:10.681247,2014.0
25%,4.0,2020-01-18 08:03:10.587581952,2020.0
50%,5.0,2021-01-18 08:03:10.691682048,2021.0
75%,5.0,2024-01-18 08:03:10.649445120,2024.0
max,5.0,2026-01-18 08:03:10.739630,2026.0
std,0.747833,,2.615068


In [13]:
# Persist the cleaned reviews as CSV; index=False leaves the row index out of the file
df_data.to_csv("../data/reviews_clean.csv", index=False)


In [14]:
# List the column names of the cleaned frame
df_data.columns

Index(['review_id', 'rating', 'date', 'review_text', 'date_parsed', 'year'], dtype='object')

In [15]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create a Sastrawi stemmer and try it on a sample word
stemmer = StemmerFactory().create_stemmer()
stemmer.stem("perawatan")
# NOTE(review): the expected root is "rawat", but the recorded output of this cell is 'awat' — the stemmer appears to over-strip this word; verify before relying on it


'awat'