**Preprocess Data**
*   Remove NaN values from the reviews
*   Remove rows containing non-review text
*   Split reviews into sentences

In [None]:
import pandas as pd
import os

# Import dataset

In [None]:
# Load the translated-reviews CSV into `df`. The path is under /content/drive,
# presumably a mounted Google Drive in Colab — confirm the mount exists before running.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-4-23_translatedUlasanGemini.csv')

# Remove NaN values from translate

In [None]:
# Print the frame summary, keep only rows that have a translation, then print the
# summary again so the number of removed rows is visible.
df.info()
has_translation = df['translate'].notna()
df = df[has_translation]
df.info()
# Detach the filtered rows from the original frame before further edits.
df = df.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2849 entries, 0 to 2848
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Location     2849 non-null   object 
 1   AccountName  2842 non-null   object 
 2   ReviewCount  2849 non-null   float64
 3   ulasan       2838 non-null   object 
 4   translate    2569 non-null   object 
 5   Row_Num      2849 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 133.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2569 entries, 0 to 2848
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Location     2569 non-null   object 
 1   AccountName  2563 non-null   object 
 2   ReviewCount  2569 non-null   float64
 3   ulasan       2566 non-null   object 
 4   translate    2569 non-null   object 
 5   Row_Num      2569 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 140

# Remove non-review text

In [None]:
# Keep only rows whose translation does NOT contain "translate" or "google".
# .copy() makes the filtered result an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning (the dropna cells in this
# notebook follow the same filter-then-copy pattern).
# NOTE(review): the pattern is case-sensitive, so text containing "Google" or
# "Translate" with a capital letter is kept — confirm this is intended.
df = df[~df['translate'].str.contains('translate|google')].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2479 entries, 0 to 2848
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Location     2479 non-null   object 
 1   AccountName  2474 non-null   object 
 2   ReviewCount  2479 non-null   float64
 3   ulasan       2476 non-null   object 
 4   translate    2479 non-null   object 
 5   Row_Num      2479 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 135.6+ KB


In [None]:
# Add an empty 'good_translate' column. It is sent to the model as a placeholder in
# each JSON batch and is filled in later from the saved JSON results.
df['good_translate'] = None

# Repair translation grammar using Google Gemini API
**Checkpoint 3**
*   JSON saved in tes_good_translate

In [None]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render ``text`` as a Markdown block quote.

  Bullet characters ('•') are turned into Markdown list markers and every line,
  including blank ones, is prefixed with '> '.
  """
  with_list_markers = text.replace('•', '  *')
  quoted = textwrap.indent(with_list_markers, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Used to securely store your API key
from google.colab import userdata

In [None]:
# Configure the Gemini client with the key stored under 'GOOGLE_API_KEY' in the
# Colab user secrets, so the key never appears in the notebook itself.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

In [None]:
# Print the name of every available model that lists 'generateContent' among its
# supported generation methods.
content_model_names = (
  candidate.name
  for candidate in genai.list_models()
  if 'generateContent' in candidate.supported_generation_methods
)
for model_name in content_model_names:
  print(model_name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
# One entry per harm category, each with its blocking threshold set to BLOCK_NONE.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [None]:
# Create the Gemini model handle for 'gemini-1.0-pro-latest', passing it the
# `safety_settings` list defined earlier in the notebook.
model = genai.GenerativeModel(model_name='gemini-1.0-pro-latest',
                              safety_settings=safety_settings)

In [None]:
import time
import json

file_num = 0
batch = 10
max_attempts = 3    # how many times one batch is sent before it is given up on
pause_seconds = 10  # pause after every API call, successful or not


def _extract_json_array(raw_text):
  """Return the outermost ``[...]`` span found in ``raw_text``.

  The reply may be wrapped in a Markdown code fence that carries a language tag
  (for example the word json right after the opening backticks). Stripping
  backticks alone would leave that tag in front of the payload and make
  ``json.loads`` fail, so everything outside the array brackets is discarded.

  Raises:
    ValueError: if ``raw_text`` contains no ``[...]`` span.
  """
  start = raw_text.find('[')
  end = raw_text.rfind(']')
  if start == -1 or end < start:
    raise ValueError("model response does not contain a JSON array")
  return raw_text[start:end + 1]


for i in range(0, len(df), batch):
  # Take the next `batch` rows of machine-translated text ('translate' column)
  df_batch = df[['Row_Num', 'translate', 'good_translate']].iloc[i:i+batch]
  # Convert to JSON (one record per row)
  json_data = df_batch.to_json(orient='records')

  prompt = f"""
  You are an expert linguist, who is good at checking and repairing the grammar in a text.
  Help me to repair sentences provided between three backticks.
  Please repair with good grammar, do not use contraction.
  In your output, only return the JSON code as output - which is provided between three backticks.
  If there are a quotation mark or anything symbol may cause error in the JSON code in a sentence, please remove it from the output JSON.
  Your task is to update good_translate labels under 'good_translate' in the JSON code.
  Error handling instruction: In case a sentence violates API policy, please assign it with the original text.
  Don't make any changes to the JSON code format, please.

  ```
  {json_data}
  ```
  """

  # The numeric prefix is the batch number; the skipped-batch check relies on it.
  file_path = f'/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_good_translate/{file_num}_df_{i}_{i+batch}.json'

  saved = False
  for attempt in range(1, max_attempts + 1):
    try:
      # The request, the text accessor and the parsing all live inside the try so
      # that one bad reply (blocked, empty or malformed) cannot abort the whole run.
      response = model.generate_content(prompt)
      data = json.loads(_extract_json_array(response.text))

      # The merge step reads these two keys from every record; reject replies
      # that dropped them so the batch is retried instead of saved broken.
      if not all(isinstance(record, dict) and {'Row_Num', 'good_translate'} <= record.keys()
                 for record in data):
        raise ValueError("response records are missing 'Row_Num' or 'good_translate'")

      print(data)
      # Save result JSON
      with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file)
      saved = True
    except Exception as err:  # not a bare except: Ctrl-C still stops the loop
      print(f"Attempt {attempt}/{max_attempts} failed for batch {file_num}: {err!r}")

    # Pause after failures too, otherwise consecutive errors hammer the API.
    time.sleep(pause_seconds)
    if saved:
      break

  if saved:
    print("JSON data saved to", file_path)
  else:
    print("Passed: ", file_num)
  print("------------------------------------------------------------------------------------")

  file_num += 1



[{'Row_Num': 0, 'translate': 'during Ramazan and Eid, the services should have been faster one cashier was closed, and the other customers squatted since they had to wait for over 20 minutes Please improve the services in other branches It is not because there were many customers, but the input process was slow Thank you', 'good_translate': 'During Ramadan and Eid, the services should have been faster. One cashier was closed, and the other customers squatted since they had to wait for over 20 minutes. Please improve the services in other branches. It is not because there were many customers, but the input process was slow. Thank you.'}, {'Row_Num': 1, 'translate': 'I have been a regular customer here I am disappointed with the service by the female employee ES since the bread cashier was closed My wife had to queue again at the self service cashier The point is when the customer complained, the staff got upset, not smiling, being rude, and not saying sorry They should be trained on how

In [None]:
# List every entry in the batch-output directory (uses `os` from the import cell)
files = os.listdir('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_good_translate')

# Collect the batch number encoded as the filename prefix ("<num>_df_<start>_<end>.json").
# Entries that do not follow that pattern are ignored instead of crashing int().
numbers = sorted({
    int(filename.split('_')[0])
    for filename in files
    if filename.endswith('.json') and filename.split('_')[0].isdecimal()
})

# Batch numbers start at 0, so every number from 0 up to the highest one found
# should be present. Comparing against the full range reports ALL missing numbers,
# including runs of consecutive failures and a missing first batch; checking only
# neighbouring pairs would report just the first number of each gap.
# Limitation: batches that failed after the highest saved number cannot be seen here.
present_numbers = set(numbers)
highest_number = numbers[-1] if numbers else -1
skipped_numbers = [
    number for number in range(highest_number + 1)
    if number not in present_numbers
]

if skipped_numbers:
    print("Skipped batch detected:", skipped_numbers)
else:
    print("No skipped batch detected.")


Skipped batch detected: [34, 101, 113, 123, 128, 199]


In [None]:
# Directory holding the per-batch JSON files that are merged back into `df`.
json_dir = '/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_good_translate'

In [None]:
# Display the frame before the merge; 'good_translate' is still empty at this point.
df

Unnamed: 0,Location,AccountName,ReviewCount,ulasan,translate,Row_Num,good_translate
0,Manna Kampus (Mirota Kampus) Babarsari,Reni NuryyatiLocal Guide ·,382.0,saat ramadhan dan mendekati lebaran seperti in...,"during Ramazan and Eid, the services should ha...",0,
1,Manna Kampus (Mirota Kampus) Babarsari,Bagus Nandar,2.0,sudah langganan berbelanja di sini dari dulu t...,I have been a regular customer here I am disap...,1,
2,Manna Kampus (Mirota Kampus) Babarsari,SHiNBi CiPoNGLocal Guide ·,16.0,security yg bernama lukman dan namnung bad att...,the security guard named Lukman and Namnung ha...,2,
3,Manna Kampus (Mirota Kampus) Babarsari,zulfia ashifa,4.0,please dong area parkirnya di tambahin buat mo...,please add a parking area for cars I was surpr...,3,
4,Manna Kampus (Mirota Kampus) Babarsari,fadhila witaLocal Guide ·,12.0,super gerah bgt swalayannya udah tau dia rame ...,the store is very hot They should be aware tha...,4,
...,...,...,...,...,...,...,...
2844,Manna Kampus (Mirota Kampus) Mini Diro,Sari Yanto,3.0,komplit,Complete,4746,
2845,Manna Kampus (Mirota Kampus) Mini Diro,Pasha Alif NanditamaLocal Guide ·,334.0,manna kampus mini diro merupakan minimarket ya...,Manna Kampus Mini Dira is a minimarket that se...,4747,
2846,Manna Kampus (Mirota Kampus) Mini Diro,heri nuryanto,3.0,akhirnya tak perlu jauh jauh ke manna kampus j...,No need to rush to Manna Kampus Jl. Godean is ...,4748,
2847,Manna Kampus (Mirota Kampus) Mini Diro,Surya Edi Poernomo,2.0,pelayanannya sungguh memuaskan harganya masuk ...,"The service is really satisfying, the price is...",4749,


In [None]:
# Copy the repaired sentences from every saved JSON batch back into `df`,
# matching records to rows through 'Row_Num'.
batch_filenames = [name for name in os.listdir(json_dir) if name.endswith('.json')]

for batch_filename in batch_filenames:
    batch_path = os.path.join(json_dir, batch_filename)
    with open(batch_path, 'r') as batch_file:
        # Each file holds a list of records
        batch_records = json.load(batch_file)

    for record in batch_records:
        matching_rows = df['Row_Num'] == record['Row_Num']
        df.loc[matching_rows, 'good_translate'] = record['good_translate']

In [None]:
# Display the frame again to check that 'good_translate' was filled by the merge.
df

Unnamed: 0,Location,AccountName,ReviewCount,ulasan,translate,Row_Num,good_translate
0,Manna Kampus (Mirota Kampus) Babarsari,Reni NuryyatiLocal Guide ·,382.0,saat ramadhan dan mendekati lebaran seperti in...,"during Ramazan and Eid, the services should ha...",0,"During Ramadan and Eid, the services should ha..."
1,Manna Kampus (Mirota Kampus) Babarsari,Bagus Nandar,2.0,sudah langganan berbelanja di sini dari dulu t...,I have been a regular customer here I am disap...,1,I have been a regular customer here. I am disa...
2,Manna Kampus (Mirota Kampus) Babarsari,SHiNBi CiPoNGLocal Guide ·,16.0,security yg bernama lukman dan namnung bad att...,the security guard named Lukman and Namnung ha...,2,The security guard named Lukman and Namnung ha...
3,Manna Kampus (Mirota Kampus) Babarsari,zulfia ashifa,4.0,please dong area parkirnya di tambahin buat mo...,please add a parking area for cars I was surpr...,3,Please add a parking area for cars. I was surp...
4,Manna Kampus (Mirota Kampus) Babarsari,fadhila witaLocal Guide ·,12.0,super gerah bgt swalayannya udah tau dia rame ...,the store is very hot They should be aware tha...,4,The store is very hot. They should be aware th...
...,...,...,...,...,...,...,...
2844,Manna Kampus (Mirota Kampus) Mini Diro,Sari Yanto,3.0,komplit,Complete,4746,Complete.
2845,Manna Kampus (Mirota Kampus) Mini Diro,Pasha Alif NanditamaLocal Guide ·,334.0,manna kampus mini diro merupakan minimarket ya...,Manna Kampus Mini Dira is a minimarket that se...,4747,Manna Kampus Mini Dira is a minimarket that se...
2846,Manna Kampus (Mirota Kampus) Mini Diro,heri nuryanto,3.0,akhirnya tak perlu jauh jauh ke manna kampus j...,No need to rush to Manna Kampus Jl. Godean is ...,4748,No need to rush to Manna Kampus Jl. Godean is ...
2847,Manna Kampus (Mirota Kampus) Mini Diro,Surya Edi Poernomo,2.0,pelayanannya sungguh memuaskan harganya masuk ...,"The service is really satisfying, the price is...",4749,"The service is really satisfying, the price is..."


**Checkpoint 3**
*   Dataframe saved as 2024-4-24_repairTranslationGemini.csv

In [None]:
# Checkpoint: save the frame, including the repaired 'good_translate' column, as CSV.
# index=False leaves the row index out of the file.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-4-24_repairTranslationGemini.csv', index=False)

# Split reviews into sentences

In [None]:
# Reload the checkpoint CSV with the repaired translations; presumably this lets the
# sentence-splitting section start without repeating the Gemini step.
# The file was written without its index, so the reloaded frame gets a fresh 0..n-1 index.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-4-24_repairTranslationGemini.csv')

In [None]:
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer models if not already downloaded
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Add an empty 'split_sentence' column; the splitting loop stores one list of
# sentences per row in it.
df['split_sentence'] = None

In [None]:
# Print column dtypes and non-null counts before the sentence split.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2479 entries, 0 to 2848
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Location        2479 non-null   object 
 1   AccountName     2474 non-null   object 
 2   ReviewCount     2479 non-null   float64
 3   ulasan          2476 non-null   object 
 4   translate       2479 non-null   object 
 5   Row_Num         2479 non-null   int64  
 6   good_translate  2377 non-null   object 
 7   split_sentence  2377 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 238.8+ KB


In [None]:
# Split each repaired review into a list of sentences stored in 'split_sentence'.
# Rows without a usable 'good_translate' string (None/NaN) are reported and keep an
# empty 'split_sentence'. Missing text is detected explicitly rather than through a
# bare `except`, so real problems (for example a missing tokenizer resource or a
# keyboard interrupt) surface instead of being counted as "failed" rows.
for index, paragraph in df['good_translate'].items():
  if not isinstance(paragraph, str):
    print("Failed at: ", index)
    print(paragraph)
    print("-------------------------")
    continue

  # sent_tokenize already returns a list of sentence strings
  df.at[index, 'split_sentence'] = sent_tokenize(paragraph)

Failed at:  371
None
-------------------------
Failed at:  390
None
-------------------------
Failed at:  391
None
-------------------------
Failed at:  392
None
-------------------------
Failed at:  393
None
-------------------------
Failed at:  394
None
-------------------------
Failed at:  395
None
-------------------------
Failed at:  396
None
-------------------------
Failed at:  397
None
-------------------------
Failed at:  398
None
-------------------------
Failed at:  399
None
-------------------------
Failed at:  400
None
-------------------------
Failed at:  401
None
-------------------------
Failed at:  402
None
-------------------------
Failed at:  403
None
-------------------------
Failed at:  404
None
-------------------------
Failed at:  405
None
-------------------------
Failed at:  406
None
-------------------------
Failed at:  407
None
-------------------------
Failed at:  408
None
-------------------------
Failed at:  775
None
-------------------------
Failed at:  7

In [None]:
# Print the frame summary, keep only rows that received a sentence list, then print
# the summary again so the number of removed rows is visible.
df.info()
has_sentences = df['split_sentence'].notna()
df = df[has_sentences]
df.info()
# Detach the filtered rows from the original frame before further edits.
df = df.copy()

<class 'pandas.core.frame.DataFrame'>
Index: 2479 entries, 0 to 2848
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Location        2479 non-null   object 
 1   AccountName     2474 non-null   object 
 2   ReviewCount     2479 non-null   float64
 3   ulasan          2476 non-null   object 
 4   translate       2479 non-null   object 
 5   Row_Num         2479 non-null   int64  
 6   good_translate  2377 non-null   object 
 7   split_sentence  2377 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 238.8+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2377 entries, 0 to 2848
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Location        2377 non-null   object 
 1   AccountName     2372 non-null   object 
 2   ReviewCount     2377 non-null   float64
 3   ulasan          2374 non-null   object 
 4   translate

**Checkpoint 4**
*   Dataframe saved as 2024-4-24_preprocessedUlasan.csv

In [None]:
# Checkpoint: save the preprocessed frame as CSV, without the row index.
# NOTE(review): 'split_sentence' holds Python lists, which CSV stores as text —
# confirm how the consumer of this file parses that column back into lists.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-4-24_preprocessedUlasan.csv', index=False)