**Translating Raw Data**
- Preprocess the `ulasan` (review text) column
- Translate with Gemini API
- Save translated dataframe

In [None]:
# Mount Google Drive at /content/drive; the dataset and the JSON/CSV checkpoints
# used by the rest of the notebook live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import json

In [None]:
# Do not truncate long strings when displaying dataframes, so full reviews stay readable
pd.set_option('display.max_colwidth', None)

# Import Dataset
*   Raw dataset: 2024-3-25_cleanedUlasan.csv

In [None]:
# Load the raw review dataset from Google Drive
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-3-25_cleanedUlasan.csv")

In [None]:
# Preview the first two rows of the raw data
df.head(2)

Unnamed: 0.1,Unnamed: 0,Location,AccountName,ReviewCount,ulasan
0,1,Manna+Kampus+(Mirota+Kampus)+Babarsari,Reni NuryyatiLocal Guide ·,382.0,"Saat ramadhan dan mendekati lebaran seperti ini, seharusnya pelayanan lebih cepat.\rKassa satunya tutup. Pengunjung lainnya sampai pada jongkok krn sangat lama.\rSaya mengantri lbh dr 20 menit.\rMohon perbaiki lg pelayanannya. Di cabang yg lain tidak selama ini. Ini lama bukan krn belanjaan pengunjung banyak. Tp proses input ke mesin yang lama.\rTerima kasih."
1,2,Manna+Kampus+(Mirota+Kampus)+Babarsari,Bagus Nandar,2.0,"Sudah langganan berbelanja di sini dari dulu, tapi barusan kecewa sama pelayanan karyawati inisial ES, karena kasir bagian roti tutup, istri saya diharuskan ngantri lagi ke kasir swalayan,..\rpoin di sini adalah di saat pelanggan komplain..petugas malah sewot, nggak ada senyum, nggak ada ramah, nggak ada maaf, harus dididik lagi supaya bisa menanggapin komplain dengan baik, bukan dengan cara argumen dengan nada yang tinggi/marah²"


In [None]:
# Drop the leftover CSV index column(s) by name instead of by position, so that
# re-running this cell can never drop a real column such as 'Location'
unnamed_columns = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)
df.info()

# Create translate column (filled in later from the saved translation JSON files)
df['translate'] = None
# Remove Null in 'ulasan'
df = df.dropna(subset=['ulasan'])
# Remove rows only containing whitespace in 'ulasan'; copy so later column
# assignments work on an independent frame rather than a slice
df = df[df['ulasan'].str.strip() != ''].copy()
# Replace every carriage return / line feed in 'ulasan' with a space
df['ulasan'] = df['ulasan'].str.replace(r'[\r\n]', ' ', regex=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4760 entries, 0 to 4759
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Location     4760 non-null   object 
 1   AccountName  4746 non-null   object 
 2   ReviewCount  4752 non-null   float64
 3   ulasan       2852 non-null   object 
dtypes: float64(1), object(3)
memory usage: 148.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2849 entries, 0 to 4750
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Location     2849 non-null   object 
 1   AccountName  2842 non-null   object 
 2   ReviewCount  2849 non-null   float64
 3   ulasan       2849 non-null   object 
 4   translate    0 non-null      object 
dtypes: float64(1), object(4)
memory usage: 133.5+ KB


# Cleaning text data

In [None]:
# Remove emoji and unknown character from 'ulasan'
import re

# Compiled once instead of on every clean_text call
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "]+", flags=re.UNICODE)


def clean_text(text):
  """Normalize one review string for translation.

  Steps, in order: strip HTML tags, replace punctuation/symbols with spaces,
  drop stand-alone single ASCII letters, lowercase, collapse whitespace,
  trim, and remove any remaining emoji characters.

  Args:
    text: The value to clean. Non-string values (e.g. NaN or None) are
      returned unchanged.

  Returns:
    The cleaned string, or the original value when it is not a string.
  """
  if not isinstance(text, str):
    return text

  # Remove HTML tags. This must run before punctuation removal, otherwise the
  # angle brackets are already gone and the tag names leak into the text.
  # A tag has to start with a letter right after "<" or "</", so comparisons
  # such as "harga < 5" are left alone.
  text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)

  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  # Remove any emoji left over after the steps above
  text = _EMOJI_PATTERN.sub(r'', text)

  return text

In [None]:
# Apply clean_text function to every value in 'ulasan', overwriting the column
# with the cleaned text
df['ulasan'] = df['ulasan'].apply(clean_text)

In [None]:
# Change "+" to space in 'Location'
# regex=False: "+" is a regex quantifier, so it must be matched as a literal
# character regardless of the pandas version's default for `regex`
df['Location'] = df['Location'].str.replace('+', ' ', regex=False)

In [None]:
# Add row_number: copy the index labels into a regular column so each review
# carries an identifier through the JSON round trip; the saved translations are
# later written back to the dataframe by this value
df['Row_Num'] = df.index

In [None]:
# Preview the first two rows after cleaning
df.head(2)

Unnamed: 0,Location,AccountName,ReviewCount,ulasan,translate,Row_Num
0,Manna Kampus (Mirota Kampus) Babarsari,Reni NuryyatiLocal Guide ·,382.0,saat ramadhan dan mendekati lebaran seperti ini seharusnya pelayanan lebih cepat kassa satunya tutup pengunjung lainnya sampai pada jongkok krn sangat lama saya mengantri lbh dr 20 menit mohon perbaiki lg pelayanannya di cabang yg lain tidak selama ini ini lama bukan krn belanjaan pengunjung banyak tp proses input ke mesin yang lama terima kasih,,0
1,Manna Kampus (Mirota Kampus) Babarsari,Bagus Nandar,2.0,sudah langganan berbelanja di sini dari dulu tapi barusan kecewa sama pelayanan karyawati inisial es karena kasir bagian roti tutup istri saya diharuskan ngantri lagi ke kasir swalayan poin di sini adalah di saat pelanggan komplain petugas malah sewot nggak ada senyum nggak ada ramah nggak ada maaf harus dididik lagi supaya bisa menanggapin komplain dengan baik bukan dengan cara argumen dengan nada yang tinggi marah²,,1


# Translating 'ulasan' into English using Google Gemini API
**Checkpoint 1**
*   JSON saved in tes_translate

In [None]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap text in a Markdown block quote for notebook display.

  Bullet characters ('•') are rewritten as Markdown list markers and every
  line, including blank ones, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Used to securely store your API key
from google.colab import userdata

In [None]:
# Configure the Gemini client with the key stored under 'GOOGLE_API_KEY' in
# Colab's user data, so the key is never written into the notebook
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

In [None]:
# Print the name of every model that supports the 'generateContent' method
for m in genai.list_models():
  if 'generateContent' not in m.supported_generation_methods:
    continue
  print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
# One entry per harm category, each with the threshold set to BLOCK_NONE
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [None]:
# Create Gemini instance using 'gemini-1.0-pro-latest' model, passing the
# safety_settings list defined above
model = genai.GenerativeModel(model_name='gemini-1.0-pro-latest',
                              safety_settings=safety_settings)

In [None]:
import time
import json

file_num = 0
batch = 10


def _extract_json_array(raw_text):
  """Return the JSON array embedded in a model reply.

  The reply may be wrapped in a Markdown code fence (possibly tagged, e.g.
  ```json) or surrounded by extra text, so everything outside the outermost
  square brackets is discarded. When no array is found, the reply is returned
  with only the backticks stripped so that json.loads reports the failure.
  """
  start = raw_text.find('[')
  end = raw_text.rfind(']')
  if start == -1 or end < start:
    return raw_text.strip("`")
  return raw_text[start:end + 1]


# Translate the reviews `batch` rows at a time. Each successful batch is saved
# as its own JSON file (checkpoint); a failed batch is reported and skipped so
# that one bad reply does not abort the whole run.
for i in range(0, len(df), batch):
  # Get the next `batch` rows of 'ulasan'
  df_batch = df[['Row_Num', 'ulasan', 'translate']].iloc[i:i+batch]
  # Convert to JSON
  json_data = df_batch.to_json(orient='records')

  prompt = f"""
  You are an expert linguist, who is good at translating sentences from Indonesian to English.
  Help me translate sentences provided between three backticks.
  Please translate with good grammar.
  Please do not translate word by word.
  In your output, only return the JSON code as output - which is provided between three backticks.
  If there are a quotation mark or anything symbol may cause error in the JSON code in a sentence, please remove it from the output JSON.
  Your task is to update translate labels under 'translate' in the JSON code.
  Error handling instruction: In case a sentence violates API policy, please assign it as Translation Error.
  Don't make any changes to the JSON code format, please.

  ```
  {json_data}
  ```
  """

  try:
    # Generate result. The request and the `.text` access are inside the try
    # block because either can raise (e.g. API error, blocked or empty reply).
    response = model.generate_content(prompt)

    json_result = _extract_json_array(response.text)
    data = json.loads(json_result)
    if not isinstance(data, list):
      raise ValueError("expected a JSON array of records")
    print(data)
    # Save result JSON
    file_path = f'/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_translate/{file_num}_df_{i}_{i+batch}.json'
    json_string = json.dumps(data)

    with open(file_path, 'w') as file:
      file.write(json_string)

    print("JSON data saved to", file_path)
    print("------------------------------------------------------------------------------------")
  except Exception as err:
    # Best effort: report why the batch failed and move on. `Exception` (not a
    # bare except) keeps Ctrl-C / kernel interrupt working.
    print("Passed: ", file_num, "-", repr(err))
    print("------------------------------------------------------------------------------------")

  # The batch number advances whether or not the batch succeeded, so a failed
  # batch shows up as a gap in the saved file names.
  file_num += 1
  # Pause after every request, including failed ones, to stay under the API rate limit
  time.sleep(10)



[{'Row_Num': 0, 'ulasan': "during ramadan and Eid, the services should have been faster. one cashier was closed, and the other customers squatted since they had to wait for over 20 minutes. Please improve the services. It's not because there were many customers, but the input process was slow. Thank you.", 'translate': 'during Ramazan and Eid, the services should have been faster one cashier was closed, and the other customers squatted since they had to wait for over 20 minutes Please improve the services in other branches It is not because there were many customers, but the input process was slow Thank you'}, {'Row_Num': 1, 'ulasan': "I have been a regular customer here. I'm disappointed with the service by the female employee ES since the bread cashier was closed. My wife had to queue again at the self-service cashier. The point is when the customer complained, the staff got upset, not smiling, being rude, and not saying sorry. They should be trained on how to respond to complaints p

# Check for failed translation batches

In [None]:
# import os

# List all files in the translation checkpoint directory
files = os.listdir('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_translate')

# Extract the leading batch number from each '<number>_df_<start>_<end>.json'
# file name; anything else in the directory is ignored instead of crashing int()
numbers = sorted(
    int(filename.split('_')[0])
    for filename in files
    if filename.endswith('.json') and filename.split('_')[0].isdigit()
)

# Every batch number from 0 up to the highest saved one should have a file.
# Collect ALL missing numbers, so gaps of several consecutive failed batches
# and a failed batch 0 are reported too. Batches that failed after the last
# saved file cannot be detected from the file names alone.
saved_numbers = set(numbers)
highest_expected = numbers[-1] + 1 if numbers else 0
skipped_numbers = [n for n in range(highest_expected) if n not in saved_numbers]

if skipped_numbers:
    print("Skipped batch detected:", skipped_numbers)
else:
    print("No skipped batch detected.")


Skipped batch detected: [13, 38, 41, 50, 55, 80, 89, 92, 116, 165, 174, 200, 227, 242, 250, 262, 265, 281]


# Append translations to the dataframe

In [None]:
# Directory holding the per-batch translation JSON files
json_dir = '/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/tes_translate'

In [None]:
# Copy every saved translation back into df, matching records to rows by 'Row_Num'
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith('.json'):
        continue
    filepath = os.path.join(json_dir, filename)
    with open(filepath, 'r') as file:
        # Load JSON data
        json_data = json.load(file)
    for x in json_data:
        # The records were produced by the model, so validate them first:
        # assigning through df.at with a label that is not in the index would
        # append a brand-new, mostly empty row instead of failing.
        if (not isinstance(x, dict)
                or 'translate' not in x
                or x.get('Row_Num') not in df.index):
            print("Skipped unexpected record in", filename, ":", x)
            continue
        df.at[x['Row_Num'], 'translate'] = x['translate']

**Checkpoint 2**
*   Dataframe saved as 2024-4-23_translatedUlasanGemini.csv

In [None]:
# Save dataframe with translation as csv
# index=False: the index is not written; its labels were copied into 'Row_Num' earlier
df.to_csv('/content/drive/MyDrive/Colab Notebooks/2024-3-31_HRA/2024-4-23_translatedUlasanGemini.csv', index=False)