In [35]:
import pandas as pd
import numpy as np

In [36]:
# Locations of the three source CSV files
fp_en_cn = r'../dataset/Shopee_CN_to_EN.csv'
fp_en_es = r'../dataset/Amazon_en_to_es.csv'
fp_en_it = r'../dataset/target_en_to_it.csv'

# Load each file and discard fully duplicated rows in one chained step
df_cn = pd.read_csv(fp_en_cn).drop_duplicates()
df_es = pd.read_csv(fp_en_es).drop_duplicates()
df_it = pd.read_csv(fp_en_it).drop_duplicates()

In [37]:
# Report the (rows, columns) shape of each dataset
print(f'cn dataset size:{df_cn.shape}')
print(f'es dataset size:{df_es.shape}')
print(f'it dataset size:{df_it.shape}')

cn dataset size:(1000, 3)
es dataset size:(996, 2)
it dataset size:(873, 4)


In [38]:
#Dataset cleaning

import re
import unicodedata


def clean_text(text):
    """
    Cleans text while preserving accents in Latin-based languages and Chinese characters.

    Steps, in order:
    - Returns '' for missing values (None / float NaN); any other non-string
      value is converted with str()
    - Applies Unicode NFKC normalization (full-width and other compatibility
      forms become their plain equivalents)
    - Lowercases
    - Removes URLs, emails
    - Removes brackets, asterisks, and other special symbols
    - Drops every remaining character that is not a word character,
      whitespace or one of . , ; ! ?  (accents and Chinese characters are kept)
    - Normalizes spaces

    Args:
        text: The raw value to clean, normally a str.

    Returns:
        The cleaned string; '' when the input is missing.
    """
    # pandas stores empty CSV cells as float NaN, which has no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Normalize first so that every later step sees canonical characters.
    # Doing this after lowercasing / URL removal would let compatibility
    # characters slip through (e.g. U+2121 expands to uppercase "TEL", and a
    # full-width "www" is not recognised as a URL prefix).
    text = unicodedata.normalize('NFKC', text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs and emails
    text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+', '', text, flags=re.MULTILINE)

    # Remove unwanted special characters (but keep accents and Chinese)
    text = re.sub(r'[\[\]{}*<>|^$#@~`_=+\\]', '', text)  # Removes brackets, asterisks, etc.

    # Keep only alphanumeric characters, punctuation, spaces, and Chinese characters.
    # For str patterns \w is Unicode-aware, so accented letters and CJK
    # characters are already covered by it; the explicit list is kept as-is.
    text = re.sub(r'[^\w\s.,;!?áéíóúüñàèìòùçäöüßẞāēīōūǎěǐǒǔǖǘǚǜ一-龥]', '', text)

    # Replace multiple spaces/newlines with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [39]:
# Display the Spanish dataset as the cell output (pandas truncates long frames)
df_es

Unnamed: 0,title,title_spanish
0,brother genuine high yield toner cartridge tn4...,hermano genuino cartucho tóner de alto rendimi...
1,fitbit inspire 3 health and fitness tracker wi...,fitbit inspirar 3 seguimiento de salud y fitne...
2,mikes hot honey americas 1 brand of hot honey ...,mikes miel caliente américas 1 marca de miel c...
3,krema kréma red fruits 100 recyclable 240g,krema kréma frutos rojos 100 reciclables 240g
4,drsalts calming therapy epsom salts soothing ...,drsalts calmante terapia epsom sales calmantes...
...,...,...
995,ruimen smart watches for men women answermake ...,ruimen relojes inteligentes para hombres mujer...
996,musicozy sleep headphones bluetooth 54 headban...,auriculares musicozy sueño bluetooth 54 diadem...
997,sun ninja pop up beach tent sun shelter upf50 ...,sun ninja pop up playa refugio de sol upf50 co...
998,rhino usa trailer hitch pin 2 inch patented 58...,enganche de remolque de rinoceronte usa pin de...


In [40]:
# Run clean_text over both text columns of every dataset (cn, then es, then it)
for frame, text_columns in (
    (df_cn, ('translation_output', 'text')),
    (df_es, ('title', 'title_spanish')),
    (df_it, ('title', 'title_italian')),
):
    for column in text_columns:
        frame[column] = frame[column].apply(clean_text)

In [41]:
# Drop the two description columns from the Italian dataset
df_it = df_it.drop(columns=['product_description', 'productDescription_italian'])

# Give all three datasets the same source_text / target_text column names.
# Each frame is rebound to the renamed copy instead of being mutated with
# inplace=True, so no object is changed behind another reference's back.
df_cn = df_cn.rename(columns={'translation_output': 'source_text', 'text': 'target_text'})
df_es = df_es.rename(columns={'title': 'source_text', 'title_spanish': 'target_text'})
df_it = df_it.rename(columns={'title': 'source_text', 'title_italian': 'target_text'})

Given that each dataset is small (~1k entries), we almost certainly do not have enough data to train a transformer from scratch; this is a judgment call rather than a measured result. Hence, we will either use an LSTM or use the data to fine-tune a pretrained LLM.

In [42]:
from sklearn.model_selection import train_test_split

def split_data(data, val_size=0.1, test_size=0.1, random_state=42):
    """
    Split a dataset into train, validation and test subsets.

    The data is first split into train vs. hold-out, then the hold-out part
    is split into validation and test. With the default arguments this is the
    same 80/10/10 split (and the same seed) as before the parameters existed.

    Args:
        data: The dataset to split; passed straight to train_test_split.
        val_size: Fraction of the whole dataset used for validation.
        test_size: Fraction of the whole dataset used for testing.
        random_state: Seed forwarded to both train_test_split calls.

    Returns:
        A (train_data, val_data, test_data) tuple.

    Raises:
        ValueError: If a fraction is not positive or the two together leave
            nothing for training.
    """
    holdout_size = val_size + test_size
    if val_size <= 0 or test_size <= 0 or holdout_size >= 1:
        raise ValueError(
            "val_size and test_size must be positive and sum to less than 1"
        )

    train_data, temp_data = train_test_split(
        data, test_size=holdout_size, random_state=random_state
    )
    # Share of the hold-out set that goes to test; 0.1 / 0.2 == 0.5 by default
    val_data, test_data = train_test_split(
        temp_data, test_size=test_size / holdout_size, random_state=random_state
    )

    print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")
    return train_data, val_data, test_data

In [43]:
#Split data into train test and val sets
cn_train, cn_val, cn_test=split_data(df_cn)
es_train, es_val, es_test=split_data(df_es)
it_train, it_val, it_test=split_data(df_it)

Train: 800, Val: 100, Test: 100
Train: 796, Val: 100, Test: 100
Train: 698, Val: 87, Test: 88


In [44]:
# Output path -> split to write, listed in the order the files are written
split_outputs = {
    'en_cn_train.csv': cn_train,
    'en_cn_val.csv': cn_val,
    'en_cn_test.csv': cn_test,
    'en_es_train.csv': es_train,
    'en_es_val.csv': es_val,
    'en_es_test.csv': es_test,
    'en_it_train.csv': it_train,
    'en_it_val.csv': it_val,
    'en_it_test.csv': it_test,
}

# UTF-8 is also pandas' default encoding, so naming it for every file only
# makes the choice explicit; the row index is not written.
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, encoding="utf-8", index=False)

In [45]:
# Read the Chinese test split back from the CSV written above
# (presumably a sanity check that the Chinese text round-trips; confirm intent)
test=pd.read_csv('en_cn_test.csv')

In [46]:
# Display the re-loaded frame as the cell output
test

Unnamed: 0,source_text,target_text,split
0,estee lauder young 10 cream 15ml 2,estee lauder 雅詩蘭黛 年輕肌密無敵霜15ml2,private
1,macaron water sprayer 300ml 132671,馬卡龍噴水器300ml132671,private
2,nike ultimate flight ankle sock basketball soc...,nike 耐吉 ultimate flight ankle sock 籃球短襪 運動短襪 s...,private
3,custom small square gentleman loafers black an...,訂製款小方頭紳士樂福鞋黑灰白12020246,private
4,speed park parktool scw 16 hi drum with open w...,速度公園parktool scw16 專業型花鼓用開口扳手,private
...,...,...,...
95,gold bird essence 20ml,德妍思 金雀花精華液20ml,private
96,paris laya perfect uv conditioning protective ...,巴黎萊雅完美uv全效防護隔離乳液spf50膚,private
97,crystal ball cb rachel lace flower,crystal ball cb rachel lace flower,private
98,sanrio ring buckle phone holder hello kitty he...,sanrio三麗鷗 指環扣 手機支架hello kitty大頭系列,private
