In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
import json
import requests
from collections import Counter
from PIL import Image
from matplotlib.ticker import FuncFormatter
from tqdm import tqdm
#------------------------------------- 
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib
from scipy import stats 
import folium
from folium.plugins import FastMarkerCluster, Fullscreen, MiniMap, HeatMap, HeatMapWithTime, LocateControl

In [3]:
# Directory holding the raw Olist CSV exports (relative to the notebook).
path = '../data/'

# Read each source table into its own DataFrame.
df_item = pd.read_csv(f"{path}olist_order_items_dataset.csv")
df_reviews = pd.read_csv(f"{path}olist_order_reviews_dataset.csv")
df_orders = pd.read_csv(f"{path}olist_orders_dataset.csv")
df_products = pd.read_csv(f"{path}olist_products_dataset.csv")
df_geolocation = pd.read_csv(f"{path}olist_geolocation_dataset.csv")
df_sellers = pd.read_csv(f"{path}olist_sellers_dataset.csv")
df_order_pay = pd.read_csv(f"{path}olist_order_payments_dataset.csv")
df_customers = pd.read_csv(f"{path}olist_customers_dataset.csv")
df_category = pd.read_csv(f"{path}product_category_name_translation.csv")

In [3]:
# Every loaded table collected in one list (order unchanged).
df_list = [
    df_customers,
    df_item,
    df_order_pay,
    df_reviews,
    df_orders,
    df_products,
    df_sellers,
    df_geolocation,
    df_category,
]

In [4]:
# Build one wide table starting from the orders table.
# Order items are attached with a left join; every later table is attached with an outer join.
# validate='m:m' is kept as written; per the pandas docs it does not trigger any uniqueness check.
df_train = (
    df_orders
    .merge(df_item, on='order_id', how='left')
    .merge(df_order_pay, on='order_id', how='outer', validate='m:m')
    .merge(df_reviews, on='order_id', how='outer')
    .merge(df_products, on='product_id', how='outer')
    .merge(df_customers, on='customer_id', how='outer')
    .merge(df_sellers, on='seller_id', how='outer')
)

In [30]:
# Add an empty placeholder column for English review text to the merged table.
# NOTE(review): in the cells visible here df_train is not used again; the translation is
# carried out on comment_df instead — confirm whether this column is still needed.
df_train['review_comment_message_en'] = None

In [2]:
# Load the training table from CSV; its review_comment_message column is what gets translated below.
# NOTE(review): presumably an export of the merged df_train built earlier — confirm where train.csv comes from.
train_df = pd.read_csv("../data/train.csv")

In [3]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the mBART-50 many-to-many translation checkpoint (used here for Portuguese -> English).
# The checkpoint name is defined once and reused so the model and tokenizer always come
# from the same checkpoint.
model_name = 'facebook/mbart-large-50-many-to-many-mmt'

model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Tell the tokenizer which source language the inputs are written in (Portuguese).
tokenizer.src_lang = "pt_XX"


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [3]:
# # Portuguese sentence to translate

# text_pt = "Olá, como você está?"

# # Tokenize the text
# tokens = tokenizer(text_pt, return_tensors="pt")

# # Run the translation
# translated = model.generate(**tokens, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])

# # Decode the translated text
# translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# # Print the result
# print(translated_text)


['Hi, how are you?']


In [4]:
def translate_to_english(text_pt):
    """Translate a single Portuguese text to English with the loaded mBART model.

    Relies on the notebook-level ``tokenizer`` (whose ``src_lang`` is set before
    this is called) and ``model`` objects.

    Parameters
    ----------
    text_pt : str or None
        Text to translate. Missing values (None / NaN) are accepted.

    Returns
    -------
    str or None
        ``None`` for missing input, the input itself when it is empty or
        whitespace-only, otherwise the first decoded output sequence.
    """
    if pd.isnull(text_pt):  # missing value: nothing to translate
        return None
    if isinstance(text_pt, str) and not text_pt.strip():
        # Blank text has no content to translate; skip the model call, which would
        # otherwise generate output from an effectively empty prompt.
        return text_pt
    # Truncate to the tokenizer's maximum length so an unusually long input cannot
    # exceed the model's position limit.
    tokens = tokenizer(text_pt, return_tensors="pt", truncation=True)
    # Keep the inputs on the same device as the model (no-op when both are on CPU).
    tokens = tokens.to(model.device)
    # Forcing the first generated token to the English language code selects the target language.
    translated = model.generate(**tokens, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_text[0]

In [6]:
# Keep only the review text, as a one-column DataFrame.
comment_df = train_df['review_comment_message'].to_frame()

In [26]:
# (rows, columns) of the review-text frame.
comment_df.shape

(119143, 1)

In [7]:
# Discard rows with a missing review message (defaults spelled out explicitly).
comment_df = comment_df.dropna(axis=0, how='any')

In [8]:
# (rows, columns) of the review-text frame, checked after the dropna cell.
comment_df.shape

(48905, 1)

In [9]:
# Translate each distinct message only once: keep the first occurrence of every duplicate row.
comment_df = comment_df.drop_duplicates(keep='first')

In [10]:
# (rows, columns) of the review-text frame, checked after the drop_duplicates cell.
comment_df.shape

(35175, 1)

In [11]:
# Create the target column up front; the translation cell that follows assigns its real values.
comment_df['review_comment_message_en'] = None

In [12]:
# Register tqdm's progress_apply on pandas objects so the translation loop shows a progress bar.
tqdm.pandas()

# Translate every remaining review message, one row at a time.
comment_df['review_comment_message_en'] = (
    comment_df['review_comment_message']
    .progress_apply(translate_to_english)
)

100%|██████████| 35175/35175 [87:26:23<00:00,  8.95s/it]    


In [14]:
# Inspect the translated column (pandas abbreviates long Series when displaying them).
comment_df['review_comment_message_en']

1         Black product, repacked in a white box, lots o...
5         WE'RE STILL EXPERIENCED WITH A RECOMMENDATION ...
6                                                Very good.
10        It arrives by the deadline. Very good product,...
11                                               excellent.
                                ...                        
116565    Thank you very much, a note 10 for your attent...
116566         I got it delivered on time. I appreciate it.
116570     That layer doesn't fit right. I can't change it.
116571    So he waited a long time for me to let him kno...
116572                                               Right.
Name: review_comment_message_en, Length: 35175, dtype: object

In [18]:
# Persist the original/translated review pairs; index=False leaves the DataFrame index out of the file.
comment_df.to_csv("../data/comment_translation.csv", index=False)