In [None]:
# pip install googletrans==4.0.0-rc1 --timeout=120
# !pip install requests langdetect
!pip install argostranslate --timeout 100 # to translate the columns locally

In [1]:
import os
import findspark

# Set the Spark installation path
os.environ["SPARK_HOME"] = "/home/elham/hadoop/spark-3.5.5"  
os.environ["PYSPARK_PYTHON"] = "python3"  

# Initialize findspark
findspark.init()

In [2]:
# from googletrans import Translator
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import pandas as pd
import numpy as np

import argostranslate.package
import argostranslate.translate
from argostranslate.translate import translate, package

In [15]:
## To display notebook cell with horizontal scroll bar
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
# getting all the available translation packages
packages = argostranslate.package.get_available_packages()

# specifying the languages
package = next(p for p in packages if p.from_code == "pt" and p.to_code == "en")

download_path = package.download()
argostranslate.package.install_from_path(download_path)

In [4]:
# Replace with the path to your PostgreSQL JDBC jar file
jdbc_driver_path = "/home/elham/hadoop/spark-3.5.5/jars/postgresql-42.3.5.jar"

# starting spark session
spark = SparkSession.builder \
    .appName("Order Reviews ETL") \
    .config("spark.jars", jdbc_driver_path) \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.shuffle.partitions", "16") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

25/04/13 12:42:48 WARN Utils: Your hostname, DESKTOP-CKKDTOM resolves to a loopback address: 127.0.1.1; using 172.30.19.133 instead (on interface eth0)
25/04/13 12:42:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/04/13 12:42:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
jdbc_url = "jdbc:postgresql://172.30.16.1:5432/olist_ecommerce"
properties = {
    "user": "postgres",
    "password": "31012009",
    "driver": "org.postgresql.Driver"
}

In [6]:
df = spark.read.jdbc(url=jdbc_url, table="order_reviews", properties=properties)
df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: timestamp (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)



In [7]:
df.show(10)

                                                                                

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:00|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:00|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:00|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:00|   

In [8]:
df.count()

                                                                                

98410

In [9]:
df.columns

['review_id',
 'order_id',
 'review_score',
 'review_comment_title',
 'review_comment_message',
 'review_creation_date',
 'review_answer_timestamp']

In [10]:
# Renaming Columns
df = df.withColumnRenamed("review_comment_title", "review_title")
df = df.withColumnRenamed("review_comment_message", "review_message")

In [11]:
df.describe().show()

25/04/13 12:44:08 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------------+-----------------+--------------------+--------------------+
|summary|           review_id|            order_id|     review_score|        review_title|      review_message|
+-------+--------------------+--------------------+-----------------+--------------------+--------------------+
|  count|               98410|               98410|            98410|               11519|               40668|
|   mean|                NULL|                NULL|4.088801951021238|3.175483995866666...|   8.172413793103448|
| stddev|                NULL|                NULL|1.345757637797199| 5.63435692112386E11|  3.1175650470615843|
|    min|0001239bc1de2e33c...|00010242fe8c5a6d1...|                1|                    |                \r\n|
|    max|fffefe7a48d22f7b3...|fffe41c64501cc87c...|                5|                 🔟 |😡😡😡😡😡👎👎👎?...|
+-------+--------------------+--------------------+-----------------+--------------------+--------------------+



                                                                                

In [12]:
# checking the amount of null values in each column
df.select([sum(col(c).isNull().cast("int")).alias(c+"_null") for c in df.columns]).show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------+-------------+-----------------+-----------------+-------------------+-------------------------+----------------------------+
|review_id_null|order_id_null|review_score_null|review_title_null|review_message_null|review_creation_date_null|review_answer_timestamp_null|
+--------------+-------------+-----------------+-----------------+-------------------+-------------------------+----------------------------+
|             0|            0|                0|            86891|              57742|                        0|                           0|
+--------------+-------------+-----------------+-----------------+-------------------+-------------------------+----------------------------+



                                                                                

In [17]:
# replacing null values with placeholders
df = df.fillna({
    "review_title": "No Title",
    "review_message": "No Comment"
})

In [18]:
df.show()

+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|        review_title|      review_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|            No Title|          No Comment| 2018-01-18 00:00:00|    2018-01-18 21:46:00|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|            No Title|          No Comment| 2018-03-10 00:00:00|    2018-03-11 03:05:00|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|            No Title|          No Comment| 2018-02-17 00:00:00|    2018-02-18 14:36:00|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|            No Title|Recebi bem antes ...| 2017-04-21 00:00:00|    2017-04-21 22

                                                                                

In [19]:
# cleaning strings by making it all lowercase, removing extra spaces at the beginning and ending and replacing any extra spaces that might exist 
df = df.withColumn("review_title", lower(trim(regexp_replace("review_title", r'\s+', ' '))))
df = df.withColumn("review_message", lower(trim(regexp_replace("review_message", r'\s+', ' '))))

In [16]:
df.show()

+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|        review_title|      review_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|            no title|          no comment| 2018-01-18 00:00:00|    2018-01-18 21:46:00|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|            no title|          no comment| 2018-03-10 00:00:00|    2018-03-11 03:05:00|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|            no title|          no comment| 2018-02-17 00:00:00|    2018-02-18 14:36:00|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|            no title|recebi bem antes ...| 2017-04-21 00:00:00|    2017-04-21 22

In [20]:
# removing commas from string columns
for column, dtype in df.dtypes:
    if dtype == "string":
        df = df.withColumn(column, regexp_replace(col(column), ",", ""))

# Optional: Show cleaned data
df.show(truncate=False)

+--------------------------------+--------------------------------+------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+-----------------------+
|review_id                       |order_id                        |review_score|review_title          |review_message                                                                                                                                                                |review_creation_date|review_answer_timestamp|
+--------------------------------+--------------------------------+------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+-----------------------+
|7bc2406110b926393aa56f80a40

In [21]:
# adding the column review status derived from the review score column
df = df.withColumn(
    "review_status",
     when(col("review_score") >= 4, "Positive")
      .when(col("review_score") == 3, "Neutral")
      .otherwise("Negative")
)

In [22]:
df.select("review_id", "review_status").show()

+--------------------+-------------+
|           review_id|review_status|
+--------------------+-------------+
|7bc2406110b926393...|     Positive|
|80e641a11e56f04c1...|     Positive|
|228ce5500dc1d8e02...|     Positive|
|e64fb393e7b32834b...|     Positive|
|f7c4243c7fe1938f1...|     Positive|
|15197aa66ff4d0650...|     Negative|
|07f9bee5d1b850860...|     Positive|
|7c6400515c67679fb...|     Positive|
|a3f6f7f6f433de0ae...|     Positive|
|8670d52e15e00043a...|     Positive|
|c9cfd2d5ab5911836...|     Positive|
|96052551d87e5f62e...|     Positive|
|4b49719c8a200003f...|     Positive|
|23f75a37effc35d9a...|     Positive|
|9a0abbb668bafb95a...|      Neutral|
|3948b09f7c818e2d8...|     Positive|
|9314d6f9799f5bfba...|     Negative|
|8e15a274d95600fa1...|     Positive|
|fdbdb2629a7cde0f6...|      Neutral|
|373cbeecea8286a2b...|     Negative|
+--------------------+-------------+
only showing top 20 rows



In [26]:
df = df.withColumn(
    "SSC",
    lit(1)
)

In [27]:
df.show()

+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+-------------+---+
|           review_id|            order_id|review_score|        review_title|      review_message|review_creation_date|review_answer_timestamp|review_status|SSC|
+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----------------------+-------------+---+
|7bc2406110b926393...|73fc7af87114b3971...|           4|            no title|          no comment| 2018-01-18 00:00:00|    2018-01-18 21:46:00|     Positive|  1|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|            no title|          no comment| 2018-03-10 00:00:00|    2018-03-11 03:05:00|     Positive|  1|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|            no title|          no comment| 2018-02-17 00:00:00|    2018-02-18 14:36:00|     Positive|  1|
|e64fb393e7b32834b...|658677

                                                                                

In [28]:
# Loop through all columns and print distinct values
for column in df.columns:
    print(f"\nUnique values in column '{column}':")
    df.select(column).distinct().show(truncate=False)



Unique values in column 'review_id':


                                                                                

+--------------------------------+
|review_id                       |
+--------------------------------+
|228ce5500dc1d8e020d8d1322874b6f0|
|19a52a2297e5289ab4187d238d335982|
|c92cdd7dd544a01aa35137f901669cdf|
|b42460e68676135205af8b8c13baf6ae|
|c45811d9f90e22a81155b3a1e4a5c2e8|
|edfbad2944ea758725486c3bbcd04f49|
|46d8249ea59101c7288936c3e6f155d7|
|0dc56be1dabd04c994a0fce5478bb16d|
|072aa82842434b050889c89a33c699a7|
|57e994485c2ebd293e4fa9cd62e5bb96|
|ca21ff9bdc64ce15d2be8e09bbc5a115|
|06f47dacd0575e592bf6a17e922ce6cd|
|a89a2b0c2014bc49a0f87f1eedb7b269|
|a36150969fe6bd09ed011dd78aeb0189|
|3528296df9092b7a7b88bd036e3f8a8e|
|c239d18d7e310e477d5e7b76b362db1d|
|31a346e6f2cf783b145080e2f4dc6e32|
|f7eb8792e67d53e01a224bfa9b6802b2|
|76a2c91e6bf615fe58abe97b55a19ee1|
|11caa17c5966423a2f73ded6a13ad638|
+--------------------------------+
only showing top 20 rows


Unique values in column 'order_id':


                                                                                

+--------------------------------+
|order_id                        |
+--------------------------------+
|658677c97b385a9be170737859d3511b|
|c31a859e34e3adac22f376954e19b39d|
|84cec4f65c7a4f2f54c294a30224a594|
|366df9c0b0d5d46f8afc476dc4ca7671|
|a9a75c64cb5e9f7bc92e68ec13f05c45|
|a1341cb83bbf1b47392f4a3685d56bad|
|46d2d651f006b9b9440d1aa37d89c894|
|a6ab143f24e590dc7c72a62bc4eab3af|
|b8d7fee0274d75202bee63e9e71fec3f|
|201691efe89d4d71f36e5ea279a8759a|
|fc8c2e8bc069aea361c41c313464993c|
|15c8f741c0b7deafafea8af0f7bb6852|
|0bcbcc3e0a09b3f6a9495de35f452616|
|e239d280236cdd3c40cb2c033f681d1c|
|02bd92abe094825e0683bacaacb64285|
|d451da9b109e1786f303924d04ed72a1|
|51c7edabb9739b6998ee68efb7e10d31|
|a5a83c95ed669b7ba0ddce1d761c191f|
|1b82cfd1e6a6d682ee254b2ff4cf387f|
|c99763ba8bcad2845131b6e9d2f203fc|
+--------------------------------+
only showing top 20 rows


Unique values in column 'review_score':
+------------+
|review_score|
+------------+
|5           |
|1           |
|3           |
|2 

                                                                                

+------------------------+
|review_title            |
+------------------------+
|ótimo produto!          |
|recomendo sempre!!      |
|super satisfeita.       |
|produto incorreto       |
|recomendo!              |
|produto otimo!!         |
|produto e prazo         |
|maquina de tosa         |
|embalagem inadequada    |
|cortina para sala       |
|já solicitei a devolução|
|muito satisfeito        |
|produto veio sem peça   |
|carregador com defeito  |
|vc paga e não entregam  |
|avaliçao                |
|avarias no produto      |
|parabéns a equipe!      |
|recomendo!!!            |
|ótimo custo benefício   |
+------------------------+
only showing top 20 rows


Unique values in column 'review_message':


                                                                                

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|review_message                                                                                                                                                                             |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|chegou apenas uma peça mas na nota e na garantia constam as duas joias.                                                                                                                    |
|atendeu minha expectativa.                                                                                                                                                                 |
|o meu produto não chegou... segundo o rastreio es

                                                                                

+--------------------+
|review_creation_date|
+--------------------+
|2017-12-15 00:00:00 |
|2018-05-10 00:00:00 |
|2018-06-03 00:00:00 |
|2018-03-08 00:00:00 |
|2017-09-17 00:00:00 |
|2018-02-01 00:00:00 |
|2017-08-18 00:00:00 |
|2017-12-29 00:00:00 |
|2017-04-06 00:00:00 |
|2018-05-05 00:00:00 |
|2018-05-12 00:00:00 |
|2018-07-04 00:00:00 |
|2017-07-26 00:00:00 |
|2017-05-16 00:00:00 |
|2017-10-03 00:00:00 |
|2017-04-20 00:00:00 |
|2018-01-13 00:00:00 |
|2018-05-04 00:00:00 |
|2017-06-21 00:00:00 |
|2017-11-28 00:00:00 |
+--------------------+
only showing top 20 rows


Unique values in column 'review_answer_timestamp':
+-----------------------+
|review_answer_timestamp|
+-----------------------+
|2018-08-14 21:36:00    |
|2018-08-11 00:22:00    |
|2017-08-01 18:43:00    |
|2018-06-19 17:44:00    |
|2018-01-14 22:44:00    |
|2018-02-23 11:06:00    |
|2018-03-31 03:25:00    |
|2018-01-30 18:19:00    |
|2018-08-03 04:55:00    |
|2018-03-17 02:34:00    |
|2018-04-13 11:56:00    |
|2017-

In [54]:
# final cleaning data before translation
# List of string columns
string_columns = [field.name for field in df.schema.fields if field.dataType.simpleString() == 'string']

# Clean each string column
for col_name in string_columns:
    df = df.withColumn(
        col_name,
        regexp_replace(trim(lower(col(col_name))), r'["\n\r\t,]', '')  # removes ", \n, \r, \t, and ,
    )

df.show(truncate=False)


[Stage 74:>                                                         (0 + 1) / 1]

+--------------------------------+--------------------------------+------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+-----------------------+-------------+---+
|review_id                       |order_id                        |review_score|review_title          |review_message                                                                                                                                                                |review_creation_date|review_answer_timestamp|review_status|SSC|
+--------------------------------+--------------------------------+------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+-----------------

                                                                                

In [40]:
# adding the surrogate key
dim_review = df.select(
    monotonically_increasing_id().alias("review_sk"),
    "review_id",
    "order_id",
    "review_score",
    "review_title",
    "review_message",
    "review_status",
    "review_creation_date",
    "review_answer_timestamp",
    "SSC"
    
)

In [41]:
dim_review.show()

+---------+--------------------+--------------------+------------+--------------------+--------------------+-------------+--------------------+-----------------------+---+
|review_sk|           review_id|            order_id|review_score|        review_title|      review_message|review_status|review_creation_date|review_answer_timestamp|SSC|
+---------+--------------------+--------------------+------------+--------------------+--------------------+-------------+--------------------+-----------------------+---+
|        0|7bc2406110b926393...|73fc7af87114b3971...|           4|            no title|          no comment|     Positive| 2018-01-18 00:00:00|    2018-01-18 21:46:00|  1|
|        1|80e641a11e56f04c1...|a548910a1c6147796...|           5|            no title|          no comment|     Positive| 2018-03-10 00:00:00|    2018-03-11 03:05:00|  1|
|        2|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|            no title|          no comment|     Positive| 2018-02-17 00:00:

## Translating the review title and review comment columns to English in new columns:

In [43]:
translator_broadcast = spark.sparkContext.broadcast(translate)

In [44]:
@pandas_udf('string')
def translate_to_english(text_series: pd.Series) -> pd.Series:
    translate = translator_broadcast.value
    mask = ~text_series.isin(["", "no title", "no comment"]) & text_series.notna()
    text_series[mask] = np.vectorize(translate)(text_series[mask].astype(str), "pt", "en")
    return text_series

In [45]:
# filtering the dataframe
to_translate = df.filter(
    (col("review_title") != "no title") & 
    (col("review_message") != "no comment")
).repartition(16).cache() 

25/04/13 12:59:12 WARN CacheManager: Asked to cache already cached data.


In [46]:
to_translate.count()

9798

In [47]:
translated = to_translate.withColumn(
    "review_title_en", 
    translate_to_english(col("review_title"))
).withColumn(
    "review_message_en", 
    translate_to_english(col("review_message"))
)

In [48]:
translated.count()

9798

In [49]:
dim_review = dim_review.join(
    translated.select("review_id", "review_title_en", "review_message_en"),
    on="review_id",
    how="left"
).fillna({
    "review_title_en": "no title",
    "review_message_en": "no comment"
})

In [33]:
# rearranging columns
review_dim = dim_review.select(
    "review_sk",
    "review_id",
    "review_title",
    "review_title_en",
    "review_message",
    "review_message_en",
     "review_score",
    "review_status",
    "review_creation_date",
    "review_answer_timestamp",
    "SSC"
)

In [34]:
review_dim.printSchema()

root
 |-- review_sk: long (nullable = false)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = false)
 |-- review_title_en: string (nullable = false)
 |-- review_message: string (nullable = false)
 |-- review_message_en: string (nullable = false)
 |-- review_score: integer (nullable = true)
 |-- review_status: string (nullable = false)
 |-- review_creation_date: timestamp (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)
 |-- SSC: integer (nullable = false)



In [36]:
# saving the review_dim data in csv file
review_dim.write \
    .mode("overwrite") \
    .option("header", True) \
    .csv("output/review_dim.csv")

                                                                                