# Transform from Bronze to Silver

## Perform Silver Operations on bronze_reviews to get silver_reviews

### Read bronze_reviews Parquet

In [9]:
# Load the Bronze-layer reviews extract (Toronto, Ontario, Canada) from its Parquet file.
df_bronze_reviews = (
    spark.read
    .format("parquet")
    .load("Files/Bronze/Toronto_Ontario_Canada/bronze_reviews.parquet")
)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 11, Finished, Available)

### Import the types and functions

In [10]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 12, Finished, Available)

### Remove unwanted columns

In [11]:
# Start the Silver frame from only the five review columns kept downstream;
# every other Bronze column is dropped here.
df_silver_reviews = df_bronze_reviews.select(
    'listing_id',
    'id',
    'date',
    'reviewer_id',
    'comments',
)

# Preview the first five rows of the projection.
display(df_silver_reviews.limit(5))

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, abfc7149-7710-4d81-87ec-f61c7ce21e35)

### Change field data types

In [12]:
# Target Spark type for each Silver review column.
# NOTE(review): reviewer_id is cast to a 32-bit IntegerType while the other id
# columns use LongType — confirm reviewer ids always fit in that range.
review_coltype_map = {
    "listing_id": LongType(),
    "id": LongType(),
    "date": DateType(),
    "reviewer_id": IntegerType(),
    "comments": StringType(),
}

# Cast every mapped column in a single call; each column keeps its name and position.
df_silver_reviews = df_silver_reviews.withColumns({
    name: df_silver_reviews[name].cast(target_type)
    for name, target_type in review_coltype_map.items()
})

# Print the schema so the resulting types can be checked against the map above.
df_silver_reviews.printSchema()

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 14, Finished, Available)

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- comments: string (nullable = true)



### Check for nulls

In [13]:
# Show the rows (if any) where an identifier column is null — one result grid
# per column, in the order listing_id, id, reviewer_id.
for id_column in ('listing_id', 'id', 'reviewer_id'):
    rows_with_null = df_silver_reviews.filter(df_silver_reviews[id_column].isNull())
    display(rows_with_null)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 15, Finished, Available)

SynapseWidget(Synapse.DataFrame, d1518f63-5f51-49e6-9ea8-9fec746cf8c1)

SynapseWidget(Synapse.DataFrame, 8269051b-675b-42f4-96da-96157d54af42)

SynapseWidget(Synapse.DataFrame, 41f712f4-694f-40ca-8a0c-113b880eeeff)

### Rename columns

In [14]:
# List the current column names as a reference point before renaming.
df_silver_reviews.columns

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 16, Finished, Available)

['listing_id', 'id', 'date', 'reviewer_id', 'comments']

In [15]:
# Old name -> new name for the generic review fields; listing_id and
# reviewer_id are not in the map and keep their names.
review_column_renames = {
    'id': 'review_id',
    'date': 'review_date',
    'comments': 'review_comments',
}
df_silver_reviews = df_silver_reviews.withColumnsRenamed(review_column_renames)

# Echo the column list to confirm the rename took effect.
df_silver_reviews.columns

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 17, Finished, Available)

['listing_id', 'review_id', 'review_date', 'reviewer_id', 'review_comments']

### Trim review_comments column

In [16]:
# Strip leading and trailing spaces from every review comment.
trimmed_comments = trim(df_silver_reviews['review_comments'])
df_silver_reviews = df_silver_reviews.withColumn('review_comments', trimmed_comments)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 18, Finished, Available)

### Replace the `<br/>` tag with an empty string

In [17]:
# Before cleaning: preview up to five comments that contain the literal "<br/>" tag.
comments_with_br = df_silver_reviews.where(
    df_silver_reviews['review_comments'].contains("<br/>")
)
display(comments_with_br.select('review_comments').limit(5))

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 19, Finished, Available)

SynapseWidget(Synapse.DataFrame, 36ca7327-df17-4ae7-b8a8-9446d9022642)

In [18]:
# Remove every literal "<br/>" tag from review_comments, then trim the result.
# The column was trimmed before this step, so deleting a tag that sits at the
# start or end of a comment (e.g. "Great stay! <br/>") would otherwise leave
# leading/trailing spaces in the cleaned value.
df_silver_reviews = df_silver_reviews.withColumn(
    'review_comments',
    trim(regexp_replace('review_comments', '<br/>', '')),
)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 20, Finished, Available)

In [19]:
# After cleaning: the same check for the literal "<br/>" tag; an empty result
# means no comment contains it any more.
still_has_br = df_silver_reviews['review_comments'].contains("<br/>")
display(
    df_silver_reviews
    .where(still_has_br)
    .select('review_comments')
    .limit(5)
)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6314bf91-c56b-4925-bd31-8c5073adfe00)

### Load silver_reviews to Delta Table

In [20]:
# Final look at the first five cleaned rows before they are written out.
silver_preview = df_silver_reviews.limit(5)
display(silver_preview)

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, f0cc3730-9f86-4401-a41f-313347f7d693)

In [21]:
# Append the cleaned reviews to the Delta table stored at Tables/silver_reviews.
# NOTE(review): "append" adds these rows again each time this cell runs —
# confirm the notebook runs once per load or that duplicates are handled downstream.
silver_writer = df_silver_reviews.write.format("delta").mode("append")
silver_writer.save("Tables/silver_reviews")

StatementMeta(, 4ce83310-6cbe-4818-9a0b-8e8fb2069b10, 23, Finished, Available)