In [0]:
# Install the third-party packages this notebook imports below.
# NOTE(review): only googletrans is version-pinned; consider pinning nltk,
# langdetect and demoji as well so reruns resolve the same versions.
%pip install nltk
%pip install langdetect
%pip install googletrans==4.0.0-rc1
%pip install demoji

In [0]:
import nltk

In [0]:
# Fetch the NLTK resources used later in this notebook: the 'punkt' and
# 'punkt_tab' tokenizer data and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [0]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, current_timestamp, lower
from pyspark.sql.types import StructType, IntegerType
import pandas as pd
from langdetect import detect, LangDetectException
from googletrans import Translator
import demoji
import re


In [0]:
# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("playstore_load")
    .getOrCreate()
)

In [0]:
# Load the bronze reviews table and print its schema for inspection.
bronze_table_name = "workspace.growth_poc.bronze_playstore_reviews"
bronze_df = spark.table(bronze_table_name)
bronze_df.printSchema()

In [0]:
# Enforce the schema: cast the count/score columns to integers and parse the
# date columns into timestamps. All other columns are passed through as-is.
int_columns = ['score', 'thumbsUpCount']
datetime_columns = ['at', 'repliedAt']

# Column name -> replacement expression, integer casts first, then timestamps.
column_casts = {name: col(name).cast(IntegerType()) for name in int_columns}
column_casts.update({name: to_timestamp(col(name)) for name in datetime_columns})

schema_enforced_df = bronze_df
for column_name, cast_expr in column_casts.items():
    schema_enforced_df = schema_enforced_df.withColumn(column_name, cast_expr)


In [0]:
def remove_urls(text):
    """Return ``text`` with every URL removed.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Each match is replaced by the
    empty string, so the whitespace around it is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


In [0]:
# Convert the Spark DataFrame to a pandas DataFrame to perform the row-level content split.
# NOTE(review): toPandas() materializes the whole table in driver memory —
# confirm the reviews table stays small enough for that.
pd_df = schema_enforced_df.toPandas()

In [0]:
# Build the silver-layer rows: strip emoji and URLs from each review, detect its
# language, translate non-English reviews to English, then tokenize the text.
# Korean reviews and reviews that cannot be processed are left out of new_rows.
from langdetect import DetectorFactory

# langdetect is non-deterministic by default; a fixed seed keeps reruns reproducible.
DetectorFactory.seed = 0

translator = Translator()
stop_words = set(stopwords.words('english'))

new_rows = []
skipped_undetectable = 0  # non-text content, or no language could be detected
skipped_failed = 0        # translation or tokenization raised
first_failure = None      # kept so the summary can show what went wrong
for _, row in pd_df.iterrows():
    raw_content = row["content"]
    # The cleaning helpers expect a str; a null review would otherwise raise
    # before any error handling and abort the whole loop, so skip it instead.
    if not isinstance(raw_content, str):
        skipped_undetectable += 1
        continue

    content_to_detect = remove_urls(demoji.replace(raw_content, ""))

    try:
        detected_language = detect(content_to_detect)
    except LangDetectException:
        # Nothing detectable is left (for example an emoji-only review).
        skipped_undetectable += 1
        continue

    # filter out Korean reviews to focus on foreigners' experience
    if detected_language == "ko":
        continue

    try:
        if detected_language == "en":
            # review was written in English, skip translation
            written_language = "en"
            content_translated = content_to_detect.lower()
        else:
            # translate reviews into English for easier processing
            trans_result = translator.translate(content_to_detect, dest="en")
            written_language = trans_result.src
            content_translated = trans_result.text.lower()
        # split content into sentences and word tokens
        sentences = sent_tokenize(content_translated)
        words = word_tokenize(content_translated)
    except Exception as exc:
        # Best effort: one failed translation must not abort the batch. Unlike
        # a bare `except:`, this lets KeyboardInterrupt through and records
        # the failure so dropped rows are visible in the summary below.
        skipped_failed += 1
        if first_failure is None:
            first_failure = f"{type(exc).__name__}: {exc}"
        continue

    filtered_words = [w for w in words if w not in stop_words]
    new_rows.append(
        {
            "appName": row["appName"],
            "appVersion": row["appVersion"],
            "at": row["at"],
            "content": raw_content,
            "content_translated": content_translated,
            "sentences": sentences,
            "words": filtered_words,
            "language": written_language,
            "repliedAt": row["repliedAt"],
            "replyContent": row["replyContent"],
            "reviewCreatedVersion": row["reviewCreatedVersion"],
            "reviewId": row["reviewId"],
            "score": row["score"],
            "thumbsUpCount": row["thumbsUpCount"],
            "userImage": row["userImage"],
            "userName": row["userName"],
        })

# Report how many reviews were dropped so silent data loss is visible.
print(
    f"kept {len(new_rows)} reviews; "
    f"skipped {skipped_undetectable} with no detectable language, "
    f"{skipped_failed} that failed translation/tokenization"
)
if first_failure is not None:
    print(f"first failure: {first_failure}")

        

In [0]:
# Turn the collected rows back into a Spark DataFrame and stamp each row with
# the current timestamp in a "timestamp" column.
tokenized_df = pd.DataFrame(new_rows)
spark_df = (
    spark.createDataFrame(tokenized_df)
    .withColumn("timestamp", current_timestamp())
)

In [0]:
# Overwrite the silver table with the processed reviews, stored as Delta with
# the mergeSchema write option enabled.
silver_writer = (
    spark_df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("workspace.growth_poc.silver_playstore_reviews")