In [0]:
%pip install langdetect
%pip install nltk 
%pip install googletrans==4.0.0-rc1
%pip install demoji
# The installs above provide the third-party packages this notebook imports:
# langdetect, nltk, googletrans and demoji.
# NOTE(review): only googletrans is pinned to a version; consider pinning the
# other three as well so a fresh run resolves the same releases.

In [0]:
import nltk

In [0]:
# Download the NLTK data packages this notebook relies on
# (tokenizer data and the stop-word corpus).
for resource_name in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource_name)

In [0]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lower
from datetime import datetime
import pandas as pd
from langdetect import detect, LangDetectException
from googletrans import Translator
import demoji
import re



In [0]:
# Get the existing Spark session or create one, labelled with this job's app name.
spark = (
    SparkSession.builder
    .appName("silver_reddit_load")
    .getOrCreate()
)

In [0]:
# Read the bronze Reddit table as a Spark DataFrame.
bronze_table_name = "workspace.growth_poc.bronze_reddit"
bronze_df = spark.table(bronze_table_name)

In [0]:
# Bring the bronze rows into pandas so langdetect can be run row by row.
# The original epoch value is kept under "created_datetime_unix"; a parsed
# datetime column (epoch seconds -> timestamp) is appended as "created_datetime".
pd_df = (
    bronze_df.toPandas()
    .rename(columns={"created_datetime": "created_datetime_unix"})
    .assign(
        created_datetime=lambda frame: pd.to_datetime(
            frame['created_datetime_unix'].astype(float), unit='s'
        )
    )
)

In [0]:
def remove_urls(text):
    """Strip URLs from ``text``.

    Removes ``http://`` / ``https://`` links and bare ``www.`` links; each
    match runs up to the next whitespace character.

    Args:
        text: The string to clean. Non-string values (for example ``None`` or
            a float ``NaN`` from an empty DataFrame cell) are treated as empty
            text instead of raising ``TypeError``.

    Returns:
        ``text`` with every matched URL deleted, or ``""`` for non-string input.
    """
    # re.sub only accepts strings; map anything else to empty text.
    if not isinstance(text, str):
        return ""
    # Regex pattern to match URL formats (http, https, www)
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)


In [0]:
def detect_language(text):
    """Guess the language ``text`` is written in.

    Emoji and URLs are stripped first, and the remaining text is passed to
    ``langdetect.detect``.

    Args:
        text: Raw content string. Non-string values (e.g. ``None``/``NaN``)
            are treated as undetectable.

    Returns:
        The language code returned by ``detect``, or ``"n/a"`` when the input
        is not a string, is blank after cleaning, or langdetect raises
        ``LangDetectException``.
    """
    # The cleaning calls below run outside the try block and expect a string,
    # so reject anything else up front instead of letting them raise.
    if not isinstance(text, str):
        return "n/a"
    text_to_detect = remove_urls(demoji.replace(text, ""))
    if not text_to_detect.strip():
        return "n/a"
    try:
        return detect(text_to_detect)
    # if text is not detectable, put n/a
    # (only langdetect's own error is caught, so Ctrl-C and genuine bugs still surface)
    except LangDetectException:
        return "n/a"

In [0]:
# Label every row with the language code detected from its content.
pd_df['language'] = pd_df['content'].map(detect_language)

In [0]:
# googletrans client and the English stop-word set used to filter tokens.
translator = Translator()
stop_words = set(stopwords.words('english'))

# langdetect labels Korean with the ISO 639-1 code "ko"; "kr" is not a code it
# emits, so comparing against "kr" would never filter anything out.
KOREAN_LANGUAGE_CODE = "ko"

new_rows = []
skipped_rows = 0  # rows dropped because content was missing or translation failed
# loop through each row
for index, row in pd_df.iterrows():
    # filter out korean reviews to focus on foreigners' experience
    if row['language'] == KOREAN_LANGUAGE_CODE:
        continue

    content = row['content']
    # Non-string content (e.g. None/NaN) can be neither lower-cased nor translated.
    if not isinstance(content, str):
        skipped_rows += 1
        continue

    # if review was written in english, skip translation
    if row['language'] == "en":
        written_language = "en"
        content_translated = content.lower()
    else:
        # strip emoji and URLs before sending the text to the translator
        content_to_translate = remove_urls(demoji.replace(content, ""))
        try:
            trans_result = translator.translate(content_to_translate, dest="en")
            written_language = trans_result.src
            content_translated = trans_result.text.lower()
        # Best effort: a failed translation drops the row, but the drop is
        # counted, and KeyboardInterrupt/SystemExit are no longer swallowed.
        except Exception:
            skipped_rows += 1
            continue

    # split content into sentences and into stop-word-filtered word tokens
    sentences = sent_tokenize(content_translated)
    words = word_tokenize(content_translated)
    filtered_words = [w for w in words if w not in stop_words]
    new_rows.append({
        'url': row['url'],
        'content': content,
        'content_translated': content_translated,
        'sentences': sentences,
        'words': filtered_words,
        'created_datetime_unix': row['created_datetime_unix'],
        'score': row['score'],
        'category': row['category'],
        'language': written_language,
        'created_datetime': row['created_datetime'],
    })

# Report how many rows were kept and how many were dropped.
print(f"Kept {len(new_rows)} rows; skipped {skipped_rows} (missing content or failed translation).")

# create dataframe
sentence_df = pd.DataFrame(new_rows)

In [0]:
# Convert the pandas result to a Spark DataFrame and append a
# current-timestamp column named "timestamp".
spark_df = (
    spark.createDataFrame(sentence_df)
    .withColumn("timestamp", current_timestamp())
)

In [0]:
# Overwrite the silver table with this run's output, stored as Delta with the
# mergeSchema option enabled.
silver_writer = (
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("workspace.growth_poc.silver_reddit_reviews")