In [0]:
#%pip install langdetect
#%pip install kss
#%pip install nltk 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from langdetect import detect
from datetime import datetime
import pandas as pd
import kss
import nltk


In [0]:
# Fetch the NLTK resources requested for sentence tokenization.
# Both 'punkt' and 'punkt_tab' are downloaded, in that order.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [0]:
from nltk.tokenize import sent_tokenize

In [0]:
# Obtain the Spark session for this job, registered under the app name
# "silver_reddit_load".
spark = (
    SparkSession.builder
    .appName("silver_reddit_load")
    .getOrCreate()
)

In [0]:
# Load the bronze table that serves as the input for this notebook.
bronze_df = spark.table(
    "workspace.growth_poc.bronze_reddit"
)

In [0]:
# Pull the bronze data into pandas so langdetect can be applied row by row.
# The source "created_datetime" column is renamed to "created_datetime_unix";
# a new "created_datetime" column is then derived from it by casting to float
# and interpreting the value as seconds (unit='s').
pd_df = (
    bronze_df
    .toPandas()
    .rename(columns={"created_datetime": "created_datetime_unix"})
)
pd_df['created_datetime'] = pd.to_datetime(
    pd_df['created_datetime_unix'].astype(float),
    unit='s',
)

In [0]:
def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Args:
        text: The value to classify. Anything that is not a non-blank
            string (None, NaN, numbers, "", whitespace-only) is treated as
            undetectable.

    Returns:
        The code returned by ``detect(text)``, or the placeholder ``"n/a"``
        when the input is missing/blank or detection fails.

    NOTE(review): the langdetect README states results can vary between runs
    unless ``DetectorFactory.seed`` is set -- consider seeding it once at the
    top of the notebook if reproducible output matters.
    """
    # Reject missing / non-text values up front instead of relying on the
    # detector raising for them.
    if not isinstance(text, str) or not text.strip():
        return "n/a"
    try:
        return detect(text)
    # Best effort: any detection failure maps to "n/a". Catch Exception rather
    # than using a bare except so KeyboardInterrupt / SystemExit still
    # propagate and a long-running apply can be interrupted.
    except Exception:
        return "n/a"

In [0]:
# Tag every row with the language detected for its 'content' value
# ("n/a" when detect_language cannot determine one).
pd_df['language'] = pd_df['content'].map(detect_language)

In [0]:
# Explode each post into one output row per sentence. Every sentence row
# carries the original post fields alongside the extracted sentence.
new_rows = []
for _, row in pd_df.iterrows():
    content = row['content']
    # Neither tokenizer accepts null/NaN content; such rows yield no
    # sentences, the same outcome as an empty string.
    if not isinstance(content, str):
        continue
    # langdetect reports ISO 639-1 codes, so Korean is 'ko' (not 'kr').
    # Korean text is split with kss; everything else goes through NLTK.
    if row['language'] == 'ko':
        sentences = kss.split_sentences(content)
    else:
        sentences = sent_tokenize(content)
    for sentence in sentences:
        new_rows.append({
            'url': row['url'],
            'content': content,
            'sentence': sentence,
            'created_datetime_unix': row['created_datetime_unix'],
            'score': row['score'],
            'category': row['category'],
            'language': row['language'],
            'created_datetime': row['created_datetime']
        })
# Build the sentence-level frame once from the accumulated records.
sentence_df = pd.DataFrame(new_rows)

In [0]:
# Convert the sentence-level pandas frame back to Spark and add a
# "timestamp" column populated by current_timestamp().
spark_df = (
    spark.createDataFrame(sentence_df)
    .withColumn("timestamp", current_timestamp())
)

In [0]:
# Save the result as the silver Delta table, using overwrite mode with the
# mergeSchema option enabled.
(
    spark_df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("workspace.growth_poc.silver_reddit_reviews")
)