In [0]:
%pip install nltk 

In [0]:
import nltk

In [0]:
# Download the NLTK Punkt sentence-tokenizer resources (only needs to be done
# once per environment). Both the 'punkt' and 'punkt_tab' packages are
# requested; presumably which one sent_tokenize needs depends on the installed
# NLTK version — TODO confirm and drop whichever is unused.
nltk.download('punkt')
nltk.download('punkt_tab')

In [0]:
from nltk.tokenize import sent_tokenize

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, current_timestamp
import pandas as pd

In [0]:

# Reuse the active Spark session if one exists, otherwise create one with the
# application name "playstore_load".
spark = (
    SparkSession.builder
    .appName("playstore_load")
    .getOrCreate()
)

In [0]:
# Load the bronze Play Store reviews table and print its schema for inspection.
bronze_table_name = "workspace.growth_poc.bronze_playstore_reviews"
bronze_df = spark.table(bronze_table_name)
bronze_df.printSchema()

In [0]:
# Enforce the expected column types on the bronze data. Each listed column is
# replaced in place (same name) by its cast/parsed version; all other columns
# pass through unchanged.
int_columns = ['score', 'thumbsUpCount']
datetime_columns = ['at', 'repliedAt']

schema_enforced_df = bronze_df

for c in int_columns:
    # Cast by type name: "int" is Spark's IntegerType. Using the string form
    # avoids depending on `IntegerType`, which this notebook never imports
    # (the previous `IntegerType()` call raised NameError on a fresh kernel).
    schema_enforced_df = schema_enforced_df.withColumn(c, col(c).cast("int"))
for c in datetime_columns:
    # Parse to a timestamp using Spark's default timestamp parsing.
    schema_enforced_df = schema_enforced_df.withColumn(c, to_timestamp(col(c)))


In [0]:
# Convert the Spark DataFrame to a pandas DataFrame to perform the row-level
# split of each review's content into sentences.
# NOTE(review): toPandas() brings every row to the driver, so this presumably
# assumes the reviews table fits in driver memory — confirm for larger loads.
pd_df = schema_enforced_df.toPandas()

In [0]:
# Explode every review into one output row per sentence. Each output row keeps
# all of the review's original fields (including the full `content`) and adds
# the individual `sentence`.
new_rows = []
for _, row in pd_df.iterrows():
    review_text = row["content"]
    # A null `content` arrives from Spark as None/NaN, and sent_tokenize raises
    # TypeError on anything that is not a string. Such reviews have no
    # sentences, so skip them — the same outcome an empty string already has
    # (it tokenizes to an empty list and contributes no rows).
    if not isinstance(review_text, str):
        continue
    for sentence in sent_tokenize(review_text):
        new_rows.append(
            {
                "appName": row["appName"],
                "appVersion": row["appVersion"],
                "at": row["at"],
                "content": review_text,
                "sentence": sentence,
                "language": row["language"],
                "repliedAt": row["repliedAt"],
                "replyContent": row["replyContent"],
                "reviewCreatedVersion": row["reviewCreatedVersion"],
                "reviewId": row["reviewId"],
                "score": row["score"],
                "thumbsUpCount": row["thumbsUpCount"],
                "userImage": row["userImage"],
                "userName": row["userName"],
            }
        )

In [0]:
# Turn the collected sentence rows into a pandas DataFrame, then into a Spark
# DataFrame with a "timestamp" column set to the current timestamp.
sentence_df = pd.DataFrame(new_rows)
spark_df = (
    spark.createDataFrame(sentence_df)
    .withColumn("timestamp", current_timestamp())
)

In [0]:
# Save the sentence-level reviews as a Delta table, overwriting any existing
# contents of the silver table.
silver_writer = (
    spark_df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("workspace.growth_poc.silver_playstore_reviews")