In [0]:
#%pip install nltk 

In [0]:
import nltk

In [0]:
# Fetch the Punkt tokenizer resources from NLTK.
# This is a one-time download per environment; repeating it is harmless.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [0]:
from nltk.tokenize import sent_tokenize

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, current_timestamp
from pyspark.sql.types import StructType, IntegerType
import pandas as pd

In [0]:

# Reuse the active Spark session if there is one, otherwise create it.
# NOTE(review): the app name says "playstore" while this notebook reads and
# writes appstore tables -- confirm whether the name is intentional.
spark = (
    SparkSession.builder
    .appName("playstore_load")
    .getOrCreate()
)

In [0]:
# Raw (bronze) reviews table this notebook reads from.
BRONZE_TABLE = "workspace.growth_poc.bronze_appstore_reviews"

bronze_df = spark.table(BRONZE_TABLE)
# Print the nested schema so the struct layout is visible before flattening.
bronze_df.printSchema()

In [0]:
def flatten_columns(schema, prefix = ""):
    """Build a flat list of aliased column expressions from a nested schema.

    Walks ``schema.fields`` depth-first. Struct fields are descended into;
    every other field yields ``col(<dotted path>)`` aliased with a flat name
    derived from the path by turning ``.`` into ``_`` and then removing every
    ``_label`` and ``im:`` substring.

    Args:
        schema: Object exposing ``fields`` whose items have ``name`` and
            ``dataType`` (e.g. a ``StructType``).
        prefix: Dotted path of the enclosing struct; empty at the top level.

    Returns:
        list: Aliased column expressions in schema order, suitable for
        ``DataFrame.select``.
    """
    flattened = []
    for field in schema.fields:
        # Dotted path that addresses this field from the root of the row.
        path = field.name if prefix == "" else f"{prefix}.{field.name}"

        if not isinstance(field.dataType, StructType):
            # Leaf field: expose it under a flat, cleaned-up alias.
            flat_name = path.replace('.', '_').replace('_label', '').replace('im:', '')
            flattened.append(col(path).alias(flat_name))
            continue

        # Nested struct: recurse, carrying the path down as the new prefix.
        flattened.extend(flatten_columns(field.dataType, path))

    return flattened

# Helpfulness counters in the review feed (the "im:" prefix is stripped by
# flatten_columns, so they arrive here as voteCount / voteSum):
#   im:voteCount - total number of customers who voted on a review,
#                  whether they chose "Helpful" or "Not Helpful".
#   im:voteSum   - related value counting only the users who found the
#                  review helpful.
rename_columns = {
    "voteCount": "totalVote",
    "voteSum": "thumbsUpCount",
}

# Flatten every nested struct into top-level columns, then apply the renames.
flat_df = (
    bronze_df
    .select(flatten_columns(bronze_df.schema))
    .withColumnsRenamed(rename_columns)
)


In [0]:
# Enforce column types on the flattened frame.
int_columns = ['rating', 'thumbsUpCount', 'totalVote']
datetime_columns = ['updated']

# Apply every cast in a single withColumns() call. Each withColumn() call adds
# its own projection to the plan, and the PySpark docs advise against calling
# it in a loop; one call keeps the plan flat. The two column lists are
# disjoint, so the result matches applying the casts one after another.
typed_columns = {name: col(name).cast(IntegerType()) for name in int_columns}
typed_columns.update({name: to_timestamp(name) for name in datetime_columns})

flat_df = flat_df.withColumns(typed_columns)


In [0]:
# Convert the Spark DataFrame to a pandas DataFrame to perform the row-level
# content split (sentence tokenization) in plain Python.
# NOTE(review): toPandas() brings every row to the driver -- confirm the
# reviews table is small enough for that.
pd_df = flat_df.toPandas()

In [0]:
# Column order of every sentence-level record. "sentence" is the only derived
# field; all other names are copied unchanged from the review row.
SENTENCE_ROW_COLUMNS = [
    "author",
    "author_name",
    "author_uri",
    "updated",
    "rating",
    "version",
    "id",
    "title",
    "content_attributes_type",
    "content",
    "sentence",
    "link_attributes_href",
    "link_attributes_rel",
    "thumbsUpCount",
    "contentType_attributes",
    "contentType_attributes_term",
    "totalVote",
    "appName",
    "country",
    "language",
]


def _sentence_record(row, sentence):
    """Return one output record: the review's columns plus the given sentence.

    Args:
        row: A review row supporting ``row[column_name]`` lookups.
        sentence: Text to store in the "sentence" field.

    Returns:
        dict: Keys in ``SENTENCE_ROW_COLUMNS`` order.
    """
    return {
        name: sentence if name == "sentence" else row[name]
        for name in SENTENCE_ROW_COLUMNS
    }


def _review_to_sentence_records(row):
    """Expand one review into a title record plus one record per sentence.

    Args:
        row: A review row with at least the columns in
            ``SENTENCE_ROW_COLUMNS`` (except "sentence").

    Returns:
        list[dict]: The title record first, then one record for each sentence
        of the review body, in order.
    """
    # The title counts as content too, so it always gets its own record.
    records = [_sentence_record(row, row["title"])]

    content = row["content"]
    # sent_tokenize() expects a string. A review with a missing body
    # (None/NaN after the pandas conversion) keeps its title record and simply
    # contributes no body sentences instead of aborting the whole run.
    if isinstance(content, str):
        records.extend(
            _sentence_record(row, sentence) for sentence in sent_tokenize(content)
        )
    return records


new_rows = []
for _, row in pd_df.iterrows():
    new_rows.extend(_review_to_sentence_records(row))

In [0]:
# One pandas row per title/sentence record collected in new_rows.
sentence_df = pd.DataFrame(data=new_rows)

# Back to Spark, stamping each row with the time of this load.
spark_df = (
    spark.createDataFrame(sentence_df)
    .withColumn("timestamp", current_timestamp())
)

In [0]:
# Destination (silver) table for the sentence-level reviews.
SILVER_TABLE = "workspace.growth_poc.silver_appstore_reviews"

# Overwrite the Delta table with this run's rows; mergeSchema is enabled on the write.
(
    spark_df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable(SILVER_TABLE)
)