In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, ArrayType, LongType
import pyspark.sql.functions as f
import re
import requests
from bs4 import BeautifulSoup

In [None]:
# Google Cloud Storage settings: the bucket name, and the gs:// URI derived from it
gcs_bucket = 'amazon_reviews_bucket'
gcs_filepath = 'gs://' + gcs_bucket

In [None]:
# Build the Spark session for this notebook and attach the BigQuery connector
# jar so that the "bigquery" data source format is available for writes.
spark = (
    SparkSession.builder
    .appName("Load Review Files")
    .config("spark.jars", "/opt/spark/jars/spark-bigquery-with-dependencies_2.12-0.20.0.jar")
    .getOrCreate()
)

In [None]:
# Explicit schema for the review JSON files.
# Schema inference cannot be used: it fails on duplicate keys in the `style`
# field, so that field is declared as an array of strings instead.
# Every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("asin", StringType()),
        ("image", ArrayType(StringType())),
        ("overall", DoubleType()),
        ("reviewText", StringType()),
        ("reviewTime", StringType()),
        ("reviewerID", StringType()),
        ("reviewerName", StringType()),
        ("summary", StringType()),
        ("unixReviewTime", LongType()),
        ("verified", BooleanType()),
        ("vote", StringType()),
        ("style", ArrayType(StringType())),
    )
])

In [None]:
# URL of the index page that is scraped for the review files to download
url = "https://nijianmo.github.io/amazon/index.html"

# Fetch the page. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on a non-2xx response; previously a failed
# request left `soup` undefined and surfaced later as an unrelated NameError.
html = requests.get(url, timeout=30)
html.raise_for_status()

# Parse the returned HTML so the download links can be extracted from it
soup = BeautifulSoup(html.content, 'html.parser')

# Parse website for '5-core' links and add each file to the SparkContext for later download.
# Store the list of file names, and remember which URL each file came from so the
# correct source_url can be attached to its rows.
output_final = []
files = []
source_urls = {}
links = soup.find_all('a', string='5-core')
for link in links:
    file_url = link.get('href')
    if not file_url:
        # Anchor without an href: nothing to download
        continue
    file = file_url.split('/')[-1]
    print(file_url)
    print(file)
    spark.sparkContext.addFile(file_url)
    files.append(file)
    source_urls[file] = file_url

# Projection applied to every file: renames the raw fields to the standard output
# columns and builds review_ID from asin, reviewerID and the review's position in
# that reviewer's history (ordered by unixReviewTime).
# category and source_url are real columns added per file below, so no per-file
# value is spliced into the SQL text (a quote in a file name or URL cannot break it).
sql_script = """select
              category,
              asin || '-' || reviewerID || '-' || row_number() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime asc) as review_ID,
              asin as product_ID,
              reviewerID as reviewer_ID,
              overall as rating_out_of_5,
              summary as review_summary,
              reviewText as review_text,
              review_wordCount as review_word_count,
              verified,
              vote,
              reviewTime,
              unixReviewTime,
              image,
              source_url
            from dataframe"""

# Loop through each file, delete duplicates caused by multiple product IDs for the
# same product (e.g. different color), calculate the word count of each review,
# create the standard fields and append the result to BigQuery, staging the write
# through the GCS bucket.
loaded = []
for file in files:
    df = spark.read.json("file://" + SparkFiles.get(file), schema)
    df = df.dropDuplicates(['reviewerID', 'overall', 'summary', 'reviewText'])
    # Word count = number of space-separated tokens in the review text.
    # NOTE(review): a null reviewText may yield -1 here depending on Spark's
    # size-of-null setting — confirm whether that is acceptable downstream.
    df = df.withColumn('review_wordCount', f.size(f.split(f.col('reviewText'), ' ')))
    # Category is the file name up to its first '.'; source_url is the link this
    # file was downloaded from (previously every file got the last link's URL).
    df = df.withColumn('category', f.lit(file.split('.')[0]))
    df = df.withColumn('source_url', f.lit(source_urls[file]))
    # createOrReplaceTempView replaces the deprecated registerTempTable
    df.createOrReplaceTempView("dataframe")
    output = spark.sql(sql_script)
    print(f'Data load started for {file}')
    (
        output.write
        .format("bigquery")
        .option("temporaryGcsBucket", gcs_bucket)
        .mode("append")
        .save("amazon_reviews.categoryFilesSmall")
    )
    print(f'Data loaded to Bigquery for {file}')
    loaded.append(file)