In [None]:
#%env GOOGLE_APPLICATION_CREDENTIALS=/home/pitfox/data/spark-container-dev-f5d53ab2439c.json

In [None]:
# Echo the current value of GOOGLE_APPLICATION_CREDENTIALS through the shell.
!echo $GOOGLE_APPLICATION_CREDENTIALS

In [None]:
# Echo the current value of SPARK_HOME through the shell.
!echo $SPARK_HOME

In [None]:
# Print the contents of conf/spark-defaults.conf under SPARK_HOME.
!cat $SPARK_HOME/conf/spark-defaults.conf

In [1]:
# import findspark
# findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, ArrayType, LongType
import pyspark.sql.functions as f
import re
import requests
from bs4 import BeautifulSoup

In [None]:
#spark.stop()

In [2]:
# Create (or reuse) the Spark session, putting the BigQuery connector jar
# on its classpath through the spark.jars setting.
spark = (
    SparkSession.builder
    .appName("Load Review Files")
    .config("spark.jars", "/opt/spark/jars/spark-bigquery-with-dependencies_2.12-0.20.0.jar")
    .getOrCreate()
)

In [3]:
# Update to your GCS bucket; the gs:// path is derived from the bucket name.
gcs_bucket = 'amazon_reviews_bucket'
gcs_filepath = 'gs://' + gcs_bucket

In [4]:
# Schema applied when parsing the review files: one (column name, Spark type)
# entry per field, every field declared nullable.
_review_columns = [
    ("asin", StringType()),
    ("image", ArrayType(StringType())),
    ("overall", DoubleType()),
    ("reviewText", StringType()),
    ("reviewTime", StringType()),
    ("reviewerID", StringType()),
    ("reviewerName", StringType()),
    ("summary", StringType()),
    ("unixReviewTime", LongType()),
    ("verified", BooleanType()),
    ("vote", StringType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _review_columns]
)

In [None]:
# Scrape the dataset index page for its "5-core" download links, load every
# linked review file with the shared `schema`, and stack the per-file results
# into `output_final`.
index_url = "https://nijianmo.github.io/amazon/index.html"
html = requests.get(index_url, timeout=60)
# Stop on an HTTP error instead of carrying on without a freshly parsed page.
html.raise_for_status()
soup = BeautifulSoup(html.content, 'html.parser')

output_final = None
files = []
sources = []  # (file name, download URL) pairs, kept together for the load loop
links = soup.find_all('a', string='5-core')
for link in links:
    url = link.get('href')
    if not url:
        # An anchor without an href has nothing to download.
        continue
    file = url.split('/')[-1]
    print(url)
    print(file)
    # Hand the URL to Spark so SparkFiles.get() can resolve the file below.
    spark.sparkContext.addFile(url)
    files.append(file)
    sources.append((file, url))

if not sources:
    raise RuntimeError(f"no '5-core' links found at {index_url}")

# Iterate over the (file, url) pairs so each row records the URL its own file
# came from, rather than whatever `url` was left over from the loop above.
for file, url in sources:
    df = spark.read.json("file://" + SparkFiles.get(file), schema)
    df = df.dropDuplicates()
    df = df.withColumn('review_wordCount', f.size(f.split(f.col('reviewText'), ' ')))
    # createOrReplaceTempView is the non-deprecated form of registerTempTable.
    df.createOrReplaceTempView("dataframe")
    sql_script = """select
              asin || '-' || reviewerID || row_number() OVER (PARTITION BY asin, reviewerID ORDER BY unixReviewTime asc) as review_ID,
              asin as product_ID,
              reviewerID as reviewer_ID,
              overall as rating_out_of_5,
              summary as review_summary,
              reviewText as review_text,
              review_wordCount as review_word_count
            from dataframe"""
    # The scraped file name and URL are attached as literal columns instead of
    # being spliced into the SQL text, so quotes in them cannot break the query.
    # The column order matches the original select (union below is positional).
    # NOTE(review): `category` holds the full file name (e.g. *_5.json.gz) —
    # confirm that is the intended value.
    output = spark.sql(sql_script).select(
        f.lit(file).alias('category'),
        'review_ID',
        'product_ID',
        'reviewer_ID',
        'rating_out_of_5',
        'review_summary',
        'review_text',
        'review_word_count',
        f.lit(url).alias('source_url'),
    )
    if output_final is None:
        output_final = output
        print(f'loaded first file: {file}')
    else:
        output_final = output_final.union(output)
        print(f'appended to df with: {file}')

http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/AMAZON_FASHION_5.json.gz
AMAZON_FASHION_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty_5.json.gz
All_Beauty_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Appliances_5.json.gz
Appliances_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz
Arts_Crafts_and_Sewing_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Automotive_5.json.gz
Automotive_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Books_5.json.gz
Books_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/CDs_and_Vinyl_5.json.gz
CDs_and_Vinyl_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz
Cell_Phones_and_Accessories_5.json.gz
http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Clothing_Shoes_and_Jewelry_5.json.gz
Clothing_Shoes_and_Jewelry_5.json.gz
http:/

In [None]:
# Preview the first 5 rows of the combined result.
output_final.show(5)

In [None]:
# Append the combined rows to the amazon_reviews.categoryFilesSmall table
# through the "bigquery" format, with `gcs_bucket` as the temporary bucket.
(
    output_final.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .mode("append")
    .save("amazon_reviews.categoryFilesSmall")
)

In [None]:
# spark.sparkContext._jvm.scala.util.Properties.versionString()

In [None]:
# Category names used to build single-file download URLs in the cells below.
category = [
    'AMAZON_FASHION',
    'All_Beauty',
]

In [None]:
# Download URL of the 5-core file for the second entry of `category` (index 1).
url = f"http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/{category[1]}_5.json.gz"

In [None]:
# Register the file at `url` with the SparkContext; the read below looks it up via SparkFiles.get().
spark.sparkContext.addFile(url)

In [None]:
# Read the added file as JSON. No schema is passed here, so the column types are inferred by Spark.
df = spark.read.json("file://"+SparkFiles.get(f"{category[1]}_5.json.gz"))

In [None]:
# Print the schema of the DataFrame that was just read.
df.printSchema()

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, DoubleType, BooleanType, ArrayType, LongType

In [None]:
# Explicit schema for the review files; every field is nullable.
# The timestamp field is named "unixReviewTime" — the name the SQL in this
# notebook orders by. The previous spelling "unixReviewedTime" matched no
# such column.
schema = StructType([
    StructField("asin",StringType(),True),
    StructField("image",ArrayType(StringType()),True),
    StructField("overall",DoubleType(),True),
    StructField("reviewText",StringType(),True),
    StructField("reviewTime",StringType(),True),
    StructField("reviewerID",StringType(),True),
    StructField("reviewerName",StringType(),True),
    StructField("summary",StringType(),True),
    StructField("unixReviewTime",LongType(),True),
    StructField("verified",BooleanType(),True),
    StructField("vote",StringType(),True)
  ])

In [None]:
# Preview the first 10 rows of `df`.
df.show(10)

In [None]:
# Remove fully duplicated rows; `df` is rebound to the de-duplicated DataFrame.
df = df.dropDuplicates() 

In [None]:
# Build the output rows for `category[1]`: de-duplicate, add a word count for
# the review text, then project and rename the columns through Spark SQL.
df = df.dropDuplicates() 
df = df.withColumn('review_wordCount', f.size(f.split(f.col('reviewText'), ' ')))
# createOrReplaceTempView is the non-deprecated form of registerTempTable.
df.createOrReplaceTempView("dataframe")
# review_ID joins asin, reviewerID and a per-(asin, reviewerID) row number
# ordered by unixReviewTime.
sql_script = f"""select 
          '{category[1]}' as category,
          asin || '-' || reviewerID || row_number() OVER (PARTITION BY asin, reviewerID ORDER BY unixReviewTime asc) as review_ID,
          asin as product_ID,
          reviewerID as reviewer_ID,
          overall as rating_out_of_5,
          summary as review_summary,
          reviewText as review_text,
          review_wordCount as review_word_count,     
          '{url}' as source_url
        from dataframe"""
output = spark.sql(sql_script)


In [None]:
# Expose `df` to Spark SQL under the name "dataframe".
# createOrReplaceTempView is the non-deprecated form of registerTempTable.
df.createOrReplaceTempView("dataframe")

In [None]:
# SQL that projects and renames the review columns from the "dataframe" view.
# `category[1]` and `url` are interpolated as string literals, and review_ID
# joins asin, reviewerID and a per-(asin, reviewerID) row number ordered by
# unixReviewTime.
sql_script = f"""select 
          '{category[1]}' as category,
          asin || '-' || reviewerID || row_number() OVER (PARTITION BY asin, reviewerID ORDER BY unixReviewTime asc) as review_ID,
          asin as product_ID,
          reviewerID as reviewer_ID,
          overall as rating_out_of_5,
          summary as review_summary,
          reviewText as review_text,
          review_wordCount as review_word_count,     
          '{url}' as source_url
        from dataframe"""

In [None]:
# Run the query held in `sql_script` and keep the resulting DataFrame as `output`.
output = spark.sql(sql_script)

In [None]:
# Preview the first 3 rows of `output`.
output.show(3)

In [None]:
# Update to your GCS bucket; the gs:// path is derived from the bucket name.
gcs_bucket = 'amazon_reviews_bucket'

gcs_filepath = 'gs://' + gcs_bucket

In [None]:
# Append the single-category rows to the amazon_reviews.categoryFilesSmall
# table through the "bigquery" format, with `gcs_bucket` as the temporary bucket.
(
    output.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .mode("append")
    .save("amazon_reviews.categoryFilesSmall")
)

In [None]:
# output.write.partitionBy("product_ID").csv(gcs_filepath)

In [None]:
# output.write \
#   .mode('overwrite') \
#   .csv(gcs_filepath)

In [None]:
import wget

In [None]:
# Fetch the file at `url` using the wget package.
# NOTE(review): `url` is whatever an earlier cell last assigned — confirm it is the intended file.
wget.download(url)

In [None]:
import gzip
import shutil

# Decompress `file` next to itself. The output name keeps only the first two
# dot-separated parts of the input name.
decompressed_name = '.'.join(file.split('.')[0:2])
with gzip.open(file, 'rb') as gz_in, open(decompressed_name, 'wb') as raw_out:
    shutil.copyfileobj(gz_in, raw_out)

In [None]:
# Display the current value of `file`.
file

In [None]:
# Display `file` reduced to its first two dot-separated parts.
'.'.join(file.split('.')[0:2])

In [None]:
# Read the JSON file that SparkFiles resolves for `file`, applying `schema`.
# NOTE(review): `file` and `schema` are whatever earlier cells last assigned — confirm the intended values.
df = spark.read.json("file://"+SparkFiles.get(file),schema)

In [None]:
# Reduced schema with only the rating, vote and verified columns.
# The types follow the full review schema declared earlier in this notebook
# (overall: double, vote: string). The previous IntegerType was never imported
# and raised NameError.
schema = StructType([
    StructField("overall",DoubleType(),True),
    StructField("vote",StringType(),True),
    StructField("verified",BooleanType(),True)
  ])

In [None]:
# Preview `df` with show()'s default row limit.
df.show()