In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:10 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [607 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 P

In [2]:
# Initialise findspark so that pyspark can be imported in the next cell.
# No Spark session is started here.
# NOTE(review): presumably findspark locates Spark through the SPARK_HOME
# environment variable set in the setup cell - confirm.
import findspark
findspark.init()

In [3]:
# Create the Spark session for this notebook (or reuse an existing one)
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("TF-IDF")
    .getOrCreate()
)

In [5]:
from pyspark import SparkFiles

# Download the gzipped, tab-separated Amazon US Music reviews file and
# register it with Spark.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# SparkFiles.get() takes the name of the added file. Passing "" resolves to
# the SparkFiles root directory instead, so the reader would pick up every
# file in that directory rather than just this dataset.
file_name = url.rsplit("/", 1)[-1]

# Read with a header row and let Spark infer the column types.
music_df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(file_name), sep="\t", header=True, inferSchema=True)
music_df.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   10140119|R3LI5TRP3YIDQL|B00TXH4OLC|     384427924|Whatever's for Us...|           Music|          5|            0|          0|   N|                Y|          Five Stars|Love this CD alon...| 2015-08-31|
|         US|   27664622|R3LGC3EKEG84PX|B00B6QXN6U|     831769051|Same Trailer Diff...|           Music|          5|    

In [12]:
# Keep only the columns needed for the Vine analysis
vine_df = music_df.select(["review_id", "star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase"])

# Filter Total Votes: keep reviews with at least 20 total votes.
# The condition references vine_df's own column (not the parent music_df's)
# so the filter does not depend on column lineage from another DataFrame.
votes_df = vine_df.filter(vine_df["total_votes"] >= 20)
votes_df.show(10)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2SHXRL6SL1GC9|          3|           25|         26|   N|                Y|
|R2ZC033X86YOY8|          5|           25|         26|   N|                N|
|R2736RJGCOSL80|          5|           19|         20|   N|                Y|
| RRY5DJ6J9BKAX|          5|           19|         21|   N|                Y|
|R2P4PJJ2ROTPBM|          5|           46|         48|   N|                N|
| RO8RAEGB66BKR|          4|           46|         46|   N|                N|
| RRFZ7QZTRJC59|          5|          292|        300|   N|                N|
| RFN4PNRUD1UW2|          4|           21|         22|   N|                N|
| RO7EBJEP7IHIX|          5|           26|         26|   N|                N|
|R1CVS4MK9RTNNP|          2|           11|         22|   N|     

In [14]:
# Filter: keep reviews whose helpful votes are at least 50% of total votes
helpful_share = votes_df["helpful_votes"] / votes_df["total_votes"]
total_df = votes_df.filter(helpful_share >= 0.5)
total_df.show(10)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2SHXRL6SL1GC9|          3|           25|         26|   N|                Y|
|R2ZC033X86YOY8|          5|           25|         26|   N|                N|
|R2736RJGCOSL80|          5|           19|         20|   N|                Y|
| RRY5DJ6J9BKAX|          5|           19|         21|   N|                Y|
|R2P4PJJ2ROTPBM|          5|           46|         48|   N|                N|
| RO8RAEGB66BKR|          4|           46|         46|   N|                N|
| RRFZ7QZTRJC59|          5|          292|        300|   N|                N|
| RFN4PNRUD1UW2|          4|           21|         22|   N|                N|
| RO7EBJEP7IHIX|          5|           26|         26|   N|                N|
|R1CVS4MK9RTNNP|          2|           11|         22|   N|     

In [15]:
# Paid Vine program: reviews flagged vine == 'Y', with summary statistics
from pyspark.sql.functions import col, avg
paid_df = total_df.filter(col('vine') == 'Y')
paid_summary = paid_df.describe()
paid_summary.show()

+-------+--------------+------------------+------------------+------------------+----+-----------------+
|summary|     review_id|       star_rating|     helpful_votes|       total_votes|vine|verified_purchase|
+-------+--------------+------------------+------------------+------------------+----+-----------------+
|  count|             7|                 7|                 7|                 7|   7|                7|
|   mean|          null|3.5714285714285716|26.857142857142858|31.714285714285715|null|             null|
| stddev|          null|0.5345224838248488|13.582201238245238|14.032275720807439|null|             null|
|    min|R1482JAU1ZR7QH|                 3|                15|                21|   Y|                N|
|    max| RXGU9DSKZJSP0|                 4|                54|                58|   Y|                N|
+-------+--------------+------------------+------------------+------------------+----+-----------------+



In [16]:
# Unpaid (non-Vine) reviews: flagged vine == 'N', with summary statistics
from pyspark.sql.functions import col, avg
unpaid_df = total_df.filter(col('vine') == 'N')
unpaid_summary = unpaid_df.describe()
unpaid_summary.show()

+-------+--------------+------------------+------------------+------------------+------+-----------------+
|summary|     review_id|       star_rating|     helpful_votes|       total_votes|  vine|verified_purchase|
+-------+--------------+------------------+------------------+------------------+------+-----------------+
|  count|        105979|            105979|            105979|            105979|105979|           105979|
|   mean|          null| 4.203021353286972| 34.48632276205663|38.640674095811434|  null|             null|
| stddev|          null|1.2847072310247238|35.853572156291975|38.399905374092576|  null|             null|
|    min|R1000IP7OT9CZE|                 1|                10|                20|     N|                N|
|    max| RZZZY0N4QNCXE|                 5|              2181|              2246|     N|                Y|
+-------+--------------+------------------+------------------+------------------+------+-----------------+



In [17]:
# Paid - 5-star reviews of Vine program
# Count the Vine reviews rated exactly 5 stars and all Vine reviews.
paid5star = paid_df.filter(paid_df['star_rating'] == 5).count()
paid_number = paid_df.count()
# Despite its name, this is a fraction between 0 and 1, not a percentage.
# Guard the division: if no Vine review passed the earlier filters the share
# is reported as 0.0 instead of raising ZeroDivisionError.
percentage5star = float(paid5star) / float(paid_number) if paid_number else 0.0
print(paid_number)
print(paid5star)
print(percentage5star)

7
0
0.0


In [18]:
# Unpaid - 5-star reviews of Non-Vine program
# Count the non-Vine reviews rated exactly 5 stars and all non-Vine reviews.
unpaid5star = unpaid_df.filter(unpaid_df['star_rating'] == 5).count()
unpaid_number = unpaid_df.count()
# Despite its name, this is a fraction between 0 and 1, not a percentage.
# Guard the division: if no non-Vine review passed the earlier filters the
# share is reported as 0.0 instead of raising ZeroDivisionError.
non_percentage5star = float(unpaid5star) / float(unpaid_number) if unpaid_number else 0.0
print(unpaid_number)
print(unpaid5star)
print(non_percentage5star)

105979
67580
0.6376735013540419
