In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!java -version

openjdk version "11.0.19" 2023-04-18
OpenJDK Runtime Environment (build 11.0.19+7-post-Ubuntu-0ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 11.0.19+7-post-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)


In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=16af88cee02575e96837abbdaa68c383cada801f86741a71089722ab5a1a7ea9
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [4]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [5]:
import findspark
findspark.init()
findspark.find()

'/usr/local/lib/python3.10/dist-packages/pyspark'

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
emptyRDD = spark.sparkContext.emptyRDD()
print(emptyRDD)

EmptyRDD[0] at emptyRDD at NativeMethodAccessorImpl.java:0


In [7]:
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
columns = ["language", "users_count"]
df = spark.createDataFrame(data).toDF(*columns)
df.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [8]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [9]:
file_path = "/content/drive/MyDrive/hackrx/"

In [10]:
from pyspark.sql import functions
from pyspark.sql import types

In [11]:
categories = spark.read.option("header",True).csv(file_path+'categories.csv')
info = spark.read.option("header",True).csv(file_path+'info.csv')
reviews = spark.read.option("header",True).csv(file_path+'reviews.csv')
similar = spark.read.option("header",True).csv(file_path+'similar.csv')

In [12]:
print(type(reviews))

<class 'pyspark.sql.dataframe.DataFrame'>


###User Charachteristics

Doing on small dataset containing 10000 items

In [None]:
"""
User charachteristics could contain:
  Customer_id
  group of purchase
  Voting Pattern
    Total votes on reviews
    Total helpful votes on reviews
"""


'\nUser charachteristics could contain:\n  Customer_id\n  group of purchase\n  Voting Pattern\n    Total votes on reviews\n    Total helpful votes on reviews\n'

In [13]:
reviews_red = reviews.limit(10000)
info_red = info.limit(10000)
cat_red = categories.limit(10000)
print(f"Reviews DF size {reviews_red.count()} info DF size { info_red.count()} category Df size {cat_red.count()}")


Reviews DF size 10000 info DF size 10000 category Df size 10000


In [14]:
cat_red = cat_red.withColumn("category", functions.split(functions.regexp_replace(functions.col("category"), r"\[.*?\]", ""), "\\|"))
cat_red.show(truncate=False)

+----------+--------------------------------------------------------------------------------------------------+
|ASIN      |category                                                                                          |
+----------+--------------------------------------------------------------------------------------------------+
|0827229534|[, Books, Subjects, Religion & Spirituality, Christianity, Clergy, Preaching]                     |
|0827229534|[, Books, Subjects, Religion & Spirituality, Christianity, Clergy, Sermons]                       |
|0738700797|[, Books, Subjects, Religion & Spirituality, Earth-Based Religions, Wicca]                        |
|0738700797|[, Books, Subjects, Religion & Spirituality, Earth-Based Religions, Witchcraft]                   |
|0486287785|[, Books, Subjects, Home & Garden, Crafts & Hobbies, General]                                     |
|0842328327|[, Books, Subjects, Religion & Spirituality, Christianity, Reference, Commentaries, New Test

In [15]:
def flatten_category(arr):
  return list(set(x.strip() for sublist in arr for x in sublist if x.strip()))

In [16]:
flattenUDF = functions.udf(lambda z: flatten_category(z), types.ArrayType(types.StringType()))

In [17]:
cat_red = cat_red.groupBy('ASIN').agg(functions.collect_set("category").alias("category"))
cat_red = cat_red.withColumn('category', flattenUDF(functions.col("category")))
cat_red.show(truncate=False)

+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ASIN      |category                                                                                                                                                                                                                                              |
+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0001474103|[Christianity, Books, Religion & Spirituality, Prophecies, Subjects, Reference]                                                                                                                                 

In [18]:
reviews_red.show()

+----------+--------------+------+-----+-------+
|      ASIN|   customer id|rating|votes|helpful|
+----------+--------------+------+-----+-------+
|0827229534|A2JW67OY8U6HHK|     5|   10|      9|
|0827229534|A2VE83MZF98ITY|     5|    6|      5|
|0738700797|A11NCO6YTE4BTJ|     5|    5|      4|
|0738700797| A9CQ3PLRNIR83|     4|    5|      5|
|0738700797|A13SG9ACZ9O5IM|     5|    8|      8|
|0738700797|A1BDAI6VEYMAZA|     5|    4|      4|
|0738700797|A2P6KAWXJ16234|     4|   16|     16|
|0738700797| AMACWC3M7PQFR|     4|    5|      5|
|0738700797|A3GO7UV9XX14D8|     4|    6|      6|
|0738700797|A1GIL64QK68WKL|     5|    8|      8|
|0738700797| AEOBOF2ONQJWV|     5|    8|      5|
|0738700797|A3IGHTES8ME05L|     5|    5|      5|
|0738700797|A1CP26N8RHYVVO|     1|   13|      9|
|0738700797| ANEIANH0WAT9D|     5|    1|      1|
|0486287785|A3IDGASRQAW8B2|     5|    2|      2|
|0842328327|A2591BUPXCS705|     4|    1|      1|
|0486220125| ATVPDKIKX0DER|     5|   12|     11|
|0486220125| AUEZ7NV

In [19]:
reviews_red = reviews_red.drop("rating")

In [20]:
reviews_red.columns

['ASIN', 'customer id', 'votes', 'helpful']

In [21]:
reviews_red.show()

+----------+--------------+-----+-------+
|      ASIN|   customer id|votes|helpful|
+----------+--------------+-----+-------+
|0827229534|A2JW67OY8U6HHK|   10|      9|
|0827229534|A2VE83MZF98ITY|    6|      5|
|0738700797|A11NCO6YTE4BTJ|    5|      4|
|0738700797| A9CQ3PLRNIR83|    5|      5|
|0738700797|A13SG9ACZ9O5IM|    8|      8|
|0738700797|A1BDAI6VEYMAZA|    4|      4|
|0738700797|A2P6KAWXJ16234|   16|     16|
|0738700797| AMACWC3M7PQFR|    5|      5|
|0738700797|A3GO7UV9XX14D8|    6|      6|
|0738700797|A1GIL64QK68WKL|    8|      8|
|0738700797| AEOBOF2ONQJWV|    8|      5|
|0738700797|A3IGHTES8ME05L|    5|      5|
|0738700797|A1CP26N8RHYVVO|   13|      9|
|0738700797| ANEIANH0WAT9D|    1|      1|
|0486287785|A3IDGASRQAW8B2|    2|      2|
|0842328327|A2591BUPXCS705|    1|      1|
|0486220125| ATVPDKIKX0DER|   12|     11|
|0486220125| AUEZ7NVOEHYRY|   13|     12|
|0486220125| ATVPDKIKX0DER|   18|      7|
|0486220125| AJYG6ZJUQPZ9M|   14|     10|
+----------+--------------+-----+-

In [22]:
user_char = reviews_red.join(info_red,on="ASIN",how="inner")
user_char = user_char.join(cat_red, on="ASIN", how="inner")
user_char.show()

+----------+--------------+-----+-------+--------------------+-----+---------+--------------------+
|      ASIN|   customer id|votes|helpful|               title|group|salesrank|            category|
+----------+--------------+-----+-------+--------------------+-----+---------+--------------------+
|0827229534|A2JW67OY8U6HHK|   10|      9|Patterns of Preac...| Book|   396585|[Preaching, Chris...|
|0827229534|A2VE83MZF98ITY|    6|      5|Patterns of Preac...| Book|   396585|[Preaching, Chris...|
|0738700797|A11NCO6YTE4BTJ|    5|      4|Candlemas: Feast ...| Book|   168596|[Wicca, Books, Re...|
|0738700797| A9CQ3PLRNIR83|    5|      5|Candlemas: Feast ...| Book|   168596|[Wicca, Books, Re...|
|0738700797|A13SG9ACZ9O5IM|    8|      8|Candlemas: Feast ...| Book|   168596|[Wicca, Books, Re...|
|0738700797|A1BDAI6VEYMAZA|    4|      4|Candlemas: Feast ...| Book|   168596|[Wicca, Books, Re...|
|0738700797|A2P6KAWXJ16234|   16|     16|Candlemas: Feast ...| Book|   168596|[Wicca, Books, Re...|


In [23]:
user_char.count()

10000

In [24]:
user_char = user_char.drop("ASIN","title","salesrank")
user_char.show()

+--------------+-----+-------+-----+--------------------+
|   customer id|votes|helpful|group|            category|
+--------------+-----+-------+-----+--------------------+
|A2JW67OY8U6HHK|   10|      9| Book|[Preaching, Chris...|
|A2VE83MZF98ITY|    6|      5| Book|[Preaching, Chris...|
|A11NCO6YTE4BTJ|    5|      4| Book|[Wicca, Books, Re...|
| A9CQ3PLRNIR83|    5|      5| Book|[Wicca, Books, Re...|
|A13SG9ACZ9O5IM|    8|      8| Book|[Wicca, Books, Re...|
|A1BDAI6VEYMAZA|    4|      4| Book|[Wicca, Books, Re...|
|A2P6KAWXJ16234|   16|     16| Book|[Wicca, Books, Re...|
| AMACWC3M7PQFR|    5|      5| Book|[Wicca, Books, Re...|
|A3GO7UV9XX14D8|    6|      6| Book|[Wicca, Books, Re...|
|A1GIL64QK68WKL|    8|      8| Book|[Wicca, Books, Re...|
| AEOBOF2ONQJWV|    8|      5| Book|[Wicca, Books, Re...|
|A3IGHTES8ME05L|    5|      5| Book|[Wicca, Books, Re...|
|A1CP26N8RHYVVO|   13|      9| Book|[Wicca, Books, Re...|
| ANEIANH0WAT9D|    1|      1| Book|[Wicca, Books, Re...|
|A3IDGASRQAW8B

In [25]:
user_char.select("customer id").distinct().count()

8299

In [26]:
user_char = user_char.groupBy("customer id").agg(functions.avg("votes").alias("avg_votes"),functions.avg("helpful").alias("avg_helpful_votes"),functions.collect_set("group").alias("group_set"),functions.collect_set("category").alias("category_set"))
user_char.show(truncate=False)

+--------------+---------+-----------------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|customer id   |avg_votes|avg_helpful_votes|group_set|category_set                                                                                                                                                                                                                              

In [27]:
user_char = user_char.withColumn('category_set', flattenUDF(functions.col("category_set")))
user_char.show(truncate=False)

+--------------+---------+-----------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|customer id   |avg_votes|avg_helpful_votes|group_set|category_set                                                                                                                                                                                                                                

In [28]:
user_char.count()

8299

###Item Charachteristics

Doing on small dataset containing 10000 items

In [None]:
"""
Item charachteristics could contain:
  ASIN
  Title ??
  Group
  Sales Rank
  Similar Items ??
  Categories
  Avg Rating

PS: I am not sure whether title should be kept or not since ASIN should be enough. Not req then drop it
    I am also not sure how similar item can be encoporated in this, so i am just adding it as a list
"""

'\nItem charachteristics could contain:\n  ASIN\n  Title ??\n  Group\n  Sales Rank\n  Similar Items ??\n  Categories\n  Avg Rating\n\nPS: I am not sure whether title should be kept or not since ASIN should be enough. Not req then drop it\n    I am also not sure how similar item can be encoporated in this, so i am just adding it as a list\n'

In [29]:
sim_red = similar.alias("sim_red")
reviews_red = reviews.alias("reviews_red")
cat_red = categories.alias("cat_red")

In [30]:
cat_red = cat_red.withColumn("category", functions.split(functions.regexp_replace(functions.col("category"), r"\[.*?\]", ""), "\\|"))
cat_red.show(truncate=False)

+----------+--------------------------------------------------------------------------------------------------+
|ASIN      |category                                                                                          |
+----------+--------------------------------------------------------------------------------------------------+
|0827229534|[, Books, Subjects, Religion & Spirituality, Christianity, Clergy, Preaching]                     |
|0827229534|[, Books, Subjects, Religion & Spirituality, Christianity, Clergy, Sermons]                       |
|0738700797|[, Books, Subjects, Religion & Spirituality, Earth-Based Religions, Wicca]                        |
|0738700797|[, Books, Subjects, Religion & Spirituality, Earth-Based Religions, Witchcraft]                   |
|0486287785|[, Books, Subjects, Home & Garden, Crafts & Hobbies, General]                                     |
|0842328327|[, Books, Subjects, Religion & Spirituality, Christianity, Reference, Commentaries, New Test

In [31]:
cat_red = cat_red.groupBy('ASIN').agg(functions.collect_set("category").alias("category"))
cat_red = cat_red.withColumn('category', flattenUDF(functions.col("category")))
cat_red.show(truncate=False)

+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ASIN      |category                                                                                                                                                                                                             |
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0001057170|[Humor, Entertainment, Books, Formats, General, Boxed Sets, Books on Tape, Subjects]                                                                                                                                 |
|0001474103|[Christianity, Books, Religion & Spirituality, Prophecies, Subjects, Reference] 

In [32]:
sim_red.show()

+----------+----------+
|      ASIN|   ASIN ID|
+----------+----------+
|0827229534|0804215715|
|0827229534|156101074X|
|0827229534|0687023955|
|0827229534|0687074231|
|0827229534|082721619X|
|0738700797|0738700827|
|0738700797|1567184960|
|0738700797|1567182836|
|0738700797|0738700525|
|0738700797|0738700940|
|0842328327|0842328130|
|0842328327|0830818138|
|0842328327|0842330313|
|0842328327|0842328610|
|0842328327|0842328572|
|1577943082|157794349X|
|1577943082|0892749504|
|1577943082|1577941829|
|1577943082|0892749563|
|1577943082|1577946006|
+----------+----------+
only showing top 20 rows



In [33]:
sim_red = sim_red.groupBy("ASIN").agg(functions.collect_list("ASIN ID").alias("similar_set"))
sim_red.show()
sim_red.count()

+----------+--------------------+
|      ASIN|         similar_set|
+----------+--------------------+
|0001057170|[B00006JMQR, 1585...|
|0001474103|[0895403889, 0001...|
|0002250985|[0912365307, 0060...|
|0002251965|[0002553481, 0002...|
|0004722817|[0007127596, 0004...|
|0004724534|[0007127596, 0007...|
|0006386709|[1857883195, 0060...|
|0006482848|[1400030137, 1400...|
|0006482864|[1400030080, 1400...|
|0006491790|[0865475091, 0151...|
|0006531806|[0030635543, 1891...|
|0007105509|[0743427084, 1573...|
|0007116012|[1573221112, 0743...|
|0007116993|[014219624X, 0722...|
|0007117000|[0062513605, 1577...|
|0007127952|[0399144374, 0425...|
|0007131151|[0722535724, 1567...|
|0007140444|[0285633791, 0960...|
|0007142595|[0261103881, 0007...|
|002011690X|[0307605817, 0943...|
+----------+--------------------+
only showing top 20 rows



63780

In [34]:
info_red.show()

+----------+--------------------+-----+---------+
|      ASIN|               title|group|salesrank|
+----------+--------------------+-----+---------+
|0827229534|Patterns of Preac...| Book|   396585|
|0738700797|Candlemas: Feast ...| Book|   168596|
|0486287785|World War II Alli...| Book|  1270652|
|0842328327|Life Application ...| Book|   631289|
|1577943082|Prayers That Avai...| Book|   455160|
|0486220125|How the Other Hal...| Book|   188784|
|B00000AU3R|               Batik|Music|     5392|
|0231118597| Losing Matt Shepard| Book|   277409|
|1859677800|Making Bread: The...| Book|   949166|
|0375709363|The Edward Said R...| Book|   220379|
|0871318237|Resetting the Clo...| Book|   412962|
|1590770218|Fantastic Food wi...| Book|    24741|
|0313230269|Clockwork Worlds ...| Book|  2895088|
|B00004W1W1|            Later...|Music|   390624|
|1559362022|Wake Up and Smell...| Book|   518927|
|0195110382|War at Sea: A Nav...| Book|   631564|
|0849311012|Telecommunication...| Book|   570183|


In [35]:
reviews_red.show()

+----------+--------------+------+-----+-------+
|      ASIN|   customer id|rating|votes|helpful|
+----------+--------------+------+-----+-------+
|0827229534|A2JW67OY8U6HHK|     5|   10|      9|
|0827229534|A2VE83MZF98ITY|     5|    6|      5|
|0738700797|A11NCO6YTE4BTJ|     5|    5|      4|
|0738700797| A9CQ3PLRNIR83|     4|    5|      5|
|0738700797|A13SG9ACZ9O5IM|     5|    8|      8|
|0738700797|A1BDAI6VEYMAZA|     5|    4|      4|
|0738700797|A2P6KAWXJ16234|     4|   16|     16|
|0738700797| AMACWC3M7PQFR|     4|    5|      5|
|0738700797|A3GO7UV9XX14D8|     4|    6|      6|
|0738700797|A1GIL64QK68WKL|     5|    8|      8|
|0738700797| AEOBOF2ONQJWV|     5|    8|      5|
|0738700797|A3IGHTES8ME05L|     5|    5|      5|
|0738700797|A1CP26N8RHYVVO|     1|   13|      9|
|0738700797| ANEIANH0WAT9D|     5|    1|      1|
|0486287785|A3IDGASRQAW8B2|     5|    2|      2|
|0842328327|A2591BUPXCS705|     4|    1|      1|
|0486220125| ATVPDKIKX0DER|     5|   12|     11|
|0486220125| AUEZ7NV

In [36]:
reviews_red = reviews_red.drop("customer id", "votes", "helpful")
reviews_red.show()

+----------+------+
|      ASIN|rating|
+----------+------+
|0827229534|     5|
|0827229534|     5|
|0738700797|     5|
|0738700797|     4|
|0738700797|     5|
|0738700797|     5|
|0738700797|     4|
|0738700797|     4|
|0738700797|     4|
|0738700797|     5|
|0738700797|     5|
|0738700797|     5|
|0738700797|     1|
|0738700797|     5|
|0486287785|     5|
|0842328327|     4|
|0486220125|     5|
|0486220125|     5|
|0486220125|     5|
|0486220125|     4|
+----------+------+
only showing top 20 rows



In [37]:
reviews_red = reviews_red.groupBy("ASIN").agg(functions.avg("rating").alias("avg_rating"))
reviews_red.show()

+----------+------------------+
|      ASIN|        avg_rating|
+----------+------------------+
|B00000IBNZ|               4.0|
|0425182681|3.4545454545454546|
|B00005YTRK|               5.0|
|0805044108|               5.0|
|092997929X|               5.0|
|0970430353|             4.625|
|1564145743|3.8333333333333335|
|0531164659|               5.0|
|0071346503|               4.2|
|B00000JMXK|               5.0|
|1575846578|               5.0|
|0140447881|               3.5|
|0912912065| 4.511111111111111|
|0786865059|3.8106796116504853|
|050051044X|               3.5|
|B00003CK6Y|              4.75|
|B000053VCO|               4.0|
|B00005T30X|             1.875|
|B000000PEZ|               5.0|
|B000059HUN| 4.666666666666667|
+----------+------------------+
only showing top 20 rows



In [38]:
reviews_red.count()

66991

In [39]:
item_char = reviews_red.join(info_red,on="ASIN",how="inner")
item_char.show()
item_char.count()

+----------+------------------+--------------------+-----+---------+
|      ASIN|        avg_rating|               title|group|salesrank|
+----------+------------------+--------------------+-----+---------+
|B00000IBNZ|               4.0|Clamp School Dete...|Video|    73557|
|0425182681|3.4545454545454546|    The Keys of Hell| Book|   318321|
|B00005YTRK|               5.0|           Ceol More|Music|    59782|
|0805044108|               5.0|Ev'Ry Time I Feel...| Book|    91659|
|092997929X|               5.0|Waldorf Schools: ...| Book|   688285|
|0970430353|             4.625|Illustration For ...| Book|   533414|
|1564145743|3.8333333333333335|Tell Me What to E...| Book|    31994|
|0531164659|               5.0|High School Hazin...| Book|   431993|
|0071346503|               4.2|Streetsmart Guide...| Book|   272062|
|B00000JMXK|               5.0|  I Need Your Spirit|Music|   136156|
|1575846578|               5.0|Tyler is Shy (Fis...| Book|   397646|
|0140447881|               3.5|Beo

7460

In [40]:
item_char.filter(item_char["ASIN"]== "3895780812").show()

+----------+----------+--------------------+-----+---------+
|      ASIN|avg_rating|               title|group|salesrank|
+----------+----------+--------------------+-----+---------+
|3895780812|       5.0|Computed Tomograp...| Book|   179448|
+----------+----------+--------------------+-----+---------+



In [41]:
cat_red.show()

+----------+--------------------+
|      ASIN|            category|
+----------+--------------------+
|0001057170|[Humor, Entertain...|
|0001474103|[Christianity, Bo...|
|0002193183|[Biological Scien...|
|0002250985|[Food & Wine, Bru...|
|0002251965|[U.S. Regional, F...|
|0004133897|[Arts & Photograp...|
|0004140885|[Food & Wine, Boo...|
|0004707508|[English (All), I...|
|0004722817|[Conventional, Wo...|
|0004723015|[Individual Sport...|
|0004723996|[Travel Books, Gr...|
|0004724534|[Antiques & Colle...|
|000472500X|[Arts & Photograp...|
|0005993806|[Christianity, Ch...|
|0006386709|[Business & Inves...|
|0006482864|[Dick, ( D ), Sci...|
|0006491790|[Water Sports, Su...|
|0006498779|[Mystery, Mystery...|
|0006531806|[Strategy, Politi...|
|0006907237|[Fiction, Science...|
+----------+--------------------+
only showing top 20 rows



In [42]:
item_char = item_char.join(cat_red,on="ASIN",how="inner")
item_char.show(truncate=False)
item_char.count()

+----------+-----------------+-----------------------------------------------------------------------------+-----+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ASIN      |avg_rating       |title                                                                        |group|salesrank|category                                                                                                                                                                                                                     |
+----------+-----------------+-----------------------------------------------------------------------------+-----+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

7460

In [43]:
item_char = item_char.join(sim_red,on="ASIN",how="inner")
item_char.show(truncate=False)
item_char.count()

+----------+-----------------+-----------------------------------------------------------------------+-----+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------+
|ASIN      |avg_rating       |title                                                                  |group|salesrank|category                                                                                                                                                                                                                     |similar_set                                                 |
+----------+-----------------+-----------------------------------------------------------------------+-----+---------+------------------------------------------------------------------------------

6046

###Graph Embedding

In [44]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [45]:
user_dat = user_char.drop('avg_votes','avg_helpful_votes','group_set',)
# user_dat.show(truncate=False)

In [46]:
item_dat = item_char.drop('avg_rating','group','salesrank')
# item_dat.show(truncate=False)

In [47]:
user_characteristics = user_dat.rdd.map(lambda row: (row['customer id'], row['category_set'])).collectAsMap()
print(user_characteristics)


{'A100DMMBKT8HO5': ['Humanities', 'United States', 'New & Used Textbooks', 'Aviation', 'Books', 'General', 'Specialty Stores', 'Americas', 'History', 'Military', 'Subjects'], 'A101OA1WQ56H5I': ['Alternative Rock', 'Rock', 'Alternative Metal', 'Hardcore & Punk', 'Styles', 'Hard Rock & Heavy Metal', 'Formats', 'Hard Rock & Metal', 'General', 'Alternative Styles', 'Grunge', 'Box Sets', 'Music'], 'A101VYVQR1Y70U': ['Rock', 'Alternative Metal', 'Styles', 'Hard Rock & Metal', 'General', 'Music'], 'A103U0Q3IKSXHE': ['Outdoors & Nature', 'Christianity', 'Ecology', 'Inspirational', 'Environment', 'Books', 'General', 'Religion & Spirituality', 'Spirituality', 'Subjects', 'Catholicism'], 'A104W9PL1OO5JN': ['Dan', 'Mystery', 'Authors', 'Mystery & Thrillers', 'Books', '( B )', 'General', 'Book Clubs', 'Specialty Stores', 'Brown', 'Thrillers', 'A-Z', 'Subjects'], 'A1058UNGQCR4F1': ['Latin American', 'Books', 'General', 'Literary', 'Literature & Fiction', 'Classics', 'Subjects'], 'A105B8SVTVDBHU': ['

In [48]:
item_characteristics = item_dat.rdd.map(lambda row: (row['ASIN'], row['category'])).collectAsMap()
print(item_characteristics)

{'0001474103': ['Christianity', 'Books', 'Religion & Spirituality', 'Prophecies', 'Subjects', 'Reference'], '0004724534': ['Antiques & Collectibles', 'Firearms & Weapons', 'Home & Garden', 'Books', 'General', 'Subjects', 'Reference'], '0006386709': ['Business & Investing Books', 'Management & Leadership', 'Amazon.com Stores', 'Books', 'Business & Investing', 'General', 'Management', 'Home & Office', 'Subjects'], '0023734809': ['Christianity', 'Bible & Other Sacred Texts', 'Language Studies', 'Books', 'New Testament', 'Bible', 'Religion & Spirituality', 'Subjects', 'Reference'], '0027348407': ['Literature', 'Science Fiction', 'Ages 9-12', 'Books', '& Magic', 'General', 'Mystery & Horror', "Children's Books", 'Fantasy', 'Subjects'], '0028612779': ['Parenting & Families', 'Parenting', 'Music & Videos for Parents', 'Teenagers', 'Sociology', 'Nonfiction', 'Marriage & Family', 'Books', 'Social Sciences', 'General', 'Music & More', 'Specialty Stores', 'Parenting Books', 'Subjects'], '00286172

In [49]:
reviews_red = reviews.limit(10000)

In [51]:
edge_table = reviews_red.drop("rating","votes","helpful")

In [None]:
# edge_table.show()

In [52]:
import networkx as nx

# Create a new bipartite graph
graph = nx.Graph()

# Add users as nodes in one set
graph.add_nodes_from(user_characteristics.keys(), bipartite=0)

# Add items as nodes in the other set
graph.add_nodes_from(item_characteristics.keys(), bipartite=1)

# Add edges between users and items based on shared characteristics
# for user, user_char in user_characteristics.items():
#     for item, item_char in item_characteristics.items():
#         if any(char in user_char for char in item_char):
#             graph.add_edge(user, item)
for row in edge_table.collect():
    customer_id = row['customer id']
    ASIN = row['ASIN']
    graph.add_edge(customer_id,ASIN)

In [None]:
# nx.draw(graph,with_label=True)

In [None]:
count = 0
for n,d in list(graph.nodes(data=True)):
  if len(d) == 0:
    print(n)
    count+=1

print(count)

In [53]:
import networkx as nx

# Create a new bipartite graph
graph = nx.Graph()

# Add users as nodes in one set
# graph.add_nodes_from(user_characteristics.keys(), bipartite=0)
for user, features in user_characteristics.items():
    graph.add_node(user, bipartite=0, features=features)
# Add items as nodes in the other set
# graph.add_nodes_from(item_characteristics.keys(), bipartite=1,)
for item, features in item_characteristics.items():
    graph.add_node(item, bipartite=1, features=features)
# Add edges between users and items based on shared characteristics
# for user, user_char in user_characteristics.items():
#     for item, item_char in item_characteristics.items():
#         if any(char in user_char for char in item_char):
#             graph.add_edge(user, item)
for row in edge_table.collect():
    customer_id = row['customer id']
    ASIN = row['ASIN']
    graph.add_edge(customer_id,ASIN)

# Access nodes and edges
user_nodes = {n for n, d in graph.nodes(data=True) if len(d)!=0 and d['bipartite'] == 0}
item_nodes = {n for n, d in graph.nodes(data=True) if len(d)!=0 and d['bipartite'] == 1}
edges = graph.edges()

# Perform node embedding
# Node embedding is a way to represent nodes in a low-dimensional vector space
# There are several algorithms available for node embedding, such as node2vec, DeepWalk, or GraphSAGE.
# Here's an example of using the node2vec algorithm from the `node2vec` library in Python:


# Now you have the node embeddings for users and items, which you can use for various downstream tasks.

# Example usage:
# You can calculate the similarity between two nodes based on their embeddings
# user1_embedding = user_embeddings["user1"]
# item1_embedding = item_embeddings["item1"]
# similarity = user1_embedding.dot(item1_embedding)

In [55]:
from node2vec import Node2Vec

# Create the Node2Vec object
node2vec = Node2Vec(graph, dimensions=64, walk_length=10, num_walks=20, workers=4)

# Generate the node embeddings
model = node2vec.fit(window=10, min_count=1)

# Get the embedding vectors for users and items
user_embeddings = {node: model.wv[node] for node in user_nodes}
item_embeddings = {node: model.wv[node] for node in item_nodes}


Computing transition probabilities:   0%|          | 0/14420 [00:00<?, ?it/s]

In [None]:
# nx.draw(graph,with_labels=True, font_weight='bold')

In [59]:
nodes_data = graph.nodes(data=True)

In [66]:
model.save("/content/drive/MyDrive/node2vec_for_bipartite.pkl")

In [79]:
import pickle
with open('/content/drive/MyDrive/node2vec_for_bipartite.pkl', 'rb') as f:
    embeddings = pickle.load(f)

user_check = embeddings.wv['A100DMMBKT8HO5']


In [80]:
item_check = embeddings.wv['0001474103']

In [81]:
user_check.dot(item_check)

0.10362223

In [84]:
graph.nodes['A100DMMBKT8HO5']

{'bipartite': 0,
 'features': ['Humanities',
  'United States',
  'New & Used Textbooks',
  'Aviation',
  'Books',
  'General',
  'Specialty Stores',
  'Americas',
  'History',
  'Military',
  'Subjects']}

In [85]:
graph.nodes['0001474103']

{'bipartite': 1,
 'features': ['Christianity',
  'Books',
  'Religion & Spirituality',
  'Prophecies',
  'Subjects',
  'Reference']}