In [10]:
import os

import psycopg2
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

## Spark Session configuration for both <font color="yellow">PostgreSQL DB and MongoDB</font>

In [15]:
# Location of the PostgreSQL JDBC driver jar. The notebook originally pinned a
# machine-specific absolute path; that path is kept as the fallback so nothing
# changes when POSTGRES_JDBC_JAR is unset, but other machines can override it.
POSTGRES_JDBC_JAR = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "/Users/deependrashekhawat/jars/postgresql-42.2.21.jar",
)

# Single Spark session for the whole notebook. The MongoDB connector is
# requested through spark.jars.packages, the PostgreSQL driver through
# spark.jars (a local jar file).
my_spark = (
    SparkSession.builder
    .appName("restaurant")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .config("spark.jars", POSTGRES_JDBC_JAR)
    .getOrCreate()
)

### Reading data from <font color="yellow">PostgreSQL DB</font>

In [16]:
# Connection settings are taken from the standard libpq environment variable
# names when they are set. The fallbacks are the notebook's original local
# development values, so a run without those variables connects as before.
# TODO: remove the password fallback once PGPASSWORD is supplied by the
# environment -- credentials should not be stored in the notebook.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "testrestaurant"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "Welcome@1"),
    port=int(os.environ.get("PGPORT", "5436")),
)
# Cursor used by the query cell that follows.
curr = conn.cursor()

In [67]:
# Select from the restaurantcuisine table, joining restaurants, address and
# cuisines to bring in the descriptive columns.
restaurant_cuisine_sql = """
select rs.restaurant_id, restaurant_name, street, city, state, postal_code, latitude, longitude, stars, review_count, cuisine_name
from restaurantcuisine rs
join restaurants r ON (rs.restaurant_id = r.restaurant_id)
join address a ON (r.address_id = a.address_id)
join cuisines c ON (rs.cuisine_id = c.cuisine_id)
"""
curr.execute(restaurant_cuisine_sql)

# Fetch the complete result set into driver memory.
resultCity = curr.fetchall()

In [104]:
# Column names, listed in the same order as the SELECT list of the
# PostgreSQL query that produced `resultCity`.
columns = [
    "restaurant_id",
    "restaurant_name",
    "street",
    "city",
    "state",
    "postal_code",
    "latitude",
    "longitude",
    "stars",
    "review_count",
    "cuisine_name",
]

# Only names are supplied as the schema; column types are left to Spark.
dfFromList = my_spark.createDataFrame(resultCity, columns)

In [105]:
# dfFromList.registerTempTable("restaurant")

In [103]:
# output = my_spark.sql(f"select * from restaurant where cuisine_name in ({a})")

In [236]:
# Cuisines to rank; each one becomes a key of `k`.
a = ['Thai', 'Indian', 'Mexican']

# The city restriction does not depend on the cuisine, so it is declared once
# here rather than repeated for every cuisine.
boston_restaurants = dfFromList.filter(F.col("city") == 'Boston')

# For every cuisine, keep up to 10 Boston restaurants as JSON strings, ordered
# by review_count and then stars, both descending.
k = {
    cuisine: (
        boston_restaurants
        # Plain equality: isin() with a single value is a one-element IN list.
        .filter(F.col('cuisine_name') == cuisine)
        .orderBy(['review_count', 'stars'], ascending=False)
        .toJSON()
        .take(10)
    )
    for cuisine in a
}

In [237]:
# Display the JSON strings collected under the "Indian" key of `k`.
k["Indian"]

['{"restaurant_id":56858,"restaurant_name":"India Quality Restaurant","street":"484 Commonwealth Ave","city":"Boston","state":"MA","postal_code":"02215","latitude":42.348564970000000000,"longitude":-71.094380840000000000,"stars":4.000000000000000000,"review_count":756,"cuisine_name":"Indian"}',
 '{"restaurant_id":60173,"restaurant_name":"Ristorante Fiore","street":"250 Hanover St","city":"Boston","state":"MA","postal_code":"02113","latitude":42.363560400000000000,"longitude":-71.055116000000000000,"stars":3.500000000000000000,"review_count":625,"cuisine_name":"Indian"}',
 '{"restaurant_id":61768,"restaurant_name":"Mela","street":"578 Tremont St","city":"Boston","state":"MA","postal_code":"02118","latitude":42.343297400000000000,"longitude":-71.072456600000000000,"stars":3.500000000000000000,"review_count":621,"cuisine_name":"Indian"}',
 '{"restaurant_id":60352,"restaurant_name":"Kashmir Indian Restaurant","street":"279 Newbury St, Ste 2","city":"Boston","state":"MA","postal_code":"0211

# -----------------------------

### Reading from <font color="yellow">MongoDB</font>

In [286]:
# Load hungryApp.user from the local MongoDB instance through the MongoDB
# Spark connector configured on the session.
dfUser = (
    my_spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("spark.mongodb.input.uri", "mongodb://localhost:27017/hungryApp.user")
    .load()
)

# ----------------------

In [287]:
# Print the user rows with full (untruncated) column values.
# NOTE(review): the rendered output contains names, e-mail addresses and
# contact numbers -- clear or redact it before sharing the notebook.
dfUser.show(truncate=False)

+--------------------------+---------+------------+----------+---------+---------+-----------------------+-----+
|_id                       |city     |contact     |email     |firstname|lastname |preference             |state|
+--------------------------+---------+------------+----------+---------+---------+-----------------------+-----+
|{61901edc8d57a518b36728a4}|cambridge|7.74641373E9|deep@s.com|Deep     |shekhawat|[thai, indian, mexican]|MA   |
|{619197026a85243051943d49}|boston   |7.74641373E9|neha@b.com|Neha     |Bais     |[thai, indian, mexican]|MA   |
+--------------------------+---------+------------+----------+---------+---------+-----------------------+-----+



In [369]:
# df1 = dfUser.select("_id", "state", F.initcap("city").alias("city"), F.explode("preference").alias("cuisine_name"))

In [368]:
# df2 = df1 \
#     .select("_id", "state", "city", F.initcap("cuisine_name").alias("cuisine_name")) \
#     .join(dfFromList, ["cuisine_name", "city", "state"], "inner")
# df2.show(1)

In [370]:
# df3 = df2.withColumn("jsonCol", F.to_json(F.struct([x for x in df2.columns if x not in ["_id", "cuisine_name"]]))) \
#     .select(["_id", "cuisine_name", "jsonCol"])

In [367]:
# df4 = df3.groupBy("_id", "cuisine_name").agg(F.collect_list("jsonCol").alias("jsonCol1"))

In [364]:
# df5 = df4.select("_id", F.to_json(F.create_map("cuisine_name", "jsonCol1"), options={None}).alias("Rest"))

In [365]:
# df5.show()

In [317]:
# df6 = df5.groupBy("_id").agg(F.collect_list("Rest").alias("cuisine"))

In [345]:
# df6.show()

In [346]:
# df6.write \
# .format("com.mongodb.spark.sql.DefaultSource") \
# .mode("append") \
# .option("spark.mongodb.output.uri", "mongodb://localhost:27017/hungryApp.restaurantRecommendation") \
# .save()

In [238]:
# Collect an (_id, preference) pair for every user onto the driver.
userArray1 = dfUser.rdd.map(lambda x: (x._id, x.preference)).collect()

# Print the `oid` field of each user's _id; the preference half of the pair
# is not needed for this check.
for user_id, _preference in userArray1:
    print(user_id.oid)

61901edc8d57a518b36728a4
619197026a85243051943d49


In [239]:
# Collect just the `preference` column to the driver and display the rows.
preference_column = dfUser.select("preference")
arr = preference_column.collect()
arr

[Row(preference=['thai', 'indian', 'mexican']),
 Row(preference=['thai', 'indian', 'mexican'])]

# -----------------------------

### Writing to <font color="yellow">MongoDB</font>

In [9]:
# Overwrite hungryApp.restaurantRecommendation in the local MongoDB instance
# with the contents of `df`.
# NOTE(review): `df` is not assigned in any cell visible in this notebook, and
# this cell's execution count (In [9]) is lower than the import cell's
# (In [10]); on a fresh kernel this raises NameError. Confirm which DataFrame
# is meant to be written before relying on this cell.
(
    df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("overwrite")
    .option("spark.mongodb.output.uri", "mongodb://localhost:27017/hungryApp.restaurantRecommendation")
    .save()
)