In [126]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, count, avg, lit, countDistinct, expr, row_number
from pyspark.sql.types import StringType, DoubleType
from textblob import TextBlob
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.window import Window

In [2]:
# Initialize a Spark session
spark = SparkSession.builder.appName("FinalTask")\
    .config('spark.driver.extraClassPath','/usr/lib/jvm/java-11-openjdk-amd64/lib/postgresql-42.5.0.jar')\
        .getOrCreate()

23/09/11 20:21:44 WARN Utils: Your hostname, kushal-Latitude-E5440 resolves to a loopback address: 127.0.1.1; using 192.168.1.14 instead (on interface wlp2s0)
23/09/11 20:21:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/11 20:21:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
listing_df = spark.read.format('jdbc').options(url='jdbc:postgresql://localhost:5432/Final',driver = 'org.postgresql.Driver', dbtable = 'listing', user='postgres',password='kushal2psg').load()
calendar_df = spark.read.format('jdbc').options(url='jdbc:postgresql://localhost:5432/Final',driver = 'org.postgresql.Driver', dbtable = 'calendar', user='postgres',password='kushal2psg').load()
reviews_df = spark.read.format('jdbc').options(url='jdbc:postgresql://localhost:5432/Final',driver = 'org.postgresql.Driver', dbtable = 'reviews', user='postgres',password='kushal2psg').load()

In [4]:
listing_df.show()

                                                                                

+--------+--------------------+--------------------+--------------------+-----------+--------------------+------------------+------------------+--------------------+-----------------+----------+--------------------+----------+-----+-------+-------------+---------------+------------+---------+--------+-----+-----------------+
|      id|         listing_url|                name|               space|  host_name|       host_location|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_since|              street|      city|state|zipcode|property_type|      room_type|accommodates|bathrooms|bedrooms|price|number_of_reviews|
+--------+--------------------+--------------------+--------------------+-----------+--------------------+------------------+------------------+--------------------+-----------------+----------+--------------------+----------+-----+-------+-------------+---------------+------------+---------+--------+-----+-----------------+
|12147973|https://w

In [5]:
calendar_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+----------+----------+---------+-----+
|listing_id|      date|available|price|
+----------+----------+---------+-----+
|  14857569|2017-07-30|    false| null|
|  14857569|2017-07-29|    false| null|
|  14857569|2017-07-28|    false| null|
|  14857569|2017-07-27|    false| null|
|  14857569|2017-07-26|    false| null|
|  14857569|2017-07-25|    false| null|
|  14857569|2017-07-24|    false| null|
|  14857569|2017-07-23|    false| null|
|  14857569|2017-07-22|    false| null|
|  14857569|2017-07-21|    false| null|
|  14857569|2017-07-20|    false| null|
|  14857569|2017-07-19|    false| null|
|  14857569|2017-07-18|    false| null|
|  14857569|2017-07-17|    false| null|
|  14857569|2017-07-16|    false| null|
|  14857569|2017-07-15|    false| null|
|  14857569|2017-07-14|    false| null|
|  14857569|2017-07-13|    false| null|
|  14857569|2017-07-12|    false| null|
|  14857569|2017-07-11|    false| null|
+----------+----------+---------+-----+
only showing top 20 rows



                                                                                

In [6]:
reviews_df.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [7]:
reviews_df.show()

+----------+--------+----------+-----------+-------------+--------------------+
|listing_id|      id|      date|reviewer_id|reviewer_name|            comments|
+----------+--------+----------+-----------+-------------+--------------------+
|   8595650|99096335|2016-09-03|   74253712|        Peter|The area the apar...|
|  13449049|79825811|2016-06-14|   10383854|          Ali|The host canceled...|
|  13449049|80893606|2016-06-20|    2766707|        Chris|Lilly is a great ...|
|  13449049|81472725|2016-06-23|   29910089|         Adam|Lily was extremel...|
|   6465075|33541745|2015-05-29|   15813117|       Lauren|Caitlin's apartme...|
|   6465075|33953200|2015-06-02|   33202230|       Hyobin|Catlin was very h...|
|   6465075|34086562|2015-06-04|   16569306|        Bryan|A++++++ host. May...|
|   6465075|35484176|2015-06-19|    5830321|  Joshua Amor|A wonderful and c...|
|   6465075|36209339|2015-06-26|   34825134|      Carolyn|Beautiful condo -...|
|   6465075|45371507|2015-09-02|   33921

                                                                                

## Task 1 Get Distinct Street names and it's best listing based on analysis of price, reviews and booking

In [8]:
#firstly lets calculate reviews rating by using sentiment analysis
# Define a function to perform sentiment analysis using TextBlob
def analyze_sentiment(comment):
    if comment is not None and isinstance(comment, str):
        analysis = TextBlob(comment)
        # Classify sentiment as positive, neutral, or negative based on polarity
        if analysis.sentiment.polarity > 0:
            return "positive"
        elif analysis.sentiment.polarity == 0:
            return "neutral"
        else:
            return "negative"
    else:
        return None

In [9]:
# Register the UDF
sentiment_analysis_udf = udf(analyze_sentiment, StringType())

In [10]:
# analyze the sentiment of review and keep it under sentiment column in a new df
reviews_sentiment = reviews_df.withColumn("sentiment", sentiment_analysis_udf(col("comments")))

In [11]:
reviews_sentiment.show()

[Stage 3:>                                                          (0 + 1) / 1]

+----------+--------+----------+-----------+-------------+--------------------+---------+
|listing_id|      id|      date|reviewer_id|reviewer_name|            comments|sentiment|
+----------+--------+----------+-----------+-------------+--------------------+---------+
|   8595650|99096335|2016-09-03|   74253712|        Peter|The area the apar...| positive|
|  13449049|79825811|2016-06-14|   10383854|          Ali|The host canceled...|  neutral|
|  13449049|80893606|2016-06-20|    2766707|        Chris|Lilly is a great ...| positive|
|  13449049|81472725|2016-06-23|   29910089|         Adam|Lily was extremel...| positive|
|   6465075|33541745|2015-05-29|   15813117|       Lauren|Caitlin's apartme...| positive|
|   6465075|33953200|2015-06-02|   33202230|       Hyobin|Catlin was very h...| positive|
|   6465075|34086562|2015-06-04|   16569306|        Bryan|A++++++ host. May...| positive|
|   6465075|35484176|2015-06-19|    5830321|  Joshua Amor|A wonderful and c...| positive|
|   646507

                                                                                

In [15]:
reviews_sentiment1 = reviews_sentiment.groupBy("listing_id")\
    .agg(count(col("sentiment")=="positive"). alias("total_positive_reviews"), count("*").alias("total_reviews_count"))

In [19]:
final_review = reviews_sentiment1\
    .withColumn("Review_rate", col("total_positive_reviews")/col("total_reviews_count"))

In [112]:
final_review = final_review.drop('total_positive_reviews', 'total_reviews_count')

In [113]:
final_review.where(col("Review_rate") < 0.95).show()

[Stage 104:>                                                        (0 + 1) / 1]

+----------+------------------+
|listing_id|       Review_rate|
+----------+------------------+
|  12655758|0.8888888888888888|
|  10705351|0.9333333333333333|
|   2077794|0.9285714285714286|
|  14300751|0.6666666666666666|
|  13421289|0.9230769230769231|
|  13592046|0.8888888888888888|
|  13490438|               0.5|
|   8490356|0.8888888888888888|
|   8789821|0.8571428571428571|
|  12540640|              0.75|
|  13081347|0.6666666666666666|
|  14043390|              0.75|
|  12103501|              0.75|
|   8200839|0.9473684210526315|
+----------+------------------+



                                                                                

In [53]:
calendar_1 = calendar_df.groupBy("listing_id")\
    .agg(count(expr("CASE WHEN available = False THEN 1 ELSE NULL END")).alias("booked_days"))

In [57]:
calendar_1.show()

[Stage 70:>                                                         (0 + 1) / 1]

+----------+-----------+
|listing_id|booked_days|
+----------+-----------+
|   3068453|         66|
|   6911945|        363|
|  13397201|        331|
|   8165898|        186|
|  10036037|        248|
|   9410831|        365|
|   8036024|        296|
|   7825327|         30|
|  14219033|        210|
|  13151270|          0|
|   2754975|        111|
|  14918869|          0|
|   8303267|         32|
|  13251243|        158|
|   2513870|        365|
|  13602808|        310|
|   8211468|         55|
|   4149974|         51|
|  14322091|        352|
|   9238963|        365|
+----------+-----------+
only showing top 20 rows



                                                                                

In [61]:
vector_assembler = VectorAssembler(inputCols=["booked_days"], outputCol="booked_days_vec")
calendar_1 = vector_assembler.transform(calendar_1)

In [62]:
scaler = MinMaxScaler(inputCol="booked_days_vec", outputCol="normalized_booked_days")

scaler_model = scaler.fit(calendar_1)
normalized_calendar = scaler_model.transform(calendar_1)

                                                                                

In [69]:
normalized_calendar.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- booked_days: long (nullable = false)
 |-- booked_days_vec: vector (nullable = true)
 |-- normalized_booked_days: vector (nullable = true)



In [75]:
unlist = udf(lambda x: float(list(x)[0]), DoubleType())

In [76]:
normalized_calendar = normalized_calendar.withColumn("nbd_new", unlist("normalized_booked_days"))

In [79]:
normalized_calendar.printSchema()

root
 |-- listing_id: integer (nullable = true)
 |-- booked_days: long (nullable = false)
 |-- booked_days_vec: vector (nullable = true)
 |-- normalized_booked_days: vector (nullable = true)
 |-- nbd_new: double (nullable = true)



In [86]:
final_calendar = normalized_calendar.select("listing_id", col("nbd_new").alias("booked_days"))

In [87]:
final_calendar.show()

[Stage 90:>                                                         (0 + 1) / 1]

+----------+--------------------+
|listing_id|         booked_days|
+----------+--------------------+
|   3068453| 0.11578947368421053|
|   6911945|  0.6368421052631579|
|  13397201|  0.5807017543859649|
|   8165898|  0.3263157894736842|
|  10036037| 0.43508771929824563|
|   9410831|  0.6403508771929824|
|   8036024|  0.5192982456140351|
|   7825327|0.052631578947368425|
|  14219033|  0.3684210526315789|
|  13151270|                 0.0|
|   2754975| 0.19473684210526315|
|  14918869|                 0.0|
|   8303267|0.056140350877192984|
|  13251243|  0.2771929824561404|
|   2513870|  0.6403508771929824|
|  13602808|   0.543859649122807|
|   8211468| 0.09649122807017545|
|   4149974| 0.08947368421052632|
|  14322091|  0.6175438596491228|
|   9238963|  0.6403508771929824|
+----------+--------------------+
only showing top 20 rows



                                                                                

In [94]:
listing1 = listing_df.select("id", "name", "street", "property_type", "price")

In [97]:
listing1 = listing1.withColumn("price", col("price").cast("int"))

In [98]:
listing1.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- street: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- price: integer (nullable = true)



In [99]:
vector_assembler1 = VectorAssembler(inputCols=["price"], outputCol="price_vec")
listing1 = vector_assembler1.transform(listing1)

In [101]:
scaler1 = MinMaxScaler(inputCol="price_vec", outputCol="lux_price_vec")

scaler_model1 = scaler1.fit(listing1)
normalized_listing = scaler_model1.transform(listing1)

In [106]:
normalized_listing = normalized_listing.withColumn("lux_price", unlist("lux_price_vec"))

In [108]:
normalized_listing = normalized_listing.drop('price_vec', 'lux_price_vec')

In [110]:
final_listing = normalized_listing.withColumn("bud_price", 1-col("lux_price"))

In [111]:
final_listing.show()

[Stage 103:>                                                        (0 + 1) / 1]

+--------+--------------------+--------------------+-------------+-----+--------------------+------------------+
|      id|                name|              street|property_type|price|           lux_price|         bud_price|
+--------+--------------------+--------------------+-------------+-----+--------------------+------------------+
|12147973|Sunny Bungalow in...|Birch Street Bost...|        House|25000| 0.05778894472361809|0.9422110552763819|
| 3075044|Charming room in ...|Pinehurst Street ...|    Apartment| 6500|0.011306532663316583|0.9886934673366834|
|    6976|Mexican Folk Art ...|Ardale St. Boston...|    Apartment| 6500|0.011306532663316583|0.9886934673366834|
| 1436513|Spacious Sunny Be...|Boston MA United ...|        House| 7500|0.013819095477386936|0.9861809045226131|
| 7651065| Come Home to Boston|Durnell Avenue Bo...|        House| 7900|0.014824120603015077|0.9851758793969849|
|12386020|Private Bedroom +...|Walter Street Bos...|  Condominium| 7500|0.013819095477386936|0.9

                                                                                

In [114]:
ini_join = final_review.join(final_calendar, "listing_id")

In [116]:
final_join = final_listing.join(ini_join, final_listing["id"] == ini_join["listing_id"])

In [118]:
final_join = final_join.drop('listing_id')

In [119]:
final_join.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- street: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- lux_price: double (nullable = true)
 |-- bud_price: double (nullable = true)
 |-- Review_rate: double (nullable = true)
 |-- booked_days: double (nullable = true)



In [136]:
final_join_lux = final_join.withColumn("Overal_score", col("lux_price")+col("Review_rate")+col("booked_days"))

In [137]:
final_join_lux1 = final_join_lux.drop('lux_price', 'bud_price', 'Review_rate', 'booked_days')

In [138]:
final_join_lux1.show()

[Stage 177:>                                                        (0 + 1) / 1]

+--------+--------------------+--------------------+-------------+------+------------------+
|      id|                name|              street|property_type| price|      Overal_score|
+--------+--------------------+--------------------+-------------+------+------------------+
|14219033|Updated large bed...|Jacob Street Bost...|    Townhouse|  5900|1.3782200476064534|
|10036037|New Luxury 2BR-Ap...|Boylston Street B...|    Apartment| 19900|1.4800625936701048|
| 6911945|Lovely clean 1B h...|Bowen Street Bost...|  Condominium| 17000|1.6745305474742131|
| 7825327|Brown Stone Apt #...|Tremont Street Bo...|    Apartment| 35000|  1.13554615181169|
| 2754975|Perfect Location ...|Beacon Street Bos...|    Apartment| 25000|1.2525257868288813|
| 3068453|Large Studio in T...|Tremont Street Bo...|    Apartment| 13300|1.1441814334832054|
| 8036024|Upscale Faneuil H...|North Washington ...|  Condominium| 15000|1.5519615621969498|
| 3873776|Large Sunny Beaco...|Charles Street Bo...|    Apartment| 325

                                                                                

In [139]:
windowSpec = Window.partitionBy("street").orderBy(col("Overal_score").desc())

In [140]:
final_join_lux1.withColumn("row_number", row_number().over(windowSpec)).show()

[Stage 186:>                                                        (0 + 1) / 1]

+--------+--------------------+--------------------+-------------+-----+------------------+----------+
|      id|                name|              street|property_type|price|      Overal_score|row_number|
+--------+--------------------+--------------------+-------------+-----+------------------+----------+
| 8454985|Modern Navy Yard ...|13th St Charlesto...|    Apartment|22500|1.4111566604954597|         1|
| 2167993|Boston Waterfront...|13th Street Bosto...|    Apartment|14900|1.0955699550383495|         1|
|12915510|It's always more ...|13th Street Bosto...|         Boat|22900|1.0525125628140704|         2|
| 5684598|Navy Yard Condo o...|1st Avenue Boston...|    Apartment|19900| 1.250238032266596|         1|
| 5481243|Private room in B...|1st Avenue Charle...|  Condominium|15000|1.1431896323723882|         1|
| 4532603|Heart of Boston -...|A Garden St Bosto...|    Apartment|22900| 1.170056422463193|         1|
|14536322|Victorian Garden ...|A Mount Auburn St...|        House| 8500|1

                                                                                