# Leveraging Sentiment Analysis to Improve Airbnb Visualization Maps

## Loading data

In [17]:
import pyspark
import pymongo
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Datacamp Pyspark Tutorial").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","10g").getOrCreate()
df = spark.read.csv('./data/listings.csv', header=True, escape="\"")
df.show(5,0)

client = pymongo.MongoClient("localhost", 27017)

print(client)


+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+
|id    |name                                                                      |host_id|host_name       |neighbourhood_group|neighbourhood                     |latitude         |longitude        |room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|license    |
+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+

In [18]:
# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# https://github.com/RWaltersMA/mongo-spark-jupyter
# master("spark://spark-master:7077") --> If you add this master to the builder the master worker stops ...  :(
# Create a SparkSession
spark = SparkSession \
    .builder \
    .appName("pyspark-notebook2") \
    .config("spark.executor.memory", "1g") \
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017") \
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
    .getOrCreate()

rdd = spark.sparkContext.parallelize(range(1, 100))

print("THE SUM IS HERE: ", rdd.sum())
# Stop the SparkSession
spark.stop()

THE SUM IS HERE:  4950


In [22]:
# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# https://github.com/RWaltersMA/mongo-spark-jupyter
# master("spark://spark-master:7077") --> If you add this master to the builder the master worker stops ...  :(
# Create a SparkSession
spark = SparkSession \
    .builder \
    .appName("pyspark-notebook2") \
    .config("spark.executor.memory", "1g") \
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017") \
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
    .getOrCreate()


In [33]:
listings = spark.read.csv('./data/listings.csv', header=True, escape="\"")
listings.show(5,0)

print("Number of rows: ", listings.count())

+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+
|id    |name                                                                      |host_id|host_name       |neighbourhood_group|neighbourhood                     |latitude         |longitude        |room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|license    |
+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+

In [34]:
reviews = spark.read.csv('./data/reviews.csv', header=True, escape="\"")
reviews.show(5,0)

print("Number of rows: ", reviews.count())

+----------------------+--------------------+----------+-----------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|listing_id            |id                  |date      |reviewer_id|reviewer_name|comments                                                                                                                                       |
+----------------------+--------------------+----------+-----------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|18674                 |4808211             |2013-05-27|4841196    |Caron        |Great location. Clean, spacious flat. Would recommend to anyone.                                                                               |
|18674                 |10660311            |2014-03-02|11600277   |Juan Carlos  |Mi mejor r

In [36]:
# join listings and reviews on the id and listing_id columns
listings_reviews = listings.join(reviews, listings.id == reviews.listing_id)
listings_reviews.show(5,0)

print("Number of rows: ", listings_reviews.count())

+-----+----------------------------------------------------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+----------+---------+----------+-----------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id   |name                                                            |host_id|host_name

In [38]:
# keep only id, latitude, longitude, and comments
listings_reviews = listings_reviews.select("listing_id", "latitude", "longitude", "comments")
listings_reviews.show(5,0)

+----------+--------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|listing_id|latitude|longitude|comments                                                                                                                                                                                                                                                                                                                                                                                                               

## Cleaning the data

## Sentiment Analysis