# Leveraging Sentiment Analysis to Improve Airbnb Visualization Maps

## Loading data

In [14]:
import pyspark
import pymongo
from pyspark.sql import SparkSession
# Build (or fetch) the Spark session used for the first look at the data,
# with off-heap memory switched on and sized at 10g.
spark = (
    SparkSession.builder
    .appName("Datacamp Pyspark Tutorial")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)

# Read the listings CSV (header row present, `"` used as the escape character)
# and preview the first five rows with column truncation disabled.
df = spark.read.csv('./data/listings.csv', header=True, escape='"')
df.show(n=5, truncate=False)

# Create a MongoDB client for localhost:27017 and print it.
client = pymongo.MongoClient("localhost", 27017)
print(client)


+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+
|id    |name                                                                      |host_id|host_name       |neighbourhood_group|neighbourhood                     |latitude         |longitude        |room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|license    |
+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+

In [15]:
# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Reference setup: https://github.com/RWaltersMA/mongo-spark-jupyter
# Known issue: adding master("spark://spark-master:7077") to the builder
# makes the master worker stop, so no explicit master is set here.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook2")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017")
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

# Smoke test: distribute the integers 1..99 as an RDD and print their sum.
rdd = spark.sparkContext.parallelize(range(1, 100))
print("THE SUM IS HERE: ", rdd.sum())

# Shut the session down once the check has run.
spark.stop()

THE SUM IS HERE:  4950


In [3]:
# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Reference setup: https://github.com/RWaltersMA/mongo-spark-jupyter
# Known issue: adding master("spark://spark-master:7077") to the builder
# makes the master worker stop, so no explicit master is set here.
# This is the session the remaining cells use: 1g executor memory, MongoDB
# input/output URIs, and the MongoDB Spark connector package.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook2")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017")
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)


In [4]:
# Load the listings CSV: the first row is the header and `"` is the escape
# character inside quoted fields.
listings = spark.read.options(header=True, escape='"').csv('./data/listings.csv')

# Preview five untruncated rows, then report the total row count.
listings.show(n=5, truncate=False)

print("Number of rows: ", listings.count())


+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+
|id    |name                                                                      |host_id|host_name       |neighbourhood_group|neighbourhood                     |latitude         |longitude        |room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|license    |
+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+

In [5]:
# Load the reviews CSV with the same reader settings as the listings file.
# NOTE(review): the saved output of this cell is an AnalysisException
# ("Path does not exist") -- confirm ./data/reviews.csv is present before running.
reviews = spark.read.options(header=True, escape='"').csv('./data/reviews.csv')

# Preview five untruncated rows, then report the total row count.
reviews.show(n=5, truncate=False)

print("Number of rows: ", reviews.count())


AnalysisException: Path does not exist: file:/opt/workspace/data/reviews.csv;

In [7]:
# Inner-join each listing to its reviews: listings.id matches reviews.listing_id.
listings_reviews = listings.join(
    reviews,
    on=listings["id"] == reviews["listing_id"],
    how="inner",
)

# Preview five untruncated rows, then report how many joined rows there are.
listings_reviews.show(n=5, truncate=False)

print("Number of rows: ", listings_reviews.count())


+-----+----------------------------------------------------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+----------+---------+----------+-----------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id   |name                                                            |host_id|host_name

In [8]:
# Narrow the joined frame to the listing key (listing_id), its coordinates,
# and the review text.
listings_reviews = listings_reviews.select(
    ["listing_id", "latitude", "longitude", "comments"]
)
listings_reviews.show(n=5, truncate=False)


+----------+--------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|listing_id|latitude|longitude|comments                                                                                                                                                                                                                                                                                                                                                                                                               

## Cleaning the data

## Sentiment Analysis

In [16]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLModel
)
import pyspark.sql.functions as F

# NOTE(review): the saved output of this cell is
# "TypeError: 'JavaPackage' object is not callable". Presumably the Spark NLP
# JAR is not on the session classpath (spark.jars.packages only lists the
# MongoDB connector) -- confirm before relying on this pipeline.

# Stage 1: wrap the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder producing sentence embeddings.
use = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained sentiment classifier that consumes those embeddings.
sentimentdl = (
    SentimentDLModel.pretrained("sentimentdl_use_twitter", "en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

# Ten sample tweet-style texts used to try out the sentiment pipeline; they are
# later loaded into a single-column ("text") DataFrame. The strings are kept
# verbatim, including typos and the HTML entity in the first one.
text_list = [
    """@Mbjthegreat i really dont want AT&amp;T phone service..they suck when it comes to having a signal""",
    """holy crap. I take a nap for 4 hours and Pitchfork blows up my twitter dashboard. I wish I was at Coachella.""",
    """@Susy412 he is working today  ive tried that still not working..... hmmmm!! im rubbish with computers haha!""",
    """Brand New Canon EOS 50D 15MP DSLR Camera Canon 17-85mm IS Lens ...: Web Technology Thread, Brand New Canon EOS 5.. http://u.mavrev.com/5a3t""",
    """Watching a programme about the life of Hitler, its only enhancing my geekiness of history.""",
    """GM says expects announcment on sale of Hummer soon - Reuters: WDSUGM says expects announcment on sale of Hummer .. http://bit.ly/4E1Fv""",
    """@accannis @edog1203 Great Stanford course. Thanks for making it available to the public! Really helpful and informative for starting off!""",
    """@the_real_usher LeBron is cool.  I like his personality...he has good character.""",
    """@sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol""",
    """@PDubyaD right!!! LOL we'll get there!! I have high expectations, Warren Buffet style.""",
]


TypeError: 'JavaPackage' object is not callable

In [None]:
# Fit the pipeline on a one-row placeholder frame with an empty "text" value.
# Presumably no real training data is needed because the encoder and the
# classifier are loaded via .pretrained() -- TODO confirm.
empty_df = spark.createDataFrame([['']]).toDF("text")

model = nlpPipeline.fit(empty_df)

# Build the input frame straight from the Python list as one-field rows named
# "text". The previous version went through pd.DataFrame, but pandas is never
# imported in this notebook, so the cell failed with NameError on a fresh kernel.
df = spark.createDataFrame([(text,) for text in text_list], ["text"])

# Run the fitted pipeline over the sample texts.
result = model.transform(df)