# Leveraging Sentiment Analysis to Improve Airbnb Visualization Maps

## Loading data

In [7]:
import pyspark
import pymongo
from pyspark.sql import SparkSession
# Smoke test: start a Spark session, preview the listings CSV and check that a
# MongoDB client can be constructed.
spark = (
    SparkSession.builder.appName("Datacamp Pyspark Tutorial")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)
try:
    # '"' is used as the escape character for quotes inside quoted fields.
    df = spark.read.csv('./data/listings.csv', header=True, escape="\"")
    df.show(5, 0)  # truncate=0 prints the full cell contents

    # The context manager closes the client once the check is done, so the
    # connection is not left open for the rest of the kernel's lifetime.
    with pymongo.MongoClient("localhost", 27017) as client:
        print(client)
finally:
    # Stop the session even if the read or preview fails; otherwise a later
    # getOrCreate() would silently reuse this session and its configuration.
    spark.stop()


RuntimeError: Java gateway process exited before sending its port number

In [2]:
# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# https://github.com/RWaltersMA/mongo-spark-jupyter
# NOTE: adding .master("spark://spark-master:7077") to the builder makes the master/worker stop, so no master is set here.
# Create a SparkSession
spark = (
    SparkSession.builder
    .appName("pyspark-notebook2")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017")
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

try:
    # Sanity check that jobs actually run: the integers 1..99 sum to 4950.
    rdd = spark.sparkContext.parallelize(range(1, 100))

    print("THE SUM IS HERE: ", rdd.sum())
finally:
    # Stop the SparkSession even when the job above fails, so a later
    # getOrCreate() does not pick up this session by accident.
    spark.stop()


                                                                                

THE SUM IS HERE:  4950


In [3]:
# NOTE: this whole cell is a disabled draft of a combined MongoDB + Spark NLP session.
# NOTE(review): "spark.jars.packages" is configured twice below. Builder options are keyed
# by name, so the later Spark NLP value would replace the MongoDB connector coordinate.
# If this draft is revived, pass both coordinates in one comma-separated value instead.
# # Import the necessary modules
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import *

# # https://github.com/RWaltersMA/mongo-spark-jupyter
# # master("spark://spark-master:7077") --> If you add this master to the builder the master worker stops ...  :(
# # Create a SparkSession
# spark = SparkSession \
#     .builder \
#     .appName("Spark NLP") \
#     .config("spark.executor.memory", "1g") \
#     .config("spark.mongodb.input.uri", "mongodb://mongo:27017") \
#     .config("spark.mongodb.output.uri", "mongodb://mongo:27017") \
#     .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
#     .master("local[*]") \
#     .config("spark.driver.memory","16G") \
#     .config("spark.driver.maxResultSize", "0") \
#     .config("spark.kryoserializer.buffer.max", "2000M") \
#     .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3") \
#     .getOrCreate()


In [8]:
import platform

from pyspark.sql import SparkSession

SPARK_NLP_VERSION = "5.1.3"


def _spark_nlp_package(version=SPARK_NLP_VERSION):
    """Return the Spark NLP Maven coordinate that matches this machine.

    The default ``spark-nlp`` artifact does not load its TensorFlow native
    library on ARM hosts (the failure shows up as ``UnsatisfiedLinkError: no
    jnitensorflow``). Spark NLP publishes separate ARM artifacts, so pick the
    one that fits the current CPU architecture and operating system.

    Args:
        version: Spark NLP release to request.

    Returns:
        A ``group:artifact_2.12:version`` string for ``spark.jars.packages``.
    """
    machine = platform.machine().lower()
    if machine in ("aarch64", "arm64"):
        if platform.system() == "Darwin":
            artifact = "spark-nlp-silicon"  # Apple Silicon build
        else:
            artifact = "spark-nlp-aarch64"  # Linux on ARM64 (e.g. ARM Docker images)
    else:
        artifact = "spark-nlp"
    return "com.johnsnowlabs.nlp:{}_2.12:{}".format(artifact, version)


# NOTE(review): driver memory and spark.jars.packages appear to be read when the
# JVM is launched. If an earlier cell already started Spark in this kernel,
# restart the kernel before running this cell - TODO confirm.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", _spark_nlp_package())
    .getOrCreate()
)


RuntimeError: Java gateway process exited before sending its port number

In [2]:
# Read the listings export (first row is the header, '"' escapes quotes inside
# quoted fields), preview five untruncated rows and report the row count.
listings = (
    spark.read
    .options(header=True, escape="\"")
    .csv('./data/listings.csv')
)
listings.show(n=5, truncate=False)

print("Number of rows: ", listings.count())


+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+
|id    |name                                                                      |host_id|host_name       |neighbourhood_group|neighbourhood                     |latitude         |longitude        |room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|license    |
+------+--------------------------------------------------------------------------+-------+----------------+-------------------+----------------------------------+-----------------+-----------------+---------------+-----+--------------+-----------------+-----------+

In [3]:
# Read the reviews export. The "comments" column is free text, so a quoted value
# may contain line breaks; multiLine=True keeps such a value inside one record
# instead of starting a new row at every newline.
# NOTE(review): the very wide listing_id / id columns in the earlier preview
# suggest continuation lines were being parsed as rows - re-check the row count.
reviews = spark.read.csv(
    './data/reviews.csv',
    header=True,
    escape="\"",
    multiLine=True,
)
reviews.show(5, 0)  # truncate=0 prints the full comment text

print("Number of rows: ", reviews.count())


+----------------------+--------------------+----------+-----------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|listing_id            |id                  |date      |reviewer_id|reviewer_name|comments                                                                                                                                       |
+----------------------+--------------------+----------+-----------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|18674                 |4808211             |2013-05-27|4841196    |Caron        |Great location. Clean, spacious flat. Would recommend to anyone.                                                                               |
|18674                 |10660311            |2014-03-02|11600277   |Juan Carlos  |Mi mejor r

In [4]:
# Attach every review to its listing: inner join where listings.id matches
# reviews.listing_id, then preview the result and report its size.
same_listing = listings["id"] == reviews["listing_id"]
listings_reviews = listings.join(reviews, on=same_listing, how="inner")
listings_reviews.show(n=5, truncate=False)

print("Number of rows: ", listings_reviews.count())


+-----+----------------------------------------------------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+-----------+----------+---------+----------+-----------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id   |name                                                            |host_id|host_name

In [5]:
# Narrow the joined frame to the listing key, its coordinates and the review text.
columns_to_keep = ["listing_id", "latitude", "longitude", "comments"]
listings_reviews = listings_reviews.select(*columns_to_keep)
listings_reviews.show(n=5, truncate=False)


+----------+--------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|listing_id|latitude|longitude|comments                                                                                                                                                                                                                                                                                                                                                                                                               

## Cleaning the data

## Sentiment Analysis

In [6]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline, LightPipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLModel
)
import pyspark.sql.functions as F

# Stage 1: turn the raw "text" column into Spark NLP "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder producing sentence embeddings.
use = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained sentiment classifier that consumes those embeddings.
sentimentdl = (
    SentimentDLModel.pretrained("sentimentdl_use_twitter", "en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in the order their inputs and outputs depend on each other.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

# Sample tweets used to try the pipeline out.
text_list = [
    """@Mbjthegreat i really dont want AT&amp;T phone service..they suck when it comes to having a signal""",
    """holy crap. I take a nap for 4 hours and Pitchfork blows up my twitter dashboard. I wish I was at Coachella.""",
    """@Susy412 he is working today  ive tried that still not working..... hmmmm!! im rubbish with computers haha!""",
    """Brand New Canon EOS 50D 15MP DSLR Camera Canon 17-85mm IS Lens ...: Web Technology Thread, Brand New Canon EOS 5.. http://u.mavrev.com/5a3t""",
    """Watching a programme about the life of Hitler, its only enhancing my geekiness of history.""",
    """GM says expects announcment on sale of Hummer soon - Reuters: WDSUGM says expects announcment on sale of Hummer .. http://bit.ly/4E1Fv""",
    """@accannis @edog1203 Great Stanford course. Thanks for making it available to the public! Really helpful and informative for starting off!""",
    """@the_real_usher LeBron is cool.  I like his personality...he has good character.""",
    """@sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol""",
    """@PDubyaD right!!! LOL we'll get there!! I have high expectations, Warren Buffet style.""",
]


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ — ]
An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.UnsatisfiedLinkError: no jnitensorflow in java.library.path: /usr/java/packages/lib:/usr/lib/aarch64-linux-gnu/jni:/lib/aarch64-linux-gnu:/usr/lib/aarch64-linux-gnu:/usr/lib/jni:/lib:/usr/lib
	at java.base/java.lang.ClassLoader.loadLibrary(ClassLoader.java:2434)
	at java.base/java.lang.Runtime.loadLibrary0(Runtime.java:818)
	at java.base/java.lang.System.loadLibrary(System.java:1989)
	at org.bytedeco.javacpp.Loader.loadLibrary(Loader.java:1738)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1345)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1157)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1133)
	at org.tensorflow.internal.c_api.global.tensorflow.<clinit>(tensorflow.java:12)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.UnsatisfiedLinkError: no jnitensorflow in java.library.path: /usr/java/packages/lib:/usr/lib/aarch64-linux-gnu/jni:/lib/aarch64-linux-gnu:/usr/lib/aarch64-linux-gnu:/usr/lib/jni:/lib:/usr/lib
	at java.base/java.lang.ClassLoader.loadLibrary(ClassLoader.java:2434)
	at java.base/java.lang.Runtime.loadLibrary0(Runtime.java:818)
	at java.base/java.lang.System.loadLibrary(System.java:1989)
	at org.bytedeco.javacpp.Loader.loadLibrary(Loader.java:1738)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1345)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1157)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1133)
	at org.tensorflow.internal.c_api.global.tensorflow.<clinit>(tensorflow.java:12)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName(Class.java:467)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1212)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1157)
	at org.bytedeco.javacpp.Loader.load(Loader.java:1149)
	at org.tensorflow.NativeLibrary.load(NativeLibrary.java:64)
	at org.tensorflow.TensorFlow.<clinit>(TensorFlow.java:156)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName(Class.java:375)
	at org.tensorflow.Graph.<clinit>(Graph.java:1341)
	at com.johnsnowlabs.ml.tensorflow.TensorflowWrapper$.readGraph(TensorflowWrapper.scala:415)
	at com.johnsnowlabs.ml.tensorflow.TensorflowWrapper$.unpackWithoutBundle(TensorflowWrapper.scala:330)
	at com.johnsnowlabs.ml.tensorflow.TensorflowWrapper$.readWithSP(TensorflowWrapper.scala:542)
	at com.johnsnowlabs.ml.tensorflow.ReadTensorflowModel.readTensorflowWithSPModel(TensorflowSerializeModel.scala:195)
	at com.johnsnowlabs.ml.tensorflow.ReadTensorflowModel.readTensorflowWithSPModel$(TensorflowSerializeModel.scala:162)
	at com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder$.readTensorflowWithSPModel(UniversalSentenceEncoder.scala:380)
	at com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel.readModel(UniversalSentenceEncoder.scala:332)
	at com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel.readModel$(UniversalSentenceEncoder.scala:329)
	at com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder$.readModel(UniversalSentenceEncoder.scala:380)
	at com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel.$anonfun$$init$$1(UniversalSentenceEncoder.scala:336)
	at com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel.$anonfun$$init$$1$adapted(UniversalSentenceEncoder.scala:336)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable.$anonfun$onRead$1(ParamsAndFeaturesReadable.scala:50)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable.$anonfun$onRead$1$adapted(ParamsAndFeaturesReadable.scala:49)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable.onRead(ParamsAndFeaturesReadable.scala:49)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable.$anonfun$read$1(ParamsAndFeaturesReadable.scala:61)
	at com.johnsnowlabs.nlp.ParamsAndFeaturesReadable.$anonfun$read$1$adapted(ParamsAndFeaturesReadable.scala:61)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:38)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:24)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:518)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:510)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadModel(ResourceDownloader.scala:709)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel(ResourceDownloader.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.UnsatisfiedLinkError: Could not find jnitensorflow in class, module, and library paths.
	at org.bytedeco.javacpp.Loader.loadLibrary(Loader.java:1705)
	... 51 more


In [None]:
# Fit the pipeline on a one-row placeholder frame; this yields a model object
# whose transform() can then be applied to the real input.
empty_df = spark.createDataFrame([['']]).toDF("text")

model = nlpPipeline.fit(empty_df)

# Build the input frame straight from the Python list. The previous version went
# through pandas, but `pd` is never imported in this notebook, so it raised
# NameError. The explicit schema also keeps this working for an empty text_list.
df = spark.createDataFrame([(text,) for text in text_list], schema="text string")
result = model.transform(df)