### Sentiment Analysis of Johnson & Johnson's Janssen vaccine tweets US

**Original Author:** Elena Stamatelou.<br/>
**Additional Info:** Sentiment analysis on streaming twitter data using Spark Structured Streaming & Python. https://github.com/stamatelou/twitter_sentiment_analysis<br/>
**Last Modified:**  

In [1]:
# Import the os module 
import os

# Set the PYSPARK_SUBMIT_ARGS to the appropriate spark-sql-kafka package
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 pyspark-shell'

In [2]:
# Install textblob for the sentiment analysis
import sys
!{sys.executable} -m pip install -U textblob

Collecting textblob
  Using cached textblob-0.15.3-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [3]:
# Import Sparksession and sql functions 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob


In [4]:
# text classification

# Define methods from TextBlob
def polarity_detection(text):
    return TextBlob(text).sentiment.polarity

def subjectivity_detection(text):
    return TextBlob(text).sentiment.subjectivity

def sentiment_detection(value):
    if value < 0: 
        return 'Negative'
    elif value > 0: 
        return 'Positive'
    else:
        return 'Neutral'

# polarity detection
# Define as user defined fuction to embed method in the spark environment 
polarity_detection_udf = udf(polarity_detection, StringType())

# subjectivity detection
# Define as user defined fuction to embed method in the spark environment 
subjectivity_detection_udf = udf(subjectivity_detection, StringType())

# sentiment detection
# Define as user defined fuction to embed method in the spark environment 
sentiment_detection_udf = udf(sentiment_detection, StringType())

In [5]:
# Import the findspark module 
import findspark

# Initialize via the full spark path
findspark.init("/usr/local/spark/")

In [6]:
# create Spark session
spark = SparkSession.builder \
   .master("local[*]") \
   .appName("TwitterSentAnalysis") \
   .config("spark.executor.memory", "1gb") \
   .getOrCreate()

In [7]:
try: 
    # Read Tweets from the Kafka topic vaccine 
    tweet_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "127.0.0.1:9092") \
        .option("subscribe", "johnsonandjohnson_US") \
        .option("startingOffsets", "latest") \
        .load()
except: 
    print("Unexpected error:", sys.exc_info()[0])

In [8]:
try: 
    # Cast the data into a json
    tweet_df_string = tweet_df.selectExpr("CAST(value AS STRING) as json_data")
    
    # extract the tweet and user info
    text_user = tweet_df_string.select(json_tuple('json_data', 'created_at','text', 'user').alias('created_at', 'text', 'json_user')) 
    
    # extract screen_name and location from user info
    text_user_info = text_user.select('text', 'created_at', json_tuple('json_user', 'location').alias('location')) 
    
    # preprocessing
    text_user_info = text_user_info.na.replace('', 'None')
    text_user_info = text_user_info.na.drop()
    
    text_user_info = text_user_info.withColumn('text', F.regexp_replace('text', r'http\S+', ''))
    text_user_info = text_user_info.withColumn('text', F.regexp_replace('text', '@\w+', ''))
    text_user_info = text_user_info.withColumn('text', F.regexp_replace('text', '#', ''))
    text_user_info = text_user_info.withColumn('text', F.regexp_replace('text', 'RT', ''))
    text_user_info = text_user_info.withColumn('text', F.regexp_replace('text', ':', ''))

    # polarity detection
    # Append polarity to dataframe
    text_user_info = text_user_info.withColumn("polarity", polarity_detection_udf(text_user_info.text))
    
    # subjectivity detection
    # Append subjectivity to dataframe 
    text_user_info = text_user_info.withColumn("subjectivity", subjectivity_detection_udf(text_user_info.text))
    
    #sentiment detection
    # Append sentiment to dataframe
    text_user_info = text_user_info.withColumn("sentiment", sentiment_detection_udf(text_user_info.polarity))
    
    # repartition 'Returns a new DataFrame partitioned by the given partitioning expressions. The resulting DataFrame is hash partitioned.'
    text_user_info = text_user_info.repartition(1)
    
    # country filter 
    text_user_info = text_user_info.filter("location == 'USA' OR location LIKE 'USA %' OR location LIKE '% USA' OR location LIKE '% USA %' \
                  OR location = 'US' OR location LIKE 'US %' OR location LIKE '% US' OR location LIKE '% US %' \
                  OR location = 'U.S.' OR location LIKE 'U.S. %' OR location LIKE '% U.S.' OR location LIKE '% U.S. %' \
                  OR location = 'U.S.A.' OR location LIKE 'U.S.A. %' OR location LIKE '% U.S.A.' OR location LIKE '% U.S.A. %' \
                  OR location = 'United States' OR location LIKE 'United States %' OR location LIKE '% United States' OR location LIKE '% United States %' \
                  OR location = 'United States of America' OR location LIKE 'United States of America %' OR location LIKE '% United States of America' OR location LIKE '% United States of America %' \
                  OR location = 'united states' OR location LIKE 'united states %' OR location LIKE '% united states' OR location LIKE '% united states %' \
                  OR location = 'united states of america' OR location LIKE 'united states of america %' OR location LIKE '% united states of america' OR location LIKE '% united states of america %' \
                  OR location = 'AL' OR location LIKE 'AL %' OR location LIKE '% AL' OR location LIKE '% AL %' \
                  OR location = 'Alabama' OR location LIKE 'Alabama %' OR location LIKE '% Alabama' OR location LIKE '% Alabama %' \
                  OR location = 'AK' OR location LIKE 'AK %' OR location LIKE '% AK' OR location LIKE '% AK %' \
                  OR location = 'Alaska' OR location LIKE 'Alaska %' OR location LIKE '% Alaska' OR location LIKE '% Alaska %' \
                  OR location = 'AZ' OR location LIKE 'AZ %' OR location LIKE '% AZ' OR location LIKE '% AZ %' \
                  OR location = 'Arizona' OR location LIKE 'Arizona %' OR location LIKE '% Arizona' OR location LIKE '% Arizona %' \
                  OR location = 'AR' OR location LIKE 'AR %' OR location LIKE '% AR' OR location LIKE '% AR %' \
                  OR location = 'Arkansas' OR location LIKE 'Arkansas %' OR location LIKE '% Arkansas' OR location LIKE '% Arkansas %' \
                  OR location = 'CA' OR location LIKE 'CA %' OR location LIKE '% CA' OR location LIKE '% CA %' \
                  OR location = 'California' OR location LIKE 'California %' OR location LIKE '% California' OR location LIKE '% California %' \
                  OR location = 'CO' OR location LIKE 'CO %' OR location LIKE '% CO' OR location LIKE '% CO %' \
                  OR location = 'Colorado' OR location LIKE 'Colorado %' OR location LIKE '% Colorado' OR location LIKE '% Colorado %' \
                  OR location = 'CT' OR location LIKE 'CT %' OR location LIKE '% CT' OR location LIKE '% CT %' \
                  OR location = 'Connecticut' OR location LIKE 'Connecticut %' OR location LIKE '% Connecticut' OR location LIKE '% Connecticut %' \
                  OR location = 'DE' OR location LIKE 'DE %' OR location LIKE '% DE' OR location LIKE '% DE %' \
                  OR location = 'Delaware' OR location LIKE 'Delaware %' OR location LIKE '% Delaware' OR location LIKE '% Delaware %' \
                  OR location = 'FL' OR location LIKE 'FL %' OR location LIKE '% FL' OR location LIKE '% FL %' \
                  OR location = 'Florida' OR location LIKE 'Florida %' OR location LIKE '% Florida' OR location LIKE '% Florida %' \
                  OR location = 'GA' OR location LIKE 'GA %' OR location LIKE '% GA' OR location LIKE '% GA %' \
                  OR location = 'Georgia' OR location LIKE 'Georgia %' OR location LIKE '% Georgia' OR location LIKE '% Georgia %' \
                  OR location = 'HI' OR location LIKE 'HI %' OR location LIKE '% HI' OR location LIKE '% HI %' \
                  OR location = 'Hawaii' OR location LIKE 'Hawaii %' OR location LIKE '% Hawaii' OR location LIKE '% Hawaii %' \
                  OR location = 'ID' OR location LIKE 'ID %' OR location LIKE '% ID' OR location LIKE '% ID %' \
                  OR location = 'Idaho' OR location LIKE 'Idaho %' OR location LIKE '% Idaho' OR location LIKE '% Idaho %' \
                  OR location = 'IL' OR location LIKE 'IL %' OR location LIKE '% IL' OR location LIKE '% IL %' \
                  OR location = 'Illinois' OR location LIKE 'Illinois %' OR location LIKE '% Illinois' OR location LIKE '% Illinois %' \
                  OR location = 'IN' OR location LIKE 'IN %' OR location LIKE '% IN' OR location LIKE '% IN %' \
                  OR location = 'Indiana' OR location LIKE 'Indiana %' OR location LIKE '% Indiana' OR location LIKE '% Indiana %' \
                  OR location = 'IA' OR location LIKE 'IA %' OR location LIKE '% IA' OR location LIKE '% IA %' \
                  OR location = 'Iowa' OR location LIKE 'Iowa %' OR location LIKE '% Iowa' OR location LIKE '% Iowa %' \
                  OR location = 'KS' OR location LIKE 'KS %' OR location LIKE '% KS' OR location LIKE '% KS %' \
                  OR location = 'Kansas' OR location LIKE 'Kansas %' OR location LIKE '% Kansas' OR location LIKE '% Kansas %' \
                  OR location = 'KY' OR location LIKE 'KY %' OR location LIKE '% KY' OR location LIKE '% KY %' \
                  OR location = 'Kentucky' OR location LIKE 'Kentucky %' OR location LIKE '% Kentucky' OR location LIKE '% Kentucky %' \
                  OR location = 'LA' OR location LIKE 'LA %' OR location LIKE '% LA' OR location LIKE '% LA %' \
                  OR location = 'Louisiana' OR location LIKE 'Louisiana %' OR location LIKE '% Louisiana' OR location LIKE '% Louisiana %' \
                  OR location = 'ME' OR location LIKE 'ME %' OR location LIKE '% ME' OR location LIKE '% ME %' \
                  OR location = 'Maine' OR location LIKE 'Maine %' OR location LIKE '% Maine' OR location LIKE '% Maine %' \
                  OR location = 'MD' OR location LIKE 'MD %' OR location LIKE '% MD' OR location LIKE '% MD %' \
                  OR location = 'Maryland' OR location LIKE 'Maryland %' OR location LIKE '% Maryland' OR location LIKE '% Maryland %' \
                  OR location = 'MA' OR location LIKE 'MA %' OR location LIKE '% MA' OR location LIKE '% MA %' \
                  OR location = 'Massachusetts' OR location LIKE 'Massachusetts %' OR location LIKE '% Massachusetts' OR location LIKE '% Massachusetts %' \
                  OR location = 'MI' OR location LIKE 'MI %' OR location LIKE '% MI' OR location LIKE '% MI %' \
                  OR location = 'Michigan' OR location LIKE 'Michigan %' OR location LIKE '% Michigan' OR location LIKE '% Michigan %' \
                  OR location = 'MN' OR location LIKE 'MN %' OR location LIKE '% MN' OR location LIKE '% MN %' \
                  OR location = 'Minnesota' OR location LIKE 'Minnesota %' OR location LIKE '% Minnesota' OR location LIKE '% Minnesota %' \
                  OR location = 'MS' OR location LIKE 'MS %' OR location LIKE '% MS' OR location LIKE '% MS %' \
                  OR location = 'Mississippi' OR location LIKE 'Mississippi %' OR location LIKE '% Mississippi' OR location LIKE '% Mississippi %' \
                  OR location = 'MO' OR location LIKE 'MO %' OR location LIKE '% MO' OR location LIKE '% MO %' \
                  OR location = 'Missouri' OR location LIKE 'Missouri %' OR location LIKE '% Missouri' OR location LIKE '% Missouri %' \
                  OR location = 'MT' OR location LIKE 'MT %' OR location LIKE '% MT' OR location LIKE '% MT %' \
                  OR location = 'Montana' OR location LIKE 'Montana %' OR location LIKE '% Montana' OR location LIKE '% Montana %' \
                  OR location = 'NE' OR location LIKE 'NE %' OR location LIKE '% NE' OR location LIKE '% NE %' \
                  OR location = 'Nebraska' OR location LIKE 'Nebraska %' OR location LIKE '% Nebraska' OR location LIKE '% Nebraska %' \
                  OR location = 'NV' OR location LIKE 'NV %' OR location LIKE '% NV' OR location LIKE '% NV %' \
                  OR location = 'Nevada' OR location LIKE 'Nevada %' OR location LIKE '% Nevada' OR location LIKE '% Nevada %' \
                  OR location = 'NH' OR location LIKE 'NH %' OR location LIKE '% NH' OR location LIKE '% NH %' \
                  OR location = 'New Hampshire' OR location LIKE 'New Hampshire %' OR location LIKE '% New Hampshire' OR location LIKE '% New Hampshire %' \
                  OR location = 'NJ' OR location LIKE 'NJ %' OR location LIKE '% NJ' OR location LIKE '% NJ %' \
                  OR location = 'New Jersey' OR location LIKE 'New Jersey %' OR location LIKE '% New Jersey' OR location LIKE '% New Jersey %' \
                  OR location = 'NM' OR location LIKE 'NM %' OR location LIKE '% NM' OR location LIKE '% NM %' \
                  OR location = 'New Mexico' OR location LIKE 'New Mexico %' OR location LIKE '% New Mexico' OR location LIKE '% New Mexico %' \
                  OR location = 'NY' OR location LIKE 'NY %' OR location LIKE '% NY' OR location LIKE '% NY %' \
                  OR location = 'New York' OR location LIKE 'New York %' OR location LIKE '% New York' OR location LIKE '% New York %' \
                  OR location = 'NC' OR location LIKE 'NC %' OR location LIKE '% NC' OR location LIKE '% NC %' \
                  OR location = 'North Carolina' OR location LIKE 'North Carolina %' OR location LIKE '% North Carolina' OR location LIKE '% North Carolina %' \
                  OR location = 'ND' OR location LIKE 'ND %' OR location LIKE '% ND' OR location LIKE '% ND %' \
                  OR location = 'North Dakota' OR location LIKE 'North Dakota %' OR location LIKE '% North Dakota' OR location LIKE '% North Dakota %' \
                  OR location = 'OH' OR location LIKE 'OH %' OR location LIKE '% OH' OR location LIKE '% OH %' \
                  OR location = 'Ohio' OR location LIKE 'Ohio %' OR location LIKE '% Ohio' OR location LIKE '% Ohio %' \
                  OR location = 'OK' OR location LIKE 'OK %' OR location LIKE '% OK' OR location LIKE '% OK %' \
                  OR location = 'Oklahoma' OR location LIKE 'Oklahoma %' OR location LIKE '% Oklahoma' OR location LIKE '% Oklahoma %' \
                  OR location = 'OR' OR location LIKE 'OR %' OR location LIKE '% OR' OR location LIKE '% OR %' \
                  OR location = 'Oregon' OR location LIKE 'Oregon %' OR location LIKE '% Oregon' OR location LIKE '% Oregon %' \
                  OR location = 'PA' OR location LIKE 'PA %' OR location LIKE '% PA' OR location LIKE '% PA %' \
                  OR location = 'Pennsylvania' OR location LIKE 'Pennsylvania %' OR location LIKE '% Pennsylvania' OR location LIKE '% Pennsylvania %' \
                  OR location = 'RI' OR location LIKE 'RI %' OR location LIKE '% RI' OR location LIKE '% RI %' \
                  OR location = 'Rhode Island' OR location LIKE 'Rhode Island %' OR location LIKE '% Rhode Island' OR location LIKE '% Rhode Island %' \
                  OR location = 'SC' OR location LIKE 'SC %' OR location LIKE '% SC' OR location LIKE '% SC %' \
                  OR location = 'South Carolina' OR location LIKE 'South Carolina %' OR location LIKE '% South Carolina' OR location LIKE '% South Carolina %' \
                  OR location = 'SD' OR location LIKE 'SD %' OR location LIKE '% SD' OR location LIKE '% SD %' \
                  OR location = 'South Dakota' OR location LIKE 'South Dakota %' OR location LIKE '% South Dakota' OR location LIKE '% South Dakota %' \
                  OR location = 'TN' OR location LIKE 'TN %' OR location LIKE '% TN' OR location LIKE '% TN %' \
                  OR location = 'Tennessee' OR location LIKE 'Tennessee %' OR location LIKE '% Tennessee' OR location LIKE '% Tennessee %' \
                  OR location = 'TX' OR location LIKE 'TX %' OR location LIKE '% TX' OR location LIKE '% TX %' \
                  OR location = 'Texas' OR location LIKE 'Texas %' OR location LIKE '% Texas' OR location LIKE '% Texas %' \
                  OR location = 'UT' OR location LIKE 'UT %' OR location LIKE '% UT' OR location LIKE '% UT %' \
                  OR location = 'Utah' OR location LIKE 'Utah %' OR location LIKE '% Utah' OR location LIKE '% Utah %' \
                  OR location = 'VT' OR location LIKE 'VT %' OR location LIKE '% VT' OR location LIKE '% VT %' \
                  OR location = 'Vermont' OR location LIKE 'Vermont %' OR location LIKE '% Vermont' OR location LIKE '% Vermont %' \
                  OR location = 'VA' OR location LIKE 'VA %' OR location LIKE '% VA' OR location LIKE '% VA %' \
                  OR location = 'Virginia' OR location LIKE 'Virginia %' OR location LIKE '% Virginia' OR location LIKE '% Virginia %' \
                  OR location = 'WA' OR location LIKE 'WA %' OR location LIKE '% WA' OR location LIKE '% WA %' \
                  OR location = 'Washington' OR location LIKE 'Washington %' OR location LIKE '% Washington' OR location LIKE '% Washington %' \
                  OR location = 'WV' OR location LIKE 'WV %' OR location LIKE '% WV' OR location LIKE '% WV %' \
                  OR location = 'West Virginia' OR location LIKE 'West Virginia %' OR location LIKE '% West Virginia' OR location LIKE '% West Virginia %' \
                  OR location = 'WI' OR location LIKE 'WI %' OR location LIKE '% WI' OR location LIKE '% WI %' \
                  OR location = 'Wisconsin' OR location LIKE 'Wisconsin %' OR location LIKE '% Wisconsin' OR location LIKE '% Wisconsin %' \
                  OR location = 'WY' OR location LIKE 'WY %' OR location LIKE '% WY' OR location LIKE '% WY %' \
                  OR location = 'Wyoming' OR location LIKE 'Wyoming %' OR location LIKE '% Wyoming' OR location LIKE '% Wyoming %'")
                                         
    #option("header", "True"). \
    #Write the spark stream
    writeTweet = text_user_info.writeStream. \
        format("csv"). \
        option("checkpointLocation", "./storage_johnson&johnson_US/"). \
        option("path", "./storage_johnson&johnson_US/"). \
        outputMode("append"). \
        queryName("US_jnjohnson_tweets"). \
        trigger(processingTime='50 seconds'). \
        start()
    
    print("----- streaming is running -------")
    
    writeTweet.awaitTermination()
    
except: 
    print("Unexpected error:", sys.exc_info())

----- streaming is running -------
Unexpected error: (<class 'KeyboardInterrupt'>, KeyboardInterrupt(), <traceback object at 0x7fa928bf7b80>)


In [None]:
spark.stop()