In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# tweet CSVs stored there can be read like local files.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.7.1

# Install demoji for emoji removal
! pip install demoji

import demoji
demoji.download_codes()

openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)
[K     |████████████████████████████████| 215.7MB 71kB/s 
[K     |████████████████████████████████| 204kB 47.8MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 143kB 12.7MB/s 
[?25hCollecting demoji
  Downloading https://files.pythonhosted.org/packages/88/6a/34379abe01c9c36fe9fddc4181dd935332e7d0159ec3fae76f712e49bcea/demoji-0.4.0-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama, demoji
Successfully installed colorama-0.4.4 demoji-0.4.0
Downloading emoji data ...
... OK (Got response in 0.43 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [3]:
import sparknlp
import re

# Folder holding the tweet exports, resolved relative to the current working
# directory, and the specific CSV analysed in this notebook.
data_dir = os.path.join(os.getcwd(), 'gdrive', 'My Drive', 'TwitterData')
file_path = os.path.join(data_dir, 'Biden-Pressconference.csv')

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Read the CSV with a header row and inferred column types. multiLine lets a
# quoted field span several lines, and '"' is used as the escape character
# inside quoted fields.
df = (
    spark.read
    .options(inferSchema=True, header=True, multiLine=True, escape='"')
    .csv(file_path)
)

In [6]:
# Inspect the column names and types Spark inferred from the CSV.
df.printSchema()

root
 |-- tweet_id: long (nullable = true)
 |-- full_text: string (nullable = true)



In [4]:
from sparknlp.base import DocumentAssembler,Finisher, TokenAssembler
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql.functions import col, concat_ws
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [5]:
@udf(returnType=StringType())
def remove_url_emojis(text):
  """Strip URLs and emojis from a tweet.

  Args:
    text: Raw tweet text, or None when the source column is null.

  Returns:
    The text with every run of non-whitespace characters starting at
    "http" removed and every emoji replaced by the empty string, or None
    when `text` is None.
  """
  # The source column is nullable; re.sub raises TypeError on None, which
  # would fail the whole Spark job, so pass nulls through unchanged.
  if text is None:
    return None
  text_without_urls = re.sub(r"http\S+", "", text)
  return demoji.replace(text_without_urls, "")

In [7]:
# Add a cleaned copy of each tweet (URLs and emojis removed) as the "text"
# column, keeping the raw "full_text" column untouched.
df = df.withColumn(
    "text",
    remove_url_emojis(col("full_text")),
)

# Preview the first five cleaned tweets without truncating long values.
df.select("text").show(n=5, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                  |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|We obtained EXCLUSIVE FOOTAGE of Biden preparing for his press conference. #CrowderBidenStream #bidenpressconference                                                                                        

In [8]:
# Load the pretrained English "analyze_sentimentdl_use_twitter" pipeline.
# The cell output reports a download of roughly 935 MB.
pipeline = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 935.1 MB
[OK!]


In [9]:
# Run the pretrained pipeline over the tweets; the result keeps the input
# columns and adds annotation columns (e.g. document, sentence_embeddings).
# NOTE(review): presumably the pipeline reads the "text" column - confirm.
df_result = pipeline.transform(df)

In [10]:
# Inspect the annotation columns the pipeline added.
df_result.printSchema()

root
 |-- tweet_id: long (nullable = true)
 |-- full_text: string (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence_embeddings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 

In [11]:
# Turn the "sentiment" array into one row per annotation and display the
# first five annotations in full.
sentiment_rows = df_result.selectExpr("explode(sentiment) sentiments")
sentiment_rows.show(5, truncate=False)

+-------------------------------------------------------------------------------------------------+
|sentiments                                                                                       |
+-------------------------------------------------------------------------------------------------+
|[category, 0, 116, positive, [sentence -> 0, positive -> 1.0, negative -> 0.0], []]              |
|[category, 0, 83, negative, [sentence -> 0, positive -> 2.598832E-4, negative -> 0.99974006], []]|
|[category, 0, 115, negative, [sentence -> 0, positive -> 4.6696863E-15, negative -> 1.0], []]    |
|[category, 0, 261, negative, [sentence -> 0, positive -> 0.0867686, negative -> 0.91323143], []] |
|[category, 0, 125, negative, [sentence -> 0, positive -> 0.040200155, negative -> 0.9597998], []]|
+-------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [12]:
# Show only the predicted label of the first five sentiment annotations.
(
    df_result
    .selectExpr("explode(sentiment) sentiments")
    .selectExpr("sentiments.result result")
    .show(5, truncate=False)
)

+--------+
|result  |
+--------+
|positive|
|negative|
|negative|
|negative|
|negative|
+--------+
only showing top 5 rows



In [35]:
# Flatten the sentiment annotations into a pandas table holding the predicted
# label plus the positive/negative confidence values stored in the metadata.
# Note: tweet_id is selected in the first step but is not carried into the
# final projection, so the resulting frame has three columns.
df_final_result = (
    df_result
    .selectExpr("tweet_id", "explode(sentiment) sentiments")
    .selectExpr(
        "sentiments.result",
        "sentiments.metadata['positive'] positive_confidence",
        "sentiments.metadata['negative'] negative_confidence",
    )
    .toPandas()
)
df_final_result.head()

Unnamed: 0,result,positive_confidence,negative_confidence
0,positive,1.0,0.0
1,negative,0.0002598832,0.99974006
2,negative,4.6696863e-15,1.0
3,negative,0.0867686,0.91323143
4,negative,0.040200155,0.9597998


In [36]:
# Destination CSV for the combined results, in the same folder as the input.
result_path = os.path.join(data_dir,'Biden-Pressconference-Sentiment-Results.csv')

In [38]:
# (rows, columns) of the flattened sentiment table.
df_final_result.shape

(35574, 3)

In [40]:
# Confirm df carries the cleaned "text" column alongside the raw fields.
df.printSchema()

root
 |-- tweet_id: long (nullable = true)
 |-- full_text: string (nullable = true)
 |-- text: string (nullable = true)



In [42]:
import pandas as pd

# Build the final table in ONE Spark projection so that each sentiment row
# is carried together with the tweet it was computed from.
#
# Collecting the tweets and the sentiments separately and gluing them
# side by side by row position silently mislabels tweets whenever the two
# collections differ in length or order, e.g. when explode() emits zero or
# several rows for a tweet.
#
# explode_outer keeps tweets whose sentiment array is empty or null; their
# result/confidence columns are null instead of the row being dropped.
df_final = (
    df_result
    .selectExpr(
        "tweet_id",
        "full_text",
        "text",
        "explode_outer(sentiment) sentiments",
    )
    .selectExpr(
        "tweet_id",
        "full_text",
        "text",
        "sentiments.result",
        "sentiments.metadata['positive'] positive_confidence",
        "sentiments.metadata['negative'] negative_confidence",
    )
    .toPandas()
)

# Tweet-only view, kept under its original name and derived from the table
# above instead of running a second Spark collection.
df_pandas = df_final[["tweet_id", "full_text", "text"]]

df_final.head()

Unnamed: 0,tweet_id,full_text,text,result,positive_confidence,negative_confidence
0,1375177619155644416,We obtained EXCLUSIVE FOOTAGE of Biden prepari...,We obtained EXCLUSIVE FOOTAGE of Biden prepari...,positive,1.0,0.0
1,1375177617943449610,#bidenpressconference \n\nWaiting for #Ageism ...,#bidenpressconference \n\nWaiting for #Ageism ...,negative,0.0002598832,0.99974006
2,1375177610536366081,Is no one going to talk about how @potus biden...,Is no one going to talk about how @potus biden...,negative,4.6696863e-15,1.0
3,1375177610192420873,Celebrating the fact that during his first pre...,Celebrating the fact that during his first pre...,negative,0.0867686,0.91323143
4,1375177609852690433,A President with cognitive decline is a nation...,A President with cognitive decline is a nation...,negative,0.040200155,0.9597998


In [43]:
# Write the combined table to result_path as CSV, omitting the pandas index.
df_final.to_csv(result_path,index=False)