# Requeriments

# Data Analysis

## Libraries

In [1]:
import pandas as pd
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_utc_timestamp, udf, array_distinct, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover
from tqdm.notebook import tqdm
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

2023-05-10 21:14:04.656134: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-10 21:14:09.931514: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-10 21:14:09.931912: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


### Imports with downloads

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Warnings conf

In [3]:
import warnings

# Suppressing the warnings
warnings.filterwarnings('ignore') 

## Import Data

In [4]:
#reading the data on Hadoop

spark = SparkSession.builder.appName("HadoopAccess").getOrCreate()

2023-05-10 21:14:16,758 WARN util.Utils: Your hostname, BDS-2023 resolves to a loopback address: 127.0.1.1; using 192.168.0.110 instead (on interface wlo1)
2023-05-10 21:14:16,761 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-05-10 21:14:18,158 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## ML on Spark

### Creating a classifier model using training dataset

#### Getting the dataset

Dataset available [link](https://github.com/ardianumam/compilations/blob/master/ApacheSparkVideoSeries/dataset/README.md)

In [5]:
#read csv file
tweets_csv = spark.read.csv('/CA4/tweets/training_database/tweets.csv', inferSchema=True, header=True)
tweets_csv = tweets_csv.select(col("SentimentText").alias("text"), col("Sentiment").cast("Int").alias("label"))
tweets_csv.show(n=3)

                                                                                

+--------------------+-----+
|                text|label|
+--------------------+-----+
|that film is fant...|    1|
|this music is rea...|    1|
|winter is terribl...|    0|
+--------------------+-----+
only showing top 3 rows



#### Dividing data

In [6]:
#80% training, 20% testing
dividedData = tweets_csv.randomSplit([0.8, 0.2]) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
print ("Training:", trainingData.count(), "; Testing:", testingData.count())

Training: 1557 ; Testing: 375


#### Preparing Training Data

In [7]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenizedTrain = tokenizer.transform(trainingData)


swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)


hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")  
numericTrainData = hashTF.transform(SwRemovedTrain).select(
    'label', 'MeaningfulWords', 'features')

numericTrainData.show(n=3)

+-----+--------------------+--------------------+
|label|     MeaningfulWords|            features|
+-----+--------------------+--------------------+
|    1|[adore, cheese, #...|(262144,[1689,910...|
|    1|[adore, cheese, #...|(262144,[1689,100...|
|    1|[adore, cheese, #...|(262144,[1689,100...|
+-----+--------------------+--------------------+
only showing top 3 rows



#### Training the model

In [8]:
lr = LogisticRegression(labelCol="label", featuresCol="features", 
                        maxIter=10, regParam=0.01)
model = lr.fit(numericTrainData)
print ("Training is done!")

2023-05-10 21:14:35,505 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2023-05-10 21:14:35,505 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Training is done!


#### Preparing Testing data

In [9]:
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest).select(
    'Label', 'MeaningfulWords', 'features')
numericTest.show(truncate=False, n=2)

+-----+---------------------------+------------------------------------------+
|Label|MeaningfulWords            |features                                  |
+-----+---------------------------+------------------------------------------+
|1    |[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0])|
|1    |[adore, cheese, #thumbs-up]|(262144,[1689,88825,100089],[1.0,1.0,1.0])|
+-----+---------------------------+------------------------------------------+
only showing top 2 rows



#### Predicting testing data

In [10]:
prediction = model.transform(numericTest)

predictionFinal = prediction.select(
    "MeaningfulWords", "prediction", "Label")

predictionFinal.show(n=4, truncate = False)

correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['Label']).count()

totalData = predictionFinal.count()

print("Correct prediction:", correctPrediction) 
print("Total data:", totalData)
print(f"Accuracy: {correctPrediction/totalData:.4}")

+-------------------------------------+----------+-----+
|MeaningfulWords                      |prediction|Label|
+-------------------------------------+----------+-----+
|[adore, cheese, #brilliant]          |1.0       |1    |
|[adore, cheese, #thumbs-up]          |1.0       |1    |
|[adore, classical, music, #brilliant]|1.0       |1    |
|[adore, jam, #toptastic]             |1.0       |1    |
+-------------------------------------+----------+-----+
only showing top 4 rows

Correct prediction: 370
Total data: 375
Accuracy: 0.9867


### Processing data collected

In [11]:
#Setting timestamp
df = spark.read.parquet("/CA4/tweets/*.parquet")
df = df.select("created_at", "month", "text", "entities")
df = df.withColumn("created_at", from_utc_timestamp(df["created_at"], "UTC"))
df.show(truncate=True, n=3)

[Stage 31:>                                                         (0 + 1) / 1]

+-------------------+-----+--------------------+--------------------+
|         created_at|month|                text|            entities|
+-------------------+-----+--------------------+--------------------+
|2022-01-01 23:51:54|    1|RT @ampahcd: @Zac...|"{\"hashtags\": [...|
|2022-01-01 23:41:24|    1|RT @Rina_The_Espe...|"{\"hashtags\": [...|
|2022-01-01 23:41:28|    1|@VVitchStreams @R...|"{\"hashtags\": [...|
+-------------------+-----+--------------------+--------------------+
only showing top 3 rows



                                                                                

#### Cleaning Data

In [12]:
from pyspark.sql.functions import regexp_replace

at_regex = r"@\w+" #Remove usernames
link_regex = r"http\S+" #Remove links
rt_regex = r'\bRT\b' #Remove 'RT'
ss_regex = r'[^\w\s]' #Remove Special strings
ds_regex = r'\s+' #remove spaces

tweets = df.withColumn("cleaned_text", regexp_replace("text", at_regex, "").alias("text_without_at_signs")) \
    .withColumn("cleaned_text", regexp_replace("cleaned_text", link_regex, "").alias("text_without_links")) \
    .withColumn("cleaned_text", regexp_replace("cleaned_text", rt_regex, "").alias("text_without_regex")) \
    .withColumn("cleaned_text", regexp_replace("cleaned_text", ss_regex, "").alias("text_without_regex")) \
    .withColumn("cleaned_text", regexp_replace("cleaned_text", ds_regex, " ").alias("text_without_regex"))

#tweets = tweets.select("cleaned_text")

tweets.show()


+-------------------+-----+--------------------+--------------------+--------------------+
|         created_at|month|                text|            entities|        cleaned_text|
+-------------------+-----+--------------------+--------------------+--------------------+
|2022-01-01 23:51:54|    1|RT @ampahcd: @Zac...|"{\"hashtags\": [...| We are blowing l...|
|2022-01-01 23:41:24|    1|RT @Rina_The_Espe...|"{\"hashtags\": [...| Vaccine aparthei...|
|2022-01-01 23:41:28|    1|@VVitchStreams @R...|"{\"hashtags\": [...| You have no prob...|
|2022-01-01 23:52:58|    1|RT @drmeenalviz: ...|"{\"hashtags\": [...| To round off 202...|
|2022-01-01 23:53:11|    1|RT @JacobEdwardIn...|"{\"hashtags\": [...| Im Covid positiv...|
|2022-01-01 23:53:30|    1|RT @LakotaMan1: I...|"{\"hashtags\": [...| If youre protest...|
|2022-01-01 23:08:54|    1|RT @luigi_warren:...|"{\"hashtags\": [...| So three debacle...|
|2022-01-01 23:37:00|    1|@ruiz20059 No.  W...|"{\"hashtags\": [...| No What religion...|

#### Preparing Data

In [13]:
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="words")
tokenizedData = tokenizer.transform(tweets)

swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemoved = swr.transform(tokenizedData)

hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
numericData = hashTF.transform(SwRemoved).select('MeaningfulWords', 'features')


numericData.show(n=3)

+--------------------+--------------------+
|     MeaningfulWords|            features|
+--------------------+--------------------+
|[, blowing, large...|(262144,[3928,510...|
|[, vaccine, apart...|(262144,[32890,57...|
|[, problem, injec...|(262144,[31536,76...|
+--------------------+--------------------+
only showing top 3 rows



#### Predicting Data

In [14]:
prediction = model.transform(numericData)

predictionFinal = prediction.select(
    "MeaningfulWords", "prediction")

In [15]:
predictionFinal.show(truncate = False, n=20)

+----------------------------------------------------------------------------------------------------------------------------+----------+
|MeaningfulWords                                                                                                             |prediction|
+----------------------------------------------------------------------------------------------------------------------------+----------+
|[, blowing, large, holes, entire, pandemic, vaccination, agendabig, pharma]                                                 |0.0       |
|[, vaccine, apartheid, actually, exists, even, imperial, core, keep, mind]                                                  |0.0       |
|[, problem, injected, completely, unproven, vaccine, still, go]                                                             |0.0       |
|[, round, 2021, mum, bumped, old, friend, street, told, us, wouldnt, take, vaccine, amp, im]                                |0.0       |
|[, im, covid, positive, receiving

#### Join Prediction with Data 

In [16]:
predictionFinal.count()

763266

In [20]:
from pyspark.sql.functions import monotonically_increasing_id

# Create a column with id following the data's order 
tweets = tweets.withColumn("row_id", monotonically_increasing_id())
predictionFinal = predictionFinal.withColumn("row_id", monotonically_increasing_id())

# join by "row_id"
tweets_pred = tweets.select('row_id','created_at', 'text', 'cleaned_text', 'month') \
                .join(predictionFinal.select('row_id', 'prediction'), "row_id", "inner")

# drop column 
tweets_pred = tweets_pred.drop("row_id")

tweets_pred.show()



+-------------------+--------------------+--------------------+-----+----------+
|         created_at|                text|        cleaned_text|month|prediction|
+-------------------+--------------------+--------------------+-----+----------+
|2022-01-01 23:51:54|RT @ampahcd: @Zac...| We are blowing l...|    1|       0.0|
|2022-01-01 23:37:00|@ruiz20059 No.  W...| No What religion...|    1|       0.0|
|2022-01-01 23:59:51|RT @WSJ: Internat...| International tr...|    1|       0.0|
|2022-01-01 23:45:39|RT @toadmeister: ...| A major study fr...|    1|       0.0|
|2022-01-01 23:46:11|@doctor_oxford Nu...| Nurses are too b...|    1|       0.0|
|2022-01-01 23:46:38|RT @Madisontx76: ...| Why do Democrats...|    1|       0.0|
|2022-01-01 23:29:36|RT @JasonLehn: If...| If you think hav...|    1|       1.0|
|2022-01-01 23:12:57|RT @cooperlund: I...| It may seem like...|    1|       1.0|
|2022-01-01 23:31:00|@7NewsSydney She ...| She doesnt seem ...|    1|       0.0|
|2022-01-01 23:49:33|RT @ged

                                                                                

### Textblob and Varder

In [21]:
#Functions
def f_textblob(text):
    return TextBlob(text).sentiment.polarity

def f_vader(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)['compound']


#UDFs
udf_textblob = udf(f_textblob, StringType())

udf_vader = udf(f_vader, StringType())


#applying to Dataframe
tweets_pred = tweets_pred.withColumn("textblob", udf_textblob(tweets_pred["cleaned_text"])) \
                         .withColumn("vader", udf_vader(tweets_pred["cleaned_text"]))

#tweets_pred_2.show(n=3)

In [23]:
%%time

tweets_pred.write.partitionBy("month").parquet("/CA4/predictions")

                                                                                

CPU times: user 224 ms, sys: 156 ms, total: 380 ms
Wall time: 24min 21s


In [25]:
tweets_pred.show(2)

[Stage 57:>                                                         (0 + 1) / 1]

+-------------------+--------------------+--------------------+-----+----------+-------------------+------+
|         created_at|                text|        cleaned_text|month|prediction|           textblob| vader|
+-------------------+--------------------+--------------------+-----+----------+-------------------+------+
|2022-01-01 23:51:54|RT @ampahcd: @Zac...| We are blowing l...|    1|       0.0|0.10714285714285714|   0.0|
|2022-01-01 23:37:00|@ruiz20059 No.  W...| No What religion...|    1|       0.0|                0.0|-0.296|
+-------------------+--------------------+--------------------+-----+----------+-------------------+------+
only showing top 2 rows



2023-05-10 21:55:18,231 WARN python.PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 57 (TID 115): Attempting to kill Python Worker
                                                                                

### BERT

In [24]:
classifier_sa = pipeline("sentiment-analysis")
#https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [26]:
def f_classifier(text):
    return 1 if classifier_sa(text)[0]['label'] == 'POSITIVE' else 0


#UDFs
udf_classifier = udf(f_classifier, StringType())

#applying to Dataframe
tweets_pred = tweets_pred.withColumn("classifier", udf_classifier(tweets_pred["cleaned_text"]))

In [27]:
tweets_pred.show(2)

2023-05-10 21:58:48.261706: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-10 21:58:49.058649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-10 21:58:49.058718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
[Stage 62:>                                                         (0 + 1) / 1]

+-------------------+--------------------+--------------------+-----+----------+-------------------+------+----------+
|         created_at|                text|        cleaned_text|month|prediction|           textblob| vader|classifier|
+-------------------+--------------------+--------------------+-----+----------+-------------------+------+----------+
|2022-01-01 23:51:54|RT @ampahcd: @Zac...| We are blowing l...|    1|       0.0|0.10714285714285714|   0.0|         0|
|2022-01-01 23:37:00|@ruiz20059 No.  W...| No What religion...|    1|       0.0|                0.0|-0.296|         0|
+-------------------+--------------------+--------------------+-----+----------+-------------------+------+----------+
only showing top 2 rows



                                                                                

In [32]:

@delayed
def text_blob_polarity(text):
    return 2 if TextBlob(text).sentiment.polarity > 0 else 0

@delayed
def vader_polarity(text):
    return 2 if SentimentIntensityAnalyzer().polarity_scores(text)['compound'] > 0 else 0

@delayed
def classifier_polarity(text):
    return 1 if classifier_sa(text)[0]['label'] == 'POSITIVE' else 0

@delayed
def calculate_score(t,v,c,p):
    return 1 if ((t + v + c + p)/4) >= 1 else 0



In [34]:
tweets_dd['textBlob'] = tweets_dd['cleaned_text'].map(lambda text: text_blob_polarity(text))
tweets_dd['vader'] = tweets_dd['cleaned_text'].map(lambda text: vader_polarity(text))
tweets_dd['classifier'] = tweets_dd['cleaned_text'].map(lambda text: classifier_polarity(text))


In [None]:
tweets_dd.compute() 

In [44]:
df = tweets_pd

def calculate_score(row):
    return 1 if ((row['classifier'] + row['vader'] + row['textBlob'] + row['prediction'])/4) >= 1 else 0

In [45]:
df['textBlob'] = df['cleaned_text'].apply(lambda x: 2 if TextBlob(x).sentiment.polarity > 0 else 0)

In [None]:
df['vader'] = df['cleaned_text'].apply(lambda x: 2 if SentimentIntensityAnalyzer().polarity_scores(x)['compound'] > 0 else 0)

In [None]:
df['classifier'] = df['cleaned_text'].apply(lambda x: 1 if classifier_sa(x)[0]['label'] == 'POSITIVE' else 0)

In [None]:
df['score'] = df.apply(calculate_score)

In [47]:
df.sample(15)

Unnamed: 0,created_at,text,cleaned_text,prediction,textBlob,vader,classifier,score
249178,2022-03-22 23:08:55,RT @freethought202: 1. “Given the evidence of ...,1 Given the evidence of white cell depletion ...,0.0,0,,,
449573,2022-06-07 22:07:50,"RT @USMortality: So the FDA members, stated, t...",So the FDA members stated that they are autho...,0.0,0,,,
151726,2022-02-27 15:37:22,I hope vaccines are not mandatory by the time ...,I hope vaccines are not mandatory by the time ...,0.0,0,,,
725585,2022-06-10 18:08:32,The Biden administration is preparing to distr...,The Biden administration is preparing to distr...,0.0,0,,,
704050,2022-02-09 00:26:40,This isn’t okay. Talking about a vaccine is. \...,This isnt okay Talking about a vaccine is Chan...,0.0,2,,,
558222,2022-09-11 20:25:58,RT @truedevonthomps: I’ve made up my mind. Tho...,Ive made up my mind Those still advocating fo...,0.0,0,,,
625247,2022-10-20 03:39:08,"RT @leezeldin: As Governor, I will oppose mand...",As Governor I will oppose mandating the COVID...,0.0,0,,,
641925,2022-08-01 05:33:33,@MAGA_VIBES @FoxNews The freedom convoy agains...,The freedom convoy against the Mandatory Vacc...,0.0,0,,,
337172,2022-02-15 23:18:23,RT @AndrewLawton: As more provinces do away wi...,As more provinces do away with vaccine passpo...,0.0,2,,,
442737,2022-10-26 22:08:36,RT @JackPosobiec: THE VACCINES DO NOT STOP THE...,THE VACCINES DO NOT STOP THE SPREAD,0.0,0,,,


#### Accessing Metadata