In [1]:
import re
import numpy as np
from pyspark.sql import functions as F
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext("local")
spark = SparkSession.builder.getOrCreate()

# You need to load twitter data

In [2]:
df = spark.read.csv("data\\tweets.csv",header=True,inferSchema=True )

In [3]:
# Remove the special chars. Only lettere will reamin.
df = df.withColumn("text_c", F.regexp_replace(F.col("text"), "[^a-zA-Z ]", ""));

In [4]:
df.show(20)

+--------------------+--------------------+
|                text|              text_c|
+--------------------+--------------------+
|This article also...|This article also...|
|Are long covid su...|Are long covid su...|
|Are long covid su...|Are long covid su...|
|A combination of ...|A combination of ...|
|Utter rubbish - m...|Utter rubbish  m ...|
|When long covid f...|When long covid f...|
|Pretty sure the p...|Pretty sure the p...|
|Ive had this ling...|Ive had this ling...|
|Less risk of deat...|Less risk of deat...|
|Not sure why you ...|Not sure why you ...|
|"Get vaxxed!!!The...|Get vaxxedThe eff...|
|Cognitive Rehab: ...|Cognitive Rehab O...|
|"They call being ...|They call being f...|
|Incredibly unlike...|Incredibly unlike...|
|Have you heard of...|Have you heard of...|
|"Between -30% of ...|Between  of COVID...|
|And almost no sid...|And almost no sid...|
|Are you seriously...|Are you seriously...|
|You cant know the...|You cant know the...|
|They only look at...|They only 

In [5]:
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.clustering import LDA

In [6]:
# Text preprocessin pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
# countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", vocabSize=500)
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", vocabSize=500,minDF=10, maxDF=20000)

pipeline = Pipeline(stages=[tokenizer,remover, countVectorizer])
data_model = pipeline.fit(df)

In [7]:
vocabulary = data_model.stages[2].vocabulary
print(vocabulary[:100])

['amp', 'symptoms', 'dont', 'children', 'kids', 'many', 'know', 'like', 'still', 'im', 'also', 'risk', 'one', 'vaccine', 'even', 'health', 'months', 'vaccinated', 'getting', 'effects', 'cases', 'longcovid', 'think', 'infection', 'death', 'got', 'study', 'need', 'much', 'suffering', 'us', 'deaths', 'virus', 'thats', 'patients', 'well', 'may', 'year', 'time', 'new', 'really', 'work', 'want', 'vaccines', 'term', 'going', 'doesnt', 'ive', 'damage', 'good', 'die', 'help', 'cant', 'see', 'died', 'life', 'care', 'take', 'better', 'sick', 'data', 'illness', 'youre', 'last', 'less', 'uk', 'go', 'back', 'issues', 'isnt', 'yes', 'say', 'likely', 'disease', 'thing', 'years', 'infected', 'hope', 'mild', 'research', 'suffer', 'brain', 'ill', 'real', 'way', 'never', 'day', 'since', 'fatigue', 'hospital', 'vaccination', 'lot', 'theres', 'weeks', 'immunity', 'yet', 'school', 'first', 'right', 'dying']


In [8]:
dataset = data_model.transform(df)
dataset.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              text_c|               words|            filtered|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|This article also...|This article also...|[this, article, a...|[article, also, s...|(500,[10,16,19,40...|
|Are long covid su...|Are long covid su...|[are, long, covid...|[long, covid, suf...|(500,[2,14,15,50,...|
|Are long covid su...|Are long covid su...|[are, long, covid...|[long, covid, suf...|(500,[2,14,15,50,...|
|A combination of ...|A combination of ...|[a, combination, ...|[combination, sti...|(500,[0,8,14,82,2...|
|Utter rubbish - m...|Utter rubbish  m ...|[utter, rubbish, ...|[utter, rubbish, ...|(500,[10,20,24,31...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
lda = LDA(k=5, maxIter=10)
model = lda.fit(dataset)

In [10]:
model.topicsMatrix()

DenseMatrix(500, 5, [118.0321, 1058.7327, 296.776, 98.2211, 95.1265, 542.7731, 515.5355, 340.0526, ..., 17.7943, 11.5801, 13.9043, 102.1744, 19.846, 41.5321, 50.4078, 84.4006], 0)

In [11]:
# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+------------------------------------------------------------------+
|topic|termIndices|termWeights                                                       |
+-----+-----------+------------------------------------------------------------------+
|0    |[12, 9, 21]|[0.032417131327915294, 0.024645812111262447, 0.018224085444075482]|
|1    |[1, 3, 10] |[0.03682495745380505, 0.019946832286835466, 0.01935900148826528]  |
|2    |[4, 3, 6]  |[0.03878542469288138, 0.029372596385178747, 0.019430446212653472] |
|3    |[7, 2, 9]  |[0.022667632485560987, 0.01870948448563872, 0.016522576367774842] |
|4    |[0, 5, 31] |[0.05627081078650233, 0.034481756970852415, 0.017397646382846868] |
+-----+-----------+------------------------------------------------------------------+



In [12]:
# Print most important topic per category
topics = model.describeTopics(10)
for r in topics.select("termIndices").collect():
    rez = []
    for l in r:
        for i in l:
            rez.append(vocabulary[i])
    print(rez[:10])

['one', 'im', 'longcovid', 'infection', 'symptoms', 'death', 'fatigue', 'bad', 'year', 'syndrome']
['symptoms', 'children', 'also', 'study', 'know', 'dont', 'think', 'months', 'vaccine', 'got']
['kids', 'children', 'know', 'risk', 'still', 'even', 'damage', 'dont', 'effects', 'thats']
['like', 'dont', 'im', 'still', 'getting', 'want', 'need', 'take', 'ive', 'time']
['amp', 'many', 'deaths', 'cases', 'health', 'longcovid', 'us', 'thousands', 'virus', 'life']


In [13]:
topic_name = ["peoples","children","vaccinated","fatal",""]

In [14]:
# Shows the result
transformed = model.transform(dataset)
transformed.select("text_c","topicDistribution").show(5)

+--------------------+--------------------+
|              text_c|   topicDistribution|
+--------------------+--------------------+
|This article also...|[0.01948154677057...|
|Are long covid su...|[0.47114690695234...|
|Are long covid su...|[0.47114690695234...|
|A combination of ...|[0.03206051277963...|
|Utter rubbish  m ...|[0.28458044019144...|
+--------------------+--------------------+
only showing top 5 rows



In [15]:
from pyspark.sql.functions import udf
@udf
def vect_argmax(row):
    row_arr = row.toArray()
    max_pos = np.argmax(row_arr)
    return(int(max_pos))
transformed1 = transformed.withColumn("argmax",vect_argmax(F.col('topicDistribution')))

In [16]:
transformed1.select("text_c","argmax").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|text_c                                                                                                                                                                                                                                           |argmax|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|This article also states they consider long Covid up to months there really isnt research of long term effects as its been around                                                                                                                |1   

In [17]:
# Code for preprocessing the tweets. Not in Spark.
# import pandas as pd

# dataset = pd.read_csv("data\\en_tweets_with_phenotype_counts_new.csv")

# import preprocessor as p
# tw = []
# for s in df["content"].head(200000):
#     tw.append(p.clean(s))
# dft = pd.DataFrame(columns=["text"])
# dft["text"] = tw
# dft.to_csv("data\\tweets.csv",index=False)