## Spark setting

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz 
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [191]:
!pip install pyspark



In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop2.7"

In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [193]:
spark

## Kaggle APIs

________________________________________________________________________

In [None]:
from google.colab import files
files.upload() #upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"federicofiorio","key":"3c9fda0dd26cc20d5f652a577743142c"}'}

In [None]:
! pip install kaggle



In [None]:
! mkdir ~/.kaggle

In [None]:
!cp kaggle.json ~/.kaggle/

In [None]:
!ls ~/.kaggle

kaggle.json


In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows

Downloading ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip to /content
100% 7.05G/7.06G [00:51<00:00, 121MB/s]
100% 7.06G/7.06G [00:51<00:00, 148MB/s]


In [None]:
!ls

kaggle.json
sample_data
spark-3.0.3-bin-hadoop2.7
spark-3.0.3-bin-hadoop2.7.tgz
ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip


In [None]:
!unzip ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip

Archive:  ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip
  inflating: 0401_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0402_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0403_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0404_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0405_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0406_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0407_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0408_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0409_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0410_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0411_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0412_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0413_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0414_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0415_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0416_UkraineCombinedTweetsDeduped.csv.gzip  
  inflat

## Use the data

________________________________________________________________________

In [None]:
import numpy as np
import pandas as pd
import csv
import os
import warnings

In [None]:
filename = r"0401_UkraineCombinedTweetsDeduped.csv.gzip"
df = pd.read_csv(filename, compression='gzip', index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
df[0:2].T

Unnamed: 0,0,1
userid,16882774,3205296069
username,Yaniela,gregffff
acctdesc,"Animal lover, supports those who fight injusti...",
location,Hawaii,
following,1158,122
followers,392,881
totaltweets,88366,99853
usercreatedts,2008-10-21 07:34:04.000000,2015-04-25 11:24:34.000000
tweetid,1509681950042198030,1509681950151348229
tweetcreatedts,2022-04-01 00:00:00.000000,2022-04-01 00:00:00.000000


In [None]:
type(df)

pandas.core.frame.DataFrame

In [None]:
df = df[['tweetid', 'text', 'hashtags']] #keep only 3 columns

In [None]:
from pyspark.sql.types import *

schema = StructType([StructField("tweetid", StringType(), True)\
                   ,StructField("text", StringType(), True)\
                   ,StructField("hashtags", StringType(), True)])

#create spark dataframe using schema
df_spark = spark.createDataFrame(df,schema=schema)

In [None]:
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [None]:
df_spark.show()

+-------------------+--------------------+--------------------+
|            tweetid|                text|            hashtags|
+-------------------+--------------------+--------------------+
|1509681950042198030|⚡The Ukrainian Ai...|                  []|
|1509681950151348229|Chernihiv oblast....|[{'text': 'russia...|
|1509681950683926556|America 🇺🇸 is p...|[{'text': 'Russia...|
|1509681951116046336|JUST IN: #Anonymo...|[{'text': 'Anonym...|
|1509681951304990720|***PUBLIC MINT NO...|                  []|
|1509681952000937999|The Amazing story...|[{'text': 'Russia...|
|1509681952978210849|&amp;quot;How we ...|                  []|
|1509681953053843466|India's purchase ...|[{'text': 'Russia...|
|1509681953091457035|The most basic te...|[{'text': 'Ukrain...|
|1509681953418711050|"The image that R...|[{'text': 'Putin'...|
|1509681953418752008|#Russia’s Preside...|[{'text': 'Russia...|
|1509681953636700160|Sad to read of th...|[{'text': 'Russia...|
|1509681953947074560|الرئيس الروسي يصع...|

In [None]:
rdd = df_spark.rdd #to get the rdd from dataframe
rdd.take(10)

rdd_hashtags = rdd.map(lambda x : (x[0], x[2]))
#if we want to have tweet and hashtags it's already done, if we want to have tweetid and words it needs to be processed
#also some words in the text field are hashtags that twitter didn't process right, we should preprocess these data too

rdd_hashtags.take(10)

[('1509681950042198030', '[]'),
 ('1509681950151348229',
  "[{'text': 'russianinvasion', 'indices': [77, 93]}, {'text': 'StandWithUkraine️', 'indices': [97, 115]}, {'text': 'UkraineUnderAttack', 'indices': [116, 135]}]"),
 ('1509681950683926556',
  "[{'text': 'RussianUkrainianWar', 'indices': [74, 94]}]"),
 ('1509681951116046336', "[{'text': 'Anonymous', 'indices': [25, 35]}]"),
 ('1509681951304990720', '[]'),
 ('1509681952000937999',
  "[{'text': 'Russia', 'indices': [208, 215]}, {'text': 'Ukraine', 'indices': [216, 224]}, {'text': 'motivation', 'indices': [225, 236]}, {'text': 'netDE', 'indices': [237, 243]}, {'text': 'eduDE', 'indices': [244, 250]}, {'text': 'Delaware', 'indices': [251, 260]}, {'text': 'government', 'indices': [261, 272]}, {'text': 'USA', 'indices': [273, 277]}]"),
 ('1509681952978210849', '[]'),
 ('1509681953053843466',
  "[{'text': 'Russian', 'indices': [31, 39]}, {'text': 'Moscow', 'indices': [161, 168]}]"),
 ('1509681953091457035', "[{'text': 'Ukraine', 'indices

In [None]:
#take only english texts
rdd_text =  rdd.map(lambda x : (x[0], x[1]))
rdd_text.take(10)

[('1509681950042198030',
  '⚡The Ukrainian Air Force would like to address misinformation published in multiple Western media outlets regarding the situation in the 🇺🇦 sky and support from our @NATO allies. \nMore in 🧵(1/16)\n#ProtectUАSky #StopRussia #UkraineUnderAttaсk'),
 ('1509681950151348229',
  'Chernihiv oblast. Ukrainians welcome their liberators from #russianinvasion. \n\n#StandWithUkraine️ #UkraineUnderAttack #UkraineWillWin #PutinIsaWarCriminal #StopPutin #RussianUkrainianWar #RussiaGoHome #РоссияСмотри #нетвойне https://t.co/86ixYuEtNb'),
 ('1509681950683926556',
  "America 🇺🇸 is preparing for something worse than the #RussianUkrainianWar...Taiwan 🇹🇼 Last month we entered into a new cold war with Russia 🇷🇺 what's the best policy for the future of 🇺🇸🇨🇳 relations over Taiwan? \n\nFull Video👇\nhttps://t.co/W58KdpqSly\n\n#China #Taiwan"),
 ('1509681951116046336',
  'JUST IN: #Anonymous has hacked &amp; released 62,000 emails from the Marathon Group, a Russian investment firm ow

In [186]:
import nltk
from nltk import word_tokenize
nltk.download('punkt') #to make it work

#ok l'idea è prendere gli hashtag e la parola subito dopo e transportarli nel l'elenco degli hashtags in modo da avere dei dati corretti
#dopo aver preso gli hashtags levo la punteggiatura con string.punctuation che tanto non è rilevante per i baskets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
rdd_text = rdd_text.map(lambda x:(x[0], word_tokenize(x[1]))) #id, word_tokenized
rdd_text.first()

('1509681950042198030',
 ['⚡The',
  'Ukrainian',
  'Air',
  'Force',
  'would',
  'like',
  'to',
  'address',
  'misinformation',
  'published',
  'in',
  'multiple',
  'Western',
  'media',
  'outlets',
  'regarding',
  'the',
  'situation',
  'in',
  'the',
  '🇺🇦',
  'sky',
  'and',
  'support',
  'from',
  'our',
  '@',
  'NATO',
  'allies',
  '.',
  'More',
  'in',
  '🧵',
  '(',
  '1/16',
  ')',
  '#',
  'ProtectUАSky',
  '#',
  'StopRussia',
  '#',
  'UkraineUnderAttaсk'])

## Come si può vedere dai prossimi blocchi questo metodo funziona meglio di quello categorizzato da twitter quindi sostituisco gli \# nel dataset con quelli che ho computati io nel rdd

In [195]:
hashtags_per_tweet = rdd_text.map(lambda x: [(x[0], x[1][i+1]) for i,word in enumerate(x[1]) if word =='#' ])   #.reduceByKey(lambda x : x) #oppure groupByKey()
#x[1] is the list of words, when I get and # I get back the next word, if a tweet ends with # we have a problem, but that would be a bad created tweet

#creo l'RDD degli hashtags correttamente: STESSA COSA CHE HO FATTO SOPRA MA NON RIESCO A PRINTARLO MI SI BUGGA BOH !!!
rdd_hashtags = hashtags_per_tweet.reduceByKey(lambda x : x)

#se aggiungo reducebykey mi si bugga e mi da errore
#hashtags_per_tweet.take(10)
rdd_hashtags.take(10) 

Py4JJavaError: ignored

In [None]:
df.iloc[1,2] #hashtags di twitter

"[{'text': 'russianinvasion', 'indices': [77, 93]}, {'text': 'StandWithUkraine️', 'indices': [97, 115]}, {'text': 'UkraineUnderAttack', 'indices': [116, 135]}]"

In [None]:
df.iloc[1,1] #testo del tweet, come si può vedere mancano degli hashtags in quelli di twitter

'Chernihiv oblast. Ukrainians welcome their liberators from #russianinvasion. \n\n#StandWithUkraine️ #UkraineUnderAttack #UkraineWillWin #PutinIsaWarCriminal #StopPutin #RussianUkrainianWar #RussiaGoHome #РоссияСмотри #нетвойне https://t.co/86ixYuEtNb'

In [None]:
#devo togliere le parole degli hashtags dal testo,
#a questo punto: l'rdd degli hashtags dovrebbe essere corretto, l'rdd col testo va ancora processato

In [None]:
#per togliere le parole dal testo devo unire i due rdd hashtags e testo in modo da avere una lista di parole che so che dovrò togliere

In [None]:
#servirà dopo in caso facessimo anche rdd_testo
import string     
puncts = string.punctuation 
puncts

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### rdd_hashtags è l'rdd da usare per apriori, prima però andrebbe risolto il problema del reducebykey 

## in caso servano
__________________________________________________________________

In [None]:
#possibile farlo con spark credo per parallelizzare
rows_eng = df.language == 'en'
rows_selected = []

for i,r in enumerate(rows_eng):
  if r == True:
    rows_selected.append(df.iloc[i])

eng_df = pd.DataFrame(rows_selected)

In [None]:
#Creating dictionary for words and dictionary for hashtags
from collections import defaultdict

hashtags = defaultdict(lambda : [])
texts = defaultdict(lambda : [])

In [None]:
for index, row in eng_df.iterrows():
  texts[row.userid].append(row.text)
  hashtags[row.userid].append(row.hashtags)

In [None]:
print(texts[1054170412194553857])
print("_________________________________________________")
print(hashtags[1054170412194553857])

['“From where Winston stood it was just possible to read, picked out on it’s white face in elegant lettering, the three slogans of the Party: \nWAR IS PEACE\nFREEDOM IS SLAVERY\nIGNORANCE IS STRENGTH.”\n\n@POTUS @WhiteHouse \n#Ukraine']
_________________________________________________
["[{'text': 'Ukraine', 'indices': [218, 226]}]"]


In [None]:
while True:
  pass

KeyboardInterrupt: ignored