## Spark setting

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop2.7"

In [3]:
import findspark
findspark.init("spark-3.0.3-bin-hadoop2.7")# SPARK_HOME
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
import pyspark
type(spark)

pyspark.sql.session.SparkSession

## Kaggle APIs

________________________________________________________________________

In [5]:
from google.colab import files
files.upload() #upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"federicofiorio","key":"3c9fda0dd26cc20d5f652a577743142c"}'}

In [6]:
! pip install kaggle



In [7]:
! mkdir ~/.kaggle

In [8]:
!cp kaggle.json ~/.kaggle/

In [9]:
!ls ~/.kaggle

kaggle.json


In [10]:
!chmod 600 /root/.kaggle/kaggle.json

In [11]:
!kaggle datasets download -d bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows

Downloading ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip to /content
100% 7.30G/7.31G [00:55<00:00, 212MB/s]
100% 7.31G/7.31G [00:55<00:00, 142MB/s]


In [12]:
!ls

kaggle.json
sample_data
spark-3.0.3-bin-hadoop2.7
spark-3.0.3-bin-hadoop2.7.tgz
ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip


In [13]:
!unzip ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip

Archive:  ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip
  inflating: 0401_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0402_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0403_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0404_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0405_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0406_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0407_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0408_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0409_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0410_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0411_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0412_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0413_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0414_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0415_UkraineCombinedTweetsDeduped.csv.gzip  
  inflating: 0416_UkraineCombinedTweetsDeduped.csv.gzip  
  inflat

## Use the data

________________________________________________________________________

In [14]:
import numpy as np
import pandas as pd
import csv
import os
import warnings

In [15]:
filename = r"0401_UkraineCombinedTweetsDeduped.csv.gzip"
df = pd.read_csv(filename, compression='gzip', index_col=0,encoding='utf-8', quoting=csv.QUOTE_ALL)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
df[0:2].T

Unnamed: 0,0,1
userid,16882774,3205296069
username,Yaniela,gregffff
acctdesc,"Animal lover, supports those who fight injusti...",
location,Hawaii,
following,1158,122
followers,392,881
totaltweets,88366,99853
usercreatedts,2008-10-21 07:34:04.000000,2015-04-25 11:24:34.000000
tweetid,1509681950042198030,1509681950151348229
tweetcreatedts,2022-04-01 00:00:00.000000,2022-04-01 00:00:00.000000


In [17]:
type(df)

pandas.core.frame.DataFrame

In [18]:
df = df[['tweetid', 'text', 'hashtags','language']] #keep only 3 columns
df.head()

Unnamed: 0,tweetid,text,hashtags,language
0,1509681950042198030,⚡The Ukrainian Air Force would like to address...,[],en
1,1509681950151348229,Chernihiv oblast. Ukrainians welcome their lib...,"[{'text': 'russianinvasion', 'indices': [77, 9...",en
2,1509681950683926556,America 🇺🇸 is preparing for something worse th...,"[{'text': 'RussianUkrainianWar', 'indices': [7...",en
3,1509681951116046336,JUST IN: #Anonymous has hacked &amp; released ...,"[{'text': 'Anonymous', 'indices': [25, 35]}]",en
4,1509681951304990720,***PUBLIC MINT NOW LIVE***\n\nFor \n@billionai...,[],en


In [19]:
pd.options.display.max_colwidth = 1200
print(df[df['tweetid']==	1509681950042198030]["text"],df[df['tweetid']==	1509681950042198030]["hashtags"])

0    ⚡The Ukrainian Air Force would like to address misinformation published in multiple Western media outlets regarding the situation in the 🇺🇦 sky and support from our @NATO allies. \nMore in 🧵(1/16)\n#ProtectUАSky #StopRussia #UkraineUnderAttaсk
Name: text, dtype: object 0    []
Name: hashtags, dtype: object


As we can see in the previous cell, the tweet has the hashtag #StopRussia #UkraineUnderAttaсk BUT the dataframe (1st tuple) sais there are no hashtag in the tweet. We will try to recover form this.

In [20]:
from pyspark.sql.types import *

schema = StructType([StructField("tweetid", StringType(), True)\
                   ,StructField("text", StringType(), True)\
                   ,StructField("hashtags", StringType(), True)\
                   ,StructField("language", StringType(), True)])

#create spark dataframe using schema
df_spark = spark.createDataFrame(df,schema=schema)

In [21]:
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [22]:
df_spark.show()

+-------------------+--------------------+--------------------+--------+
|            tweetid|                text|            hashtags|language|
+-------------------+--------------------+--------------------+--------+
|1509681950042198030|⚡The Ukrainian Ai...|                  []|      en|
|1509681950151348229|Chernihiv oblast....|[{'text': 'russia...|      en|
|1509681950683926556|America 🇺🇸 is p...|[{'text': 'Russia...|      en|
|1509681951116046336|JUST IN: #Anonymo...|[{'text': 'Anonym...|      en|
|1509681951304990720|***PUBLIC MINT NO...|                  []|      en|
|1509681952000937999|The Amazing story...|[{'text': 'Russia...|      en|
|1509681952978210849|&amp;quot;How we ...|                  []|      en|
|1509681953053843466|India's purchase ...|[{'text': 'Russia...|      en|
|1509681953091457035|The most basic te...|[{'text': 'Ukrain...|      en|
|1509681953418711050|"The image that R...|[{'text': 'Putin'...|      en|
|1509681953418752008|#Russia’s Preside...|[{'text': '

## RDD TEXT SERVE PER PRENDERE GLI HASHTAG DAL TWEET IN QUANTO LA COLONNA HASHTAG DEL DATASET NON è ACCURATA

In [142]:
import nltk
from nltk import word_tokenize
import re
nltk.download('punkt') #to make it work

rdd = df_spark.rdd #to get the rdd from dataframe

rdd.take(10)
rdd=rdd.filter(lambda x: x[3] == "en") #filter out non english tweets


rdd_text = rdd.map(lambda x : (x[0], x[1])).map(lambda x:(x[0], word_tokenize(x[1]))) #id, word_tokenized

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## ESTRAGGO SOLO GLI HASHTAGS

In [143]:
def processHashtags(tweet):
  hashtags = []
  for i,word in enumerate(tweet):
    if word == '#':
      try:
        hashtags.append(re.sub(r'[^\w\s]','',tweet[i+1]).lower()) #remove punctuation with regexp and put them lowercase
      except:
        pass
  return hashtags

hashtags_per_tweet = rdd_text.map(lambda x: (x[0], processHashtags(x[1])))
hashtags_per_tweet.first()

('1509681950042198030', ['protectuаsky', 'stoprussia', 'ukraineunderattaсk'])

In [144]:
#reconvert the rdd to spark dataframe
deptColumns = ["tweet","hashtag"]
new_df_spark = hashtags_per_tweet.toDF(deptColumns)
new_df_spark.printSchema()
new_df_spark.show(truncate=False)

root
 |-- tweet: string (nullable = true)
 |-- hashtag: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tweet              |hashtag                                                                                                                                                                   |
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1509681950042198030|[protectuаsky, stoprussia, ukraineunderattaсk]                                                                                                                            |
|1509681950151348229|[russianinvasion, standwithukraine, ukraineunderattack, ukrainewillwin, putini

APRIORI

In [145]:
hashtag=new_df_spark.select('hashtag')

In [146]:
basket_file = hashtag.rdd.flatMap(list)

In [147]:
basket_file.take(10)

[['protectuаsky', 'stoprussia', 'ukraineunderattaсk'],
 ['russianinvasion',
  'standwithukraine',
  'ukraineunderattack',
  'ukrainewillwin',
  'putinisawarcriminal',
  'stopputin',
  'russianukrainianwar',
  'russiagohome',
  'россиясмотри',
  'нетвойне'],
 ['russianukrainianwar', 'china', 'taiwan'],
 ['anonymous', 'oprussia', 'ddosecrets'],
 ['nft', 'mint'],
 ['russia',
  'ukraine',
  'motivation',
  'netde',
  'edude',
  'delaware',
  'government',
  'usa'],
 ['ukraine', 'ukrainewar', 'russia', 'ukraineinvasion'],
 ['russian', 'moscow'],
 ['ukraine'],
 ['putin', 'medvedev', 'russia', 'ukraine']]

## Riduco dimensioni baskets per essere veloci

In [148]:
num_baskets = 500
basket_file = basket_file.take(num_baskets) 

In [149]:
basket_file = spark.sparkContext.parallelize(basket_file)

In [150]:
type(basket_file)

pyspark.rdd.RDD

In [151]:
basket_file.take(10)

[['protectuаsky', 'stoprussia', 'ukraineunderattaсk'],
 ['russianinvasion',
  'standwithukraine',
  'ukraineunderattack',
  'ukrainewillwin',
  'putinisawarcriminal',
  'stopputin',
  'russianukrainianwar',
  'russiagohome',
  'россиясмотри',
  'нетвойне'],
 ['russianukrainianwar', 'china', 'taiwan'],
 ['anonymous', 'oprussia', 'ddosecrets'],
 ['nft', 'mint'],
 ['russia',
  'ukraine',
  'motivation',
  'netde',
  'edude',
  'delaware',
  'government',
  'usa'],
 ['ukraine', 'ukrainewar', 'russia', 'ukraineinvasion'],
 ['russian', 'moscow'],
 ['ukraine'],
 ['putin', 'medvedev', 'russia', 'ukraine']]

_______________________________________________________________________________

In [152]:
#DA RIVEDERE
count = 250000 #obtaine doing count=basket_file.count() (too much time to repeat this all the time)
threshold = 2
print(count, threshold)

250000 2


Start apriori

In [153]:
#setp 1 calculate the freq of each item in the basket file
singleton=basket_file.flatMap(list).map(lambda item: (item,1)).reduceByKey(lambda a,b: a+b)

singleton.take(20)

[('ukraineunderattaсk', 27),
 ('russianinvasion', 20),
 ('standwithukraine', 51),
 ('ukrainewillwin', 22),
 ('stopputin', 26),
 ('russianukrainianwar', 29),
 ('russiagohome', 20),
 ('нетвойне', 20),
 ('taiwan', 1),
 ('anonymous', 11),
 ('ddosecrets', 2),
 ('netde', 1),
 ('delaware', 1),
 ('usa', 5),
 ('ukraineinvasion', 2),
 ('moscow', 4),
 ('putin', 32),
 ('medvedev', 1),
 ('russianarmy', 2),
 ('soviet', 2)]

## PHASE 2

In [154]:
#step 2: filter out all the non frequent singleton
freq_singleton=singleton.filter(lambda x: x[1]>=threshold)

freq_singleton.take(20)

[('ukraineunderattaсk', 27),
 ('russianinvasion', 20),
 ('standwithukraine', 51),
 ('ukrainewillwin', 22),
 ('stopputin', 26),
 ('russianukrainianwar', 29),
 ('russiagohome', 20),
 ('нетвойне', 20),
 ('anonymous', 11),
 ('ddosecrets', 2),
 ('usa', 5),
 ('ukraineinvasion', 2),
 ('moscow', 4),
 ('putin', 32),
 ('russianarmy', 2),
 ('soviet', 2),
 ('ukrainian', 31),
 ('kharkiv', 4),
 ('peace', 4),
 ('zelensky', 7)]

In [155]:
#step 3: now create all the possible pairs made up by frequen singletons

from itertools import combinations
pairs=list(combinations(freq_singleton.map(lambda x: x[0]).toLocalIterator(),2)) #without toLocalIterator error (reference https://stackoverflow.com/questions/32771737/convert-an-rdd-to-iterable-pyspark)

pairs[:10]

[('ukraineunderattaсk', 'russianinvasion'),
 ('ukraineunderattaсk', 'standwithukraine'),
 ('ukraineunderattaсk', 'ukrainewillwin'),
 ('ukraineunderattaсk', 'stopputin'),
 ('ukraineunderattaсk', 'russianukrainianwar'),
 ('ukraineunderattaсk', 'russiagohome'),
 ('ukraineunderattaсk', 'нетвойне'),
 ('ukraineunderattaсk', 'anonymous'),
 ('ukraineunderattaсk', 'ddosecrets'),
 ('ukraineunderattaсk', 'usa')]

### prima implementazione apriori phase 2 counting

In [156]:
#step 4 : filter out the non frequent pairs
pairs_freq_sing=basket_file.map(lambda x: list(combinations(x,2)))  #per ogni basket, creo ogni possiblie coppia

#flat and count and reduce
flat_pairs=singleton=pairs_freq_sing.flatMap(list).map(lambda item: (item,1)).reduceByKey(lambda a,b: a+b)
flat_pairs.take(5)

[(('protectuаsky', 'stoprussia'), 20),
 (('russianinvasion', 'standwithukraine'), 20),
 (('russianinvasion', 'ukrainewillwin'), 20),
 (('russianinvasion', 'stopputin'), 20),
 (('russianinvasion', 'russianukrainianwar'), 20)]

In [157]:
#step 5: finally filetr out the pairs < thresold
freq_pairs=flat_pairs.filter(lambda x: x[1]>=threshold)

freq_pairs.collect()

[(('protectuаsky', 'stoprussia'), 20),
 (('russianinvasion', 'standwithukraine'), 20),
 (('russianinvasion', 'ukrainewillwin'), 20),
 (('russianinvasion', 'stopputin'), 20),
 (('russianinvasion', 'russianukrainianwar'), 20),
 (('russianinvasion', 'russiagohome'), 20),
 (('russianinvasion', 'нетвойне'), 20),
 (('standwithukraine', 'ukrainewillwin'), 20),
 (('standwithukraine', 'stopputin'), 22),
 (('standwithukraine', 'russianukrainianwar'), 20),
 (('standwithukraine', 'russiagohome'), 20),
 (('standwithukraine', 'нетвойне'), 20),
 (('ukraineunderattack', 'putinisawarcriminal'), 20),
 (('ukraineunderattack', 'россиясмотри'), 20),
 (('ukrainewillwin', 'stopputin'), 20),
 (('ukrainewillwin', 'russianukrainianwar'), 20),
 (('ukrainewillwin', 'russiagohome'), 20),
 (('ukrainewillwin', 'нетвойне'), 20),
 (('putinisawarcriminal', 'россиясмотри'), 20),
 (('stopputin', 'russianukrainianwar'), 20),
 (('stopputin', 'russiagohome'), 20),
 (('stopputin', 'нетвойне'), 20),
 (('russianukrainianwar', 

### altra implementazione apriori secondo pass counting

In [158]:
#flatmap so I get all the prev computed pairs in only 1 list to be able to perform reduceByKey
flatted_couples = basket_file.map(lambda x: [(pair,1) for pair in pairs if set(pair).issubset(set(x))]).flatMap(lambda x: x).cache()

In [159]:
reduced_elements = flatted_couples.reduceByKey(lambda a, b: a + b)

In [160]:
freq_pairs = reduced_elements.filter(lambda x : x[1] >= threshold).cache()

In [161]:
freq_pairs.collect()

[(('protectuаsky', 'stoprussia'), 20),
 (('russianinvasion', 'standwithukraine'), 20),
 (('russianinvasion', 'ukrainewillwin'), 20),
 (('russianinvasion', 'stopputin'), 20),
 (('russianinvasion', 'russianukrainianwar'), 20),
 (('russianinvasion', 'russiagohome'), 20),
 (('russianinvasion', 'нетвойне'), 20),
 (('standwithukraine', 'ukrainewillwin'), 20),
 (('standwithukraine', 'stopputin'), 23),
 (('standwithukraine', 'russianukrainianwar'), 20),
 (('standwithukraine', 'russiagohome'), 20),
 (('standwithukraine', 'нетвойне'), 20),
 (('ukrainewillwin', 'stopputin'), 20),
 (('ukrainewillwin', 'russianukrainianwar'), 20),
 (('ukrainewillwin', 'russiagohome'), 20),
 (('ukrainewillwin', 'нетвойне'), 20),
 (('stopputin', 'russianukrainianwar'), 20),
 (('stopputin', 'russiagohome'), 20),
 (('stopputin', 'нетвойне'), 20),
 (('russianukrainianwar', 'russiagohome'), 20),
 (('russianukrainianwar', 'нетвойне'), 20),
 (('russiagohome', 'нетвойне'), 20),
 (('ukraineunderattack', 'putinisawarcriminal'

## PCY 

In [214]:
#first pass is the same of apriori
singleton = basket_file.flatMap(list).map(lambda item: (item,1)).reduceByKey(lambda a,b: a+b)


In [262]:
from sys import hash_info
#different step w.r.t. apriori, creating all the pairs (non from freq singleton) and hashing them
#creating hashtable, fixed size:

HASH_TABLE_SIZE = 50000
hash_table = np.zeros(HASH_TABLE_SIZE, dtype=int)

def hashing(pair_to_hash):
  return int(abs(hash(frozenset(pair_to_hash))) % HASH_TABLE_SIZE)


hashing(("stoprussia", "standwithukraine")) #it's equal to the reverse because of frozenset

42051

In [260]:
#create all possible pairs and hash them
from itertools import combinations
pairs_first_pass = list(combinations(singleton.map(lambda x: x[0]).toLocalIterator(),2))#creating all pairs

#pairs_first_rdd = spark.sparkContext.parallelize(pairs_first_pass)


#I dont like the for, use map reduce IF POSSIBLE
for element in pairs_first_pass:
  hash_table[hashing(element)] +=1

bitmap_freq = [hash_table[i]>=threshold for i in range(HASH_TABLE_SIZE)] #creo bitmap, scarto pairs non freq
bitmap_freq[:10]

[True, True, True, False, True, True, False, True, False, False]

## secondo pass PCY

In [263]:
#per ogni freq singleton, devo controllare che la sua coppia sia freq nella hashtable e poi contarlo
#dopo averli contati tutti riduco in base alla threshold
from itertools import combinations

freq_singleton = singleton.filter(lambda x: x[1]>=threshold)                           
pairs=list(combinations(freq_singleton.map(lambda x: x[0]).toLocalIterator(),2)) #formed from freq singletons

In [264]:
#CONVERT FREQ SINGELTONS IN LIST
freq_singleton_list = list(freq_singleton.map(lambda x: x).toLocalIterator())
#CONVERT PAIRS INTO RDD
pairs_rdd = spark.sparkContext.parallelize(pairs)

In [265]:
#candidate pairs = singleton freq and their pair is freq in the hash_table
candidate_pairs = pairs_rdd.filter(lambda x : bitmap_freq[hashing(x)] == True)
candidate_pairs_list = list(pairs_rdd.map(lambda x: x).toLocalIterator())

In [266]:
#check against threshold
flatted_couples = basket_file.map(lambda x: [(pair,1) for pair in candidate_pairs_list if set(pair).\
                                             issubset(set(x))]).flatMap(lambda x: x).cache()

reduced_elements = flatted_couples.reduceByKey(lambda a, b: a + b)
freq_pairs = reduced_elements.filter(lambda x : x[1] >= threshold).cache()
freq_pairs.collect()

[(('protectuаsky', 'stoprussia'), 20),
 (('russianinvasion', 'standwithukraine'), 20),
 (('russianinvasion', 'ukrainewillwin'), 20),
 (('russianinvasion', 'stopputin'), 20),
 (('russianinvasion', 'russianukrainianwar'), 20),
 (('russianinvasion', 'russiagohome'), 20),
 (('russianinvasion', 'нетвойне'), 20),
 (('standwithukraine', 'ukrainewillwin'), 20),
 (('standwithukraine', 'stopputin'), 23),
 (('standwithukraine', 'russianukrainianwar'), 20),
 (('standwithukraine', 'russiagohome'), 20),
 (('standwithukraine', 'нетвойне'), 20),
 (('ukrainewillwin', 'stopputin'), 20),
 (('ukrainewillwin', 'russianukrainianwar'), 20),
 (('ukrainewillwin', 'russiagohome'), 20),
 (('ukrainewillwin', 'нетвойне'), 20),
 (('stopputin', 'russianukrainianwar'), 20),
 (('stopputin', 'russiagohome'), 20),
 (('stopputin', 'нетвойне'), 20),
 (('russianukrainianwar', 'russiagohome'), 20),
 (('russianukrainianwar', 'нетвойне'), 20),
 (('russiagohome', 'нетвойне'), 20),
 (('ukraineunderattack', 'putinisawarcriminal'

In [256]:
mylist = ['apple', 'banana', 'cherry']
x = frozenset(mylist)
print(x)
y = set(["apple"])
print(y)
y.issubset(x)

hashing(x)

frozenset({'banana', 'apple', 'cherry'})
{'apple'}


29245

In [244]:
while True:
  pass

KeyboardInterrupt: ignored