# Data Preparation

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import warnings
import powerlaw
import matplotlib.colors as mcolors
import ast
import time

# Notebook-wide settings: silence every warning and render floats without decimals.
warnings.simplefilter('ignore')
pd.set_option('display.float_format', '{:.0f}'.format)


## Import .csv files + (very) basic EDA and data preparation
In this section we:
* read the four CSV files (one per quarter)
* concatenate the four DataFrames into a single one
* drop the features that are not needed


In [4]:
# Load the four quarterly tweet exports. Cells containing the literal text '[]'
# (presumably an empty list in the export) are read as missing values.
_read_opts = {"na_filter": True, "na_values": '[]'}
df_primo = pd.read_csv("../data_collection/data/prima-parte.csv", **_read_opts)
df_secondo = pd.read_csv("../data_collection/data/SecondaParte.csv", **_read_opts)
df_terzo = pd.read_csv("../data_collection/data/terza-parte.csv", **_read_opts)
df_quarto = pd.read_csv("../data_collection/data/quarta_parte.csv", **_read_opts)

In [5]:
# Column overview (dtypes and non-null counts) of the first quarterly export.
df_primo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46327 entries, 0 to 46326
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       46327 non-null  int64  
 1   id               46327 non-null  int64  
 2   conversation_id  46327 non-null  int64  
 3   created_at       46327 non-null  object 
 4   date             46327 non-null  object 
 5   time             46327 non-null  object 
 6   timezone         46327 non-null  int64  
 7   user_id          46327 non-null  int64  
 8   username         46327 non-null  object 
 9   name             46322 non-null  object 
 10  place            267 non-null    object 
 11  tweet            46327 non-null  object 
 12  language         46327 non-null  object 
 13  mentions         9809 non-null   object 
 14  urls             12494 non-null  object 
 15  photos           7526 non-null   object 
 16  replies_count    46327 non-null  int64  
 17  retweets_cou

In [6]:
# Column overview (dtypes and non-null counts) of the second quarterly export.
df_secondo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346969 entries, 0 to 346968
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       346969 non-null  int64  
 1   id               346969 non-null  int64  
 2   conversation_id  346969 non-null  int64  
 3   created_at       346969 non-null  object 
 4   date             346969 non-null  object 
 5   time             346969 non-null  object 
 6   timezone         346969 non-null  int64  
 7   user_id          346969 non-null  int64  
 8   username         346969 non-null  object 
 9   name             346935 non-null  object 
 10  place            3396 non-null    object 
 11  tweet            346969 non-null  object 
 12  language         346969 non-null  object 
 13  mentions         74932 non-null   object 
 14  urls             94345 non-null   object 
 15  photos           65703 non-null   object 
 16  replies_count    346969 non-null  int6

In [7]:
# Column overview (dtypes and non-null counts) of the third quarterly export.
df_terzo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339622 entries, 0 to 339621
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       339622 non-null  int64  
 1   id               339622 non-null  int64  
 2   conversation_id  339622 non-null  int64  
 3   created_at       339622 non-null  object 
 4   date             339622 non-null  object 
 5   time             339622 non-null  object 
 6   timezone         339622 non-null  int64  
 7   user_id          339622 non-null  int64  
 8   username         339622 non-null  object 
 9   name             339585 non-null  object 
 10  place            1782 non-null    object 
 11  tweet            339622 non-null  object 
 12  language         339622 non-null  object 
 13  mentions         83997 non-null   object 
 14  urls             95047 non-null   object 
 15  photos           70553 non-null   object 
 16  replies_count    339622 non-null  int6

In [8]:
# Column overview (dtypes and non-null counts) of the fourth quarterly export.
df_quarto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528017 entries, 0 to 528016
Data columns (total 37 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       528017 non-null  int64  
 1   id               528017 non-null  int64  
 2   conversation_id  528017 non-null  int64  
 3   created_at       528017 non-null  object 
 4   date             528017 non-null  object 
 5   time             528017 non-null  object 
 6   timezone         528017 non-null  int64  
 7   user_id          528017 non-null  int64  
 8   username         528017 non-null  object 
 9   name             527967 non-null  object 
 10  place            1244 non-null    object 
 11  tweet            528017 non-null  object 
 12  language         528017 non-null  object 
 13  mentions         143075 non-null  object 
 14  urls             146605 non-null  object 
 15  photos           107944 non-null  object 
 16  replies_count    528017 non-null  int6

In [9]:
# The four quarterly frames, in chronological order, ready to be concatenated.
joinDF = [df_primo, df_secondo, df_terzo, df_quarto]

In [10]:
# Stack the four quarters vertically. Each frame keeps its own row labels,
# so index values repeat in `df` until the index is reset.
df = pd.concat(joinDF)


In [11]:
# Preview the concatenated frame.
df

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,2,1232453597595439104,1232453597595439104,2020-02-26 00:53:30 ora solare Europa occidentale,2020-02-26,00:53:30,200,543761210,mimitexasangel,BLM ~ Cannabis Food Heals,...,,,,,,,,,,
1,3,1232453306485657602,1232453306485657600,2020-02-26 00:52:20 ora solare Europa occidentale,2020-02-26,00:52:20,200,2739446098,susankbradford,Susie Bradford,...,,,,,,,,,,
2,4,1232452916134346754,1232452916134346752,2020-02-26 00:50:47 ora solare Europa occidentale,2020-02-26,00:50:47,200,386642768,harbuckscoffee,Mike Parks,...,,,,,,,,,,
3,5,1232452856629719040,1232452856629719040,2020-02-26 00:50:33 ora solare Europa occidentale,2020-02-26,00:50:33,200,4035330274,cote_christal,Christal Côté,...,,,,,,,,,,
4,6,1232450204093206528,1232450204093206528,2020-02-26 00:40:01 ora solare Europa occidentale,2020-02-26,00:40:01,200,532291506,xpressionable,Xpressionable,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528012,5746,1365814283544776705,1365814283544776704,2021-02-28 01:01:15 ora solare Europa occidentale,2021-02-28,01:01:15,200,603051379,supersoccerny,SuperSoccerStarsNY,...,,,,,,,,,,
528013,5747,1365814159246581761,1365814159246581760,2021-02-28 01:00:45 ora solare Europa occidentale,2021-02-28,01:00:45,200,717301826,smhistorymuseum,SM History Museum,...,,,,,,,,,,
528014,5748,1365814158520967171,1365814158520967168,2021-02-28 01:00:45 ora solare Europa occidentale,2021-02-28,01:00:45,200,2191413684,txsthonors,Texas State Honors,...,,,,,,,,,,
528015,5749,1365814155295473665,1365814155295473664,2021-02-28 01:00:44 ora solare Europa occidentale,2021-02-28,01:00:44,200,1217199734982692866,thegr8illusion,Phoenix Wise Ⓥ of the rock band The Resistance ✊🏽,...,,,,,,,,,,


In [12]:
# Remove the leftover CSV index column. `errors="ignore"` makes the cell safe to
# re-run: the previous `del` raised KeyError once the column was already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [13]:
# Column overview of the concatenated frame after removing the CSV index column.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260935 entries, 0 to 528016
Data columns (total 36 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   id               1260935 non-null  int64  
 1   conversation_id  1260935 non-null  int64  
 2   created_at       1260935 non-null  object 
 3   date             1260935 non-null  object 
 4   time             1260935 non-null  object 
 5   timezone         1260935 non-null  int64  
 6   user_id          1260935 non-null  int64  
 7   username         1260935 non-null  object 
 8   name             1260809 non-null  object 
 9   place            6689 non-null     object 
 10  tweet            1260935 non-null  object 
 11  language         1260935 non-null  object 
 12  mentions         311813 non-null   object 
 13  urls             348491 non-null   object 
 14  photos           251726 non-null   object 
 15  replies_count    1260935 non-null  int64  
 16  retweets_count   12

In [14]:
# Number of distinct authors in the full dataset (a missing id would count as one value).
df["user_id"].nunique(dropna=False)

394558

In [15]:
# Number of distinct conversations. `nunique()` gives the same count directly,
# without first computing the relative frequencies that were then discarded.
df["conversation_id"].nunique()

903397

In [16]:
# Discard the columns listed below; every other column is kept in `df_definitivo`.
_unused_columns = [
    "retweet_date", "translate", "trans_src", "trans_dest", "near", "geo",
    "source", "user_rt_id", "user_rt", "retweet_id", "place", "thumbnail",
    "video", "language", "photos", "created_at", "timezone", "cashtags",
    "urls", "retweet", "quote_url",
]
df_definitivo = df.drop(columns=_unused_columns)

In [17]:
# Replace the repeated per-quarter labels with a fresh 0..n-1 index;
# the old labels are kept in a new "index" column.
df_definitivo = df_definitivo.reset_index()

In [20]:
# Preview the reduced frame (the "index" column still holds the old per-quarter labels).
df_definitivo

Unnamed: 0,index,id,conversation_id,date,time,user_id,username,name,tweet,mentions,replies_count,retweets_count,likes_count,hashtags,link,reply_to
0,0,1232453597595439104,1232453597595439104,2020-02-26,00:53:30,543761210,mimitexasangel,BLM ~ Cannabis Food Heals,#Billionaires do not want to become #TaxPayers...,,0,0,2,"['billionaires', 'taxpayers', 'cannabis', 'god...",https://twitter.com/MimiTexasAngel/status/1232...,
1,1,1232453306485657602,1232453306485657600,2020-02-26,00:52:20,2739446098,susankbradford,Susie Bradford,The bigots in this admin are equal opportunity...,,0,0,1,"['amirite', 'whiteprivilege', 'whitesupremacy'...",https://twitter.com/SusanKBradford/status/1232...,
2,2,1232452916134346754,1232452916134346752,2020-02-26,00:50:47,386642768,harbuckscoffee,Mike Parks,C’mon @PeteButtigieg stop faking. You were sc...,"[{'screen_name': 'petebuttigieg', 'name': 'pet...",0,0,0,['blacklivesmatter'],https://twitter.com/HarbucksCoffee/status/1232...,
3,3,1232452856629719040,1232452856629719040,2020-02-26,00:50:33,4035330274,cote_christal,Christal Côté,"Dear Public of #Ontario, are these motions in ...",,1,2,2,"['ontario', 'lso', 'calltoaction', 'indigenous...",https://twitter.com/cote_christal/status/12324...,
4,4,1232450204093206528,1232450204093206528,2020-02-26,00:40:01,532291506,xpressionable,Xpressionable,Mum blames 'racism' after boy who pushed son t...,"[{'screen_name': 'africamustwake', 'name': 'po...",0,1,0,"['racism', 'crimesagainstblackchildren', 'blac...",https://twitter.com/Xpressionable/status/12324...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1260930,528012,1365814283544776705,1365814283544776704,2021-02-28,01:01:15,603051379,supersoccerny,SuperSoccerStarsNY,Celebrating Black History Month Coach Errol Sp...,,0,0,1,"['blackhistory', 'blackhistorymonth', 'blackli...",https://twitter.com/SuperSoccerNY/status/13658...,
1260931,528013,1365814159246581761,1365814159246581760,2021-02-28,01:00:45,717301826,smhistorymuseum,SM History Museum,Civil Rights protest in front of the Santa Mon...,,1,6,10,"['blackhistory', 'civilrights', 'blacklivesmat...",https://twitter.com/SMHistoryMuseum/status/136...,
1260932,528014,1365814158520967171,1365814158520967168,2021-02-28,01:00:45,2191413684,txsthonors,Texas State Honors,Honors celebrates alumni who have completed th...,,0,0,0,"['blacklivesmatter', 'txsthonors']",https://twitter.com/TXSTHonors/status/13658141...,
1260933,528015,1365814155295473665,1365814155295473664,2021-02-28,01:00:44,1217199734982692866,thegr8illusion,Phoenix Wise Ⓥ of the rock band The Resistance ✊🏽,pre-order new album on Bandcamp! $3.99 for 7 s...,,0,1,1,"['ahimsa', 'pacifism', 'progressive', 'lgbtq',...",https://twitter.com/TheGr8Illusion/status/1365...,


In [21]:
# The old per-quarter row labels are not needed any more.
df_definitivo = df_definitivo.drop(columns=["index"])

In [22]:
# Keep only the rows that have a value in every remaining column.
# NOTE(review): with no `subset`, this also discards replies that merely lack
# another field (e.g. `mentions` or `name`), not only tweets without `reply_to`
# — confirm this is intended.
df_definitivo = df_definitivo.dropna()

In [23]:
# Count tweets dated on or before 2020-05-25 (True) versus after it (False).
# The dates are ISO 'YYYY-MM-DD' strings, so plain string comparison orders them correctly.
_on_or_before_cutoff = df_definitivo["date"] <= "2020-05-25"
_on_or_before_cutoff.value_counts()

False    34558
True      1116
Name: date, dtype: int64

In [24]:
# Summary statistics of the retweet counts (printed without decimals because of
# the global float display format set at the top of the notebook).
df_definitivo["retweets_count"].describe()

count   35674
mean        1
std         8
min         0
25%         0
50%         0
75%         0
max       955
Name: retweets_count, dtype: float64

In [25]:
# Number of distinct authors left after the row filtering.
df_definitivo["user_id"].nunique(dropna=False)

12912

In [27]:
# Inspect the raw `reply_to` values: each one is the text form of a list of dicts.
df_definitivo["reply_to"]

100        [{'screen_name': 'MatthewJshow', 'name': 'Matt...
132        [{'screen_name': 'JohnLegere', 'name': 'John L...
157        [{'screen_name': 'moyazb', 'name': 'moyazb', '...
183        [{'screen_name': 'sharon_inouye', 'name': 'Sha...
200        [{'screen_name': 'daveanthony', 'name': 'Judge...
                                 ...                        
1260859    [{'screen_name': 'mikepfingston2', 'name': 'Pi...
1260873    [{'screen_name': 'EngTechnow', 'name': 'Sir  D...
1260882    [{'screen_name': 'TeamPelosi', 'name': 'Nancy ...
1260906    [{'screen_name': 'Wideopenspace5', 'name': 'So...
1260921    [{'screen_name': 'VP', 'name': 'Vice President...
Name: reply_to, Length: 35674, dtype: object

In [28]:
# Renumber the surviving rows 0..n-1; the previous labels move to an "index" column.
df_definitivo = df_definitivo.reset_index()

In [29]:
# Discard the column holding the pre-filtering row labels.
df_definitivo = df_definitivo.drop(columns=["index"])

## Network Preprocessing
* we preprocess the data to extract the relationships between users from the _reply_to_ feature
* then we build the edge list and assign a weight to each interaction

In [None]:
# Turn the text form of `reply_to` (a list of dicts) into real Python objects.
# `ast.literal_eval` only accepts literals, so, unlike the previous `eval`,
# text coming from the scraped CSV cannot execute arbitrary code.
df_definitivo['reply_to'] = df_definitivo['reply_to'].map(ast.literal_eval)

In [34]:
# Accumulator for the (source, id_source, target, id_target) edge tuples.
tupla = list()

In [35]:
# Build the reply edge list: one (source, id_source, target, id_target) tuple for
# every account listed in a tweet's `reply_to`.
# The list is rebuilt from scratch on each run, so re-executing this cell no longer
# appends duplicate edges (which would inflate the weights computed later).
# Iterating over the column values also removes the dependency on a 0..n-1 index.
tupla = [
    (username, user_id, dizionario['screen_name'], dizionario["id"])
    for username, user_id, risposte in zip(
        df_definitivo['username'], df_definitivo["user_id"], df_definitivo['reply_to']
    )
    for dizionario in risposte
]

df_def = pd.DataFrame(tupla, columns = ["source", "id_source","target", "id_target"])

In [36]:
# Preview the edge list (one row per reply interaction).
df_def

Unnamed: 0,source,id_source,target,id_target
0,fun_horse,1489567152,MatthewJshow,762932101
1,fun_horse,1489567152,WhiteHouse,1323730225067339784
2,fun_horse,1489567152,MIT,15460048
3,fun_horse,1489567152,NRA,21829541
4,fun_horse,1489567152,ODNIgov,56838279
...,...,...,...,...
115662,crystalclean757,988937151843794944,Blackamazon,18052474
115663,hanifactor,860145770653155330,VP,803694179079458816
115664,hanifactor,860145770653155330,NASA,11348282
115665,hanifactor,860145770653155330,USNavy,54885400


In [37]:
# Persist the edge list for the graph. The DataFrame index is written too,
# which is why an extra "Unnamed: 0" column appears when the file is read back.
df_def.to_csv("../data_collection/data/df_per_grafo.csv")

In [18]:
# Reload the saved edge list, so the cells below can run without rebuilding it.
df_def = pd.read_csv("../data_collection/data/df_per_grafo.csv")


In [19]:
# Preview the reloaded edge list (note the extra "Unnamed: 0" column from the saved index).
df_def

Unnamed: 0.1,Unnamed: 0,source,id_source,target,id_target
0,0,fun_horse,1489567152,MatthewJshow,762932101
1,1,fun_horse,1489567152,WhiteHouse,1323730225067339784
2,2,fun_horse,1489567152,MIT,15460048
3,3,fun_horse,1489567152,NRA,21829541
4,4,fun_horse,1489567152,ODNIgov,56838279
...,...,...,...,...,...
115662,115662,crystalclean757,988937151843794944,Blackamazon,18052474
115663,115663,hanifactor,860145770653155330,VP,803694179079458816
115664,115664,hanifactor,860145770653155330,NASA,11348282
115665,115665,hanifactor,860145770653155330,USNavy,54885400


In [20]:
# Remove the index column that came back from the CSV. `errors="ignore"` keeps the
# cell re-runnable: the previous `del` raised KeyError when the column was already gone.
df_def = df_def.drop(columns=["Unnamed: 0"], errors="ignore")

In [23]:
# Count how many times each identical (source, id_source, target, id_target) row occurs;
# that count becomes the edge weight. Rows come out sorted by descending count.
_edge_counts = df_def.value_counts()
df_def_weighted = _edge_counts.to_frame("weights").reset_index()

In [27]:
# Preview the weighted edge list (weights are still raw interaction counts here).
df_def_weighted

Unnamed: 0,source,id_source,target,id_target,weights
0,motor_felipe,1236537956610887680,allanldsantos,52048790,155
1,pengologist,16548831,SeattlePD,25101704,100
2,info4u2know,47725408,thehill,1917731,99
3,soloyochapin,93071854,realDonaldTrump,25073877,83
4,info4u2know,47725408,realDonaldTrump,25073877,82
...,...,...,...,...,...
81058,mrsfosters4th,1177287673,CMcCabeWCPSS,1138145019024486400,1
81059,mrsesantamaria,337449425,blkeducator,170624419,1
81060,mrsefter,755904465765629952,vscouncil,2834551997,1
81061,mrsefter,755904465765629952,seydric7,150300335,1


* normalize the weights to the [0, 1] range (min-max scaling)

In [29]:
# Min-max scale the interaction counts: (w - min) / (max - min).
# NOTE(review): the smallest count maps to 0, so those edges end up with weight 0
# — confirm this is what the graph construction expects.
_counts = df_def_weighted["weights"]
_lowest = _counts.min()
_spread = _counts.max() - _lowest
df_def_weighted["weights"] = (_counts - _lowest) / _spread

In [35]:
# Persist the weighted edge list (the DataFrame index is written as the first column).
df_def_weighted.to_csv("../data_collection/data/df_per_grafo_pesato.csv")

## Data preprocessing for DCD

* we also prepare one edge list per quarter, with each row tagged with its quarter number


In [1]:
# Reload the four quarterly exports so this section starts from the raw files.
# Requires the import cell at the top of the notebook to have been run
# (the recorded NameError came from executing this cell on a fresh kernel).
_csv_options = dict(na_filter=True, na_values='[]')
df_primo = pd.read_csv("../data_collection/data/prima-parte.csv", **_csv_options)
df_secondo = pd.read_csv("../data_collection/data/SecondaParte.csv", **_csv_options)
df_terzo = pd.read_csv("../data_collection/data/terza-parte.csv", **_csv_options)
df_quarto = pd.read_csv("../data_collection/data/quarta_parte.csv", **_csv_options)

NameError: name 'pd' is not defined

In [8]:
# Remove the leftover CSV index column from each quarterly frame, in place.
# `errors="ignore"` keeps the cell re-runnable: the previous `del` statements
# raised KeyError when the column had already been removed.
for _frame in (df_primo, df_secondo, df_terzo, df_quarto):
    _frame.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

In [11]:
# Discard the same set of columns from every quarterly frame; all other columns are kept.
_dropped_columns = [
    "retweet_date", "translate", "trans_src", "trans_dest", "near", "geo",
    "source", "user_rt_id", "user_rt", "retweet_id", "place", "thumbnail",
    "video", "language", "photos", "created_at", "timezone", "cashtags",
    "urls", "retweet", "quote_url",
]
df_definitivo_1, df_definitivo_2, df_definitivo_3, df_definitivo_4 = (
    _frame.drop(_dropped_columns, axis=1)
    for _frame in (df_primo, df_secondo, df_terzo, df_quarto)
)

In [21]:
# Give each quarterly frame a fresh 0..n-1 index; the old labels move to an "index" column.
df_definitivo_1, df_definitivo_2, df_definitivo_3, df_definitivo_4 = (
    _frame.reset_index()
    for _frame in (df_definitivo_1, df_definitivo_2, df_definitivo_3, df_definitivo_4)
)

In [22]:
# Remove, in place, the "index" column holding the previous row labels.
for _frame in (df_definitivo_1, df_definitivo_2, df_definitivo_3, df_definitivo_4):
    _frame.drop(columns=["index"], inplace=True)


In [15]:
# Keep only the rows that have a value in every remaining column.
# NOTE(review): with no `subset`, rows lacking any other field are discarded too,
# not only tweets without `reply_to` — confirm this is intended.
# The surviving rows keep their original labels, so the index has gaps after this cell.
df_definitivo_1 = df_definitivo_1.dropna()
df_definitivo_2 = df_definitivo_2.dropna()
df_definitivo_3 = df_definitivo_3.dropna()
df_definitivo_4 = df_definitivo_4.dropna()

In [17]:
# Turn the text form of `reply_to` (a list of dicts) into real Python objects for
# every quarter. `ast.literal_eval` only accepts literals, so, unlike the previous
# `eval`, text coming from the scraped CSVs cannot execute arbitrary code.
for _frame in (df_definitivo_1, df_definitivo_2, df_definitivo_3, df_definitivo_4):
    _frame['reply_to'] = _frame['reply_to'].map(ast.literal_eval)

In [24]:
# One independent, empty edge accumulator per quarter.
lista1, lista2, lista3, lista4 = [], [], [], []

In [28]:
# Build the first-quarter edge list: one
# (sender, sender id, recipient, recipient id, quarter) tuple per `reply_to` entry.
# The column values are iterated directly instead of being looked up by label with
# `series[index]`: the frame's index has gaps after `dropna()`, so positional
# counters used as labels raise KeyError when the notebook is run top to bottom.
# Rebuilding the list here also avoids duplicate edges when the cell is re-run.
lista1 = [
    (username, user_id, dizionario['screen_name'], dizionario["id"], 1)
    for username, user_id, risposte in zip(
        df_definitivo_1['username'], df_definitivo_1["user_id"], df_definitivo_1['reply_to']
    )
    for dizionario in risposte
]

df_Q1 = pd.DataFrame(lista1, columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "quarter"])

In [30]:
def _reply_edges(frame, quarter):
    """Return one (sender, sender id, recipient, recipient id, quarter) tuple per `reply_to` entry.

    The column values are iterated directly rather than looked up by label, so the
    result does not depend on `frame` having a gap-free 0..n-1 index.
    """
    return [
        (username, user_id, dizionario['screen_name'], dizionario["id"], quarter)
        for username, user_id, risposte in zip(
            frame['username'], frame["user_id"], frame['reply_to']
        )
        for dizionario in risposte
    ]


_quarter_columns = ["username_mittente", "id_mittente","username_ricevente", "id_ricevente", "quarter"]

# Each list is rebuilt from scratch, so re-running the cell cannot duplicate edges.
lista2 = _reply_edges(df_definitivo_2, 2)
df_Q2 = pd.DataFrame(lista2, columns = _quarter_columns)

lista3 = _reply_edges(df_definitivo_3, 3)
df_Q3 = pd.DataFrame(lista3, columns = _quarter_columns)

lista4 = _reply_edges(df_definitivo_4, 4)
df_Q4 = pd.DataFrame(lista4, columns = _quarter_columns)

In [31]:
# For each quarter, count how often every identical edge row occurs;
# the count is stored in a "weights" column.
df_Q1_w, df_Q2_w, df_Q3_w, df_Q4_w = (
    _edges.value_counts().to_frame("weights").reset_index()
    for _edges in (df_Q1, df_Q2, df_Q3, df_Q4)
)

In [32]:
# Min-max scale the weights of each quarter independently: (w - min) / (max - min).
# NOTE(review): if all weights in a quarter are equal, the range is 0 and the
# division yields NaN — confirm this cannot happen with the real data.
for _edges in (df_Q1_w, df_Q2_w, df_Q3_w, df_Q4_w):
    _lowest = _edges["weights"].min()
    _spread = _edges["weights"].max() - _lowest
    _edges["weights"] = (_edges["weights"] - _lowest) / _spread

In [34]:
# Write one weighted edge list per quarter (the DataFrame index is written as well).
for _path, _edges in (
    ("../data_collection/data/df_Q1.csv", df_Q1_w),
    ("../data_collection/data/df_Q2.csv", df_Q2_w),
    ("../data_collection/data/df_Q3.csv", df_Q3_w),
    ("../data_collection/data/df_Q4.csv", df_Q4_w),
):
    _edges.to_csv(_path)