In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sys
from tqdm.notebook import tqdm

# Manual switch that selects which of the two hard-coded root folders is used.
windows_os = False

# Root folder under which the per-user directories are looked up.
base_path = "C:/Users" if windows_os else r"/media/gianlucanogara/Windows/Users"

# Folder holding the project-local helper module; appended to sys.path so it can be imported.
path_utils = f"{base_path}/gianluca.nogara/Desktop/Repo/Vaccines_Discussion_Italy/tweet_utils"
sys.path.append(path_utils)
import utils
import warnings
import glob
warnings.filterwarnings("ignore")

In [None]:
# Read the complete tweet table from the project folder under base_path.
# lineterminator="\n" makes the parser break rows on "\n" only.
tweets_csv_path = base_path + r"/gianluca.nogara/Desktop/Repo/Vaccines_Discussion_Italy/Italian/files/tweets/tweets.csv"
tweets = pd.read_csv(tweets_csv_path, encoding='utf-8', low_memory=False, lineterminator="\n")
tweets

In [None]:
# Toxicity scores are spread over several "per*.csv" exports: read each file once
# and stack them with a single concat (concatenating inside the loop re-copies the
# accumulated frame on every iteration).
toxicity_files = glob.glob(base_path + r"/gianluca.nogara/switchdrive/Project VaccinItaly/Sentiment/per*.csv")
toxicity_parts = [
    pd.read_csv(file, low_memory=False, encoding='utf-8', lineterminator='\n')
    for file in toxicity_files
]
# pd.concat rejects an empty list, so keep the empty frame the loop used to produce
# when no file matches the pattern.
toxicity = pd.concat(toxicity_parts) if toxicity_parts else pd.DataFrame()

# Companion table that is joined to the scores on the "text" column in the next step.
df_clean = pd.read_csv(base_path + "/gianluca.nogara/switchdrive/Project VaccinItaly/Sentiment/df_cleaned.csv",
                       lineterminator="\n", low_memory=False, encoding="utf-8")


In [None]:
# Attach df_clean to every scored row by matching on the tweet text, then keep only
# the score columns plus "id" (presumably supplied by df_clean - confirm).
# NOTE(review): not re-runnable as is - "text" is dropped from `toxicity` here, so a
# second execution has no "text" key left to merge on.
score_columns = ["SEVERE_TOXICITY", "LIKELY_TO_REJECT", "INSULT", "PROFANITY", "THREAT", "id"]
toxicity = pd.merge(toxicity, df_clean, how="left", on="text")[score_columns]

In [None]:
# Left-join the tweet metadata onto the scored rows through the tweet id.
tweets_toxic = pd.merge(toxicity, tweets, on="id", how="left")
tweets_toxic

In [None]:
# Convert every created_at value with the project helper; tqdm only reports progress.
dates = [utils.process_datetime(created) for created in tqdm(tweets_toxic.created_at)]

In [None]:
tweets_toxic['date'] = dates
# A single value_counts() pass yields both the distinct dates and the rows per date.
rows_per_day = tweets_toxic.date.value_counts()
count = list(rows_per_day.values)
date_count = list(rows_per_day.keys())

In [None]:
# Two-column lookup table: date -> number of rows on that date.
df_count = pd.DataFrame({"date": date_count, "count_status": count})

In [None]:
# Broadcast the per-day row count ("count_status") back onto every tweet row.
# NOTE(review): re-running this cell merges "count_status" a second time.
tweets_toxic = pd.merge(tweets_toxic, df_count, how="left", on="date")

In [None]:
# Despite its name, the "dates" column holds the per-day median of SEVERE_TOXICITY,
# repeated on every row that belongs to that day.
tweets_toxic['dates'] = tweets_toxic.groupby('date')['SEVERE_TOXICITY'].transform('median')
# Plotting table: the first row of each day, ordered by date.
res_for_plot = tweets_toxic.drop_duplicates(subset='date').sort_values(by=["date"])

In [None]:
# "rolling_avg" is a rolling *median* over 14 consecutive rows (one row per day that
# has data) of the daily median toxicity.
plot_columns = ["date", "dates", "count_status"]
data = res_for_plot[plot_columns].assign(
    rolling_avg=lambda frame: frame.dates.rolling(14).median()
)

In [None]:
# Same 14-row rolling median, applied to the daily tweet count.
data = data.assign(count_rolling_avg=data.count_status.rolling(14).median())

In [None]:
# Build one tick label per plotted day from the raw created_at text, which in the
# printed frames looks like "Mon Jan 27 10:17:51 +0000 2020" -> "27 Jan '20".
labels = []
for created in tqdm(res_for_plot["created_at"]):
    parts = created.split(" ")
    labels.append(f"{parts[2]} {parts[1]} '{parts[5][2:]}")

In [None]:
data

In [None]:
np.sum(data["count_status"])

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation=0, fontsize=18)
plt.yticks(fontsize=18)

# Left axis: rolling median of the daily severe-toxicity median.
line_1 = ax.plot(data["date"], data["rolling_avg"], label="Median value of severe toxicity")
ax.set_xlabel('Days', fontsize=18)
ax.set_ylabel('Median toxicity', fontsize=18)

# Right axis sharing the same x: rolling median of the daily tweet count.
ax2 = ax.twinx()
line_2 = ax2.plot(data["date"], data["count_rolling_avg"], color="r", label="Number of tweets")
ax2.set_ylabel('Tweets count', fontsize=18)

# Gather the handles of both axes so one legend can describe the two curves.
lns = [*line_1, *line_2]
labs = [handle.get_label() for handle in lns]

# Vertical markers for the policy events.
# NOTE(review): several marker dates differ from the phase constants used for the
# split later in the notebook; the number in parentheses looks like the actual day
# of the event - confirm which date is intended.
color = "grey"
for event_day in (
    "2020-03-10",  # lockdown (phase 1, 9)
    "2020-05-04",  # easing of measures (phase 2, 4)
    "2020-06-15",  # coexistence (phase 3)
    "2020-08-11",  # new restrictions (10)
    "2020-09-06",  # schools reopen (6)
    "2020-11-06",  # curfew and colour-coded zones (6)
    "2020-12-21",  # AIFA approval of the vaccine and first administrations
    "2021-03-15",  # AZ suspension
):
    plt.axvline(x=event_day, color=color)

plt.yticks(fontsize=18)
# One tick per plotted day, labelled with the short day/month/year text built earlier.
pos = list(data["date"])
lab = labels
plt.xticks(pos, lab)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=28))
ax.get_xaxis().set_visible(True)
ax.legend(lns, labs, loc="upper right", prop={'size': 18})
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation=0, fontsize=18)
plt.yticks(fontsize=18)

# Left axis: rolling median of the daily toxicity median.
line_1 = ax.plot(data["date"], data["rolling_avg"], label="Median value of toxicity")
ax.set_xlabel('Days', fontsize=18)
ax.set_ylabel('Median toxicity', fontsize=18)

# Right axis sharing the same x: rolling median of the daily tweet count.
ax2 = ax.twinx()
line_2 = ax2.plot(data["date"], data["count_rolling_avg"], color="r", label="Number of tweets")
ax2.set_ylabel('Tweets count', fontsize=18)

# Gather the handles of both axes so one legend can describe the two curves.
lns = [*line_1, *line_2]
labs = [handle.get_label() for handle in lns]

# Vertical markers for the policy events (the number in parentheses looks like the
# actual day of the event - confirm).
color = "grey"
for event_day in (
    "2020-03-10",  # lockdown (phase 1, 9)
    "2020-05-04",  # easing of measures (phase 2, 4)
    "2020-06-15",  # coexistence (phase 3)
    "2020-08-11",  # new restrictions (10)
    "2020-09-06",  # schools reopen (6)
    "2020-11-06",  # curfew and colour-coded zones (6)
    "2020-12-21",  # AIFA approval of the vaccine and first administrations
    "2021-03-15",  # AZ suspension
):
    plt.axvline(x=event_day, color=color)

plt.yticks(fontsize=18)
pos = list(data["date"])
lab = labels
plt.xticks(pos, lab)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=25))
# This variant hides the x axis of the left axes altogether.
ax.get_xaxis().set_visible(False)
ax.legend(lns, labs, loc="upper right", prop={'size': 18})
plt.show()

In [None]:
# Check the correlations between the plotted series up to 10 June 2020.
# Only numeric columns are handed to corr(): `data` also carries the "date" column,
# and recent pandas versions raise on non-numeric columns instead of silently
# dropping them.
data[data["date"]<="2020-06-10"].select_dtypes(include="number").corr()

In [None]:
pos

In [None]:
# Phase boundaries used to split the tweets, written as zero-padded ISO dates.
# Zero-padding matters: these values are used as slice labels on the date index, and
# if that index holds date strings (as the printed frames suggest) the comparison is
# lexicographic, where "2020-03-9" would sort after "2020-03-31".
primi_casi = "2020-02-17"
fase_1 = "2020-03-09"
fase_2 = "2020-05-04"
fase_3 = "2020-06-15"
nuove_restizioni = "2020-08-12"
rientro_presenza = "2020-09-06"  # schools back in person
zone_colori = "2020-11-10"
vaccino = "2020-12-21"
az_interruzione = "2021-03-15"
allentamento_misure_gp = "2021-05-26"

In [None]:
tweets_toxic.head()

In [None]:
# Index the rows by their date (the "date" column is kept as well) and order them,
# so the frame can be cut with date-label slices.
tweets_toxic = tweets_toxic.set_index('date', drop=False).sort_index()
tweets_toxic.head()

In [None]:
# Cut the date-indexed frame into ten consecutive phases, one slice per pair of
# neighbouring boundaries (None = open start).
# NOTE(review): pandas label slices are closed on both ends, so a boundary day
# presumably lands in two adjacent phases - confirm that is intended.
phase_bounds = [None, primi_casi, fase_1, fase_2, fase_3, nuove_restizioni,
                rientro_presenza, zone_colori, vaccino, az_interruzione, allentamento_misure_gp]
lst = [tweets_toxic[start:stop] for start, stop in zip(phase_bounds[:-1], phase_bounds[1:])]
(inizio_primi_casi_df, primi_casi_fase_1_df, fase_1_fase_2_df, fase_2_fase_3_df,
 fase_3_restrizioni_df, restrizioni_presenza_df, presenza_colori_df, colori_vaccino_df,
 vaccino_az_df, az_allentamento_df) = lst
# Display names, in the same order as the frames in `lst`.
names = ["fino primi casi", "primi casi - fase 1", "fase 1 - fase 2", "fase 2 - fase 3", "fase 3 - restrizioni",
         "restrizioni - presenza", "presenza - colori", "colori - vaccino", "vaccino - az", "az - allentamenti"]

In [None]:
# Per phase: median SEVERE_TOXICITY (NaNs ignored) and number of rows.
for phase_name, phase_frame in zip(names, lst):
    print(f"{phase_name} - {np.nanmedian(phase_frame['SEVERE_TOXICITY'])}, len: {len(phase_frame)}")

In [None]:
# Same phase split as for the whole corpus, restricted to one account.
tweets_toxic_sas = tweets_toxic[tweets_toxic["user_screen_name"]=="TommyBrain"]

# Every phase, including the first one, is sliced from this user's rows. Previously
# the first slice was commented out while `lst` still listed inizio_primi_casi_df,
# so the first entry silently reused the all-users frame of an earlier cell.
phase_bounds = [None, primi_casi, fase_1, fase_2, fase_3, nuove_restizioni,
                rientro_presenza, zone_colori, vaccino, az_interruzione, allentamento_misure_gp]
lst = [tweets_toxic_sas[start:stop] for start, stop in zip(phase_bounds[:-1], phase_bounds[1:])]
(inizio_primi_casi_df, primi_casi_fase_1_df, fase_1_fase_2_df, fase_2_fase_3_df,
 fase_3_restrizioni_df, restrizioni_presenza_df, presenza_colori_df, colori_vaccino_df,
 vaccino_az_df, az_allentamento_df) = lst
# Display names, in the same order as the frames in `lst`.
names = ["fino primi casi", "primi casi - fase 1", "fase 1 - fase 2", "fase 2 - fase 3", "fase 3 - restrizioni",
         "restrizioni - presenza", "presenza - colori", "colori - vaccino", "vaccino - az", "az - allentamenti"]

In [None]:
# Same phase split as for the whole corpus, restricted to one account.
tweets_toxic_min = tweets_toxic[tweets_toxic["user_screen_name"]=="MinervaMcGrani1"]
phase_bounds = [None, primi_casi, fase_1, fase_2, fase_3, nuove_restizioni,
                rientro_presenza, zone_colori, vaccino, az_interruzione, allentamento_misure_gp]
lst = [tweets_toxic_min[start:stop] for start, stop in zip(phase_bounds[:-1], phase_bounds[1:])]
(inizio_primi_casi_df, primi_casi_fase_1_df, fase_1_fase_2_df, fase_2_fase_3_df,
 fase_3_restrizioni_df, restrizioni_presenza_df, presenza_colori_df, colori_vaccino_df,
 vaccino_az_df, az_allentamento_df) = lst
# Display names, in the same order as the frames in `lst`.
names = ["fino primi casi", "primi casi - fase 1", "fase 1 - fase 2", "fase 2 - fase 3", "fase 3 - restrizioni",
         "restrizioni - presenza", "presenza - colori", "colori - vaccino", "vaccino - az", "az - allentamenti"]

In [None]:
# for i in range(len(lst)):
#     x = utils.split(lst[i])
#     labels = list(x.keys())
#     values = list(x.values())
#     utils.print_pie_chart4(f"Attività {names[i]}", labels, values)
#     x = utils.extract_domain_list(lst[i])
#     urls = []
#     lst_not_plot = ["dlvr.it", "twitter.com", "tinyurl.com", "trib.al", "bit.ly", "ow.ly"]
#     cont = 0
#     for v in x:
#         if v != []:
#             for j in v:
#                 if (j not in lst_not_plot):
#                     if "youtu.be" in j:
#                         j = "youtube.com"
#                     urls.append(j)
#     val = pd.Series(urls).value_counts().sort_values(ascending=False)
#     fig = plt.figure()
#     ax = val[:15].plot(kind='barh', color='red')
#     ax.set_title(f'Most URLS shared during {names[i]}')
#     ax.invert_yaxis()
#     plt.show()

In [None]:
# Per phase: median SEVERE_TOXICITY (NaNs ignored) and number of rows, for whichever
# `lst` was built last.
for phase_name, phase_frame in zip(names, lst):
    print(f"{phase_name} - {np.nanmedian(phase_frame['SEVERE_TOXICITY'])}, len: {len(phase_frame)}")

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
plt.xticks(rotation=45, fontsize=15)
plt.yticks(fontsize=16)

# Left axis: rolling median of the daily toxicity median.
line_1 = ax.plot(data["date"], data["rolling_avg"], label="Median value of toxicity")
ax.set_xlabel('Days', fontsize=18)
ax.set_ylabel('Median toxicity', fontsize=18)

# Right axis sharing the same x: rolling median of the daily tweet count.
ax2 = ax.twinx()
line_2 = ax2.plot(data["date"], data["count_rolling_avg"], color="r", label="Number of tweets")
ax2.set_ylabel('Tweets count', fontsize=18)

# Gather the handles of both axes so one legend can describe the two curves.
lns = [*line_1, *line_2]
labs = [handle.get_label() for handle in lns]

# Vertical markers for the policy events (the number in parentheses looks like the
# actual day of the event - confirm).
color = "grey"
for event_day in (
    "2020-03-10",  # lockdown (phase 1, 9)
    "2020-05-04",  # easing of measures (phase 2, 4)
    "2020-06-15",  # coexistence (phase 3)
    "2020-08-11",  # new restrictions (10)
    "2020-09-06",  # schools reopen (6)
    "2020-11-06",  # curfew and colour-coded zones (6)
    "2020-12-21",  # AIFA approval of the vaccine and first administrations
    "2021-03-15",  # AZ suspension
):
    plt.axvline(x=event_day, color=color)

plt.yticks(fontsize=16)
pos = list(data["date"])
lab = labels
plt.xticks(pos, lab)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=14))
ax.legend(lns, labs, loc=2, prop={'size': 15})
plt.show()

In [None]:
def _daily_toxicity(user_frame):
    """Summarise one user's tweets per day.

    Returns a tuple ``(frame, per_day, series)``:
    - ``frame``: ``user_frame`` with a fresh integer index and a "dates" column
      holding the per-day median of SEVERE_TOXICITY;
    - ``per_day``: the first row of each day, ordered by date;
    - ``series``: the "date"/"dates" columns of ``per_day`` plus "rolling_avg",
      a rolling median over 14 consecutive rows.
    """
    # reset_index returns a new frame, so the column assignment below never writes
    # into a slice of tweets_toxic.
    frame = user_frame.reset_index(drop=True)
    frame['dates'] = frame.groupby('date')['SEVERE_TOXICITY'].transform('median')
    per_day = frame.drop_duplicates(subset='date').sort_values(by=["date"])
    series = per_day[["date", "dates"]].copy()
    series['rolling_avg'] = series.dates.rolling(14).median()
    return frame, per_day, series


# First user.
tt, res_for_plot, data = _daily_toxicity(tweets_toxic[tweets_toxic["user_screen_name"]=="TommyBrain"])
labels = []
for created in res_for_plot["created_at"]:
    parts = created.split(" ")
    labels.append(f"{parts[2]} {parts[1]} '{parts[5][2:]}")

# Second user, limited to the period covered by the first one. This series used to
# be taken from `data2`, which is only created by a later cell, so the plot failed
# on a fresh kernel; it is now computed here with the same steps.
second_user_rows = tweets_toxic[tweets_toxic["user_screen_name"]=="MinervaMcGrani1"]
second_user_rows = second_user_rows[second_user_rows["date"] <= tt["date"].iloc[-1]]
_, _, data2 = _daily_toxicity(second_user_rows)

fig, ax = plt.subplots(figsize=(25, 10))
sns.lineplot(x='date',
             y='rolling_avg',
             data=data,
             label='user_si',
             ax=ax)
sns.lineplot(x='date',
             y='rolling_avg',
             data=data2,
             label='user_no',
             ax=ax)

ax.set_xlabel('Days', fontsize=18)
ax.set_ylabel('Median toxicity', fontsize=18)
plt.yticks(fontsize = 18)

pos = list(data["date"])
lab = labels
plt.xticks(pos, lab)
plt.xticks(rotation=45, fontsize=18)

# Vertical markers for the policy events (the number in parentheses looks like the
# actual day of the event - confirm).
color = "grey"
for event_day in (
    "2020-03-10",  # lockdown (phase 1, 9)
    "2020-05-04",  # easing of measures (phase 2, 4)
    "2020-06-15",  # coexistence (phase 3)
    "2020-08-11",  # new restrictions (10)
    "2020-09-07",  # schools reopen (6)
    "2020-11-06",  # curfew and colour-coded zones (6)
    "2020-12-21",  # AIFA approval of the vaccine and first administrations
    "2021-03-15",  # AZ suspension
):
    plt.axvline(x=event_day, color=color)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=14))
plt.ylabel('Median toxicity')
# The legend texts given here replace the per-line labels passed to lineplot above.
plt.legend(labels=["User_1","User_2"],  bbox_to_anchor=(0.74,1), prop={'size': 18})
ax.get_xaxis().set_visible(False)
plt.show()

In [None]:
list(tt.tail(1)["date"])[0]

In [None]:
# Rows of the second account, limited to the period covered by `tt` (its last date).
second_user_rows = tweets_toxic[tweets_toxic["user_screen_name"]=="MinervaMcGrani1"]
last_day_first_user = tt["date"].iloc[-1]
tt2 = second_user_rows[second_user_rows["date"] <= last_day_first_user].reset_index(drop=True)

# "dates" = per-day median of SEVERE_TOXICITY; then keep one row per day, by date.
tt2['dates'] = tt2.groupby('date')['SEVERE_TOXICITY'].transform('median')
res_for_plot = tt2.drop_duplicates(subset='date').sort_values(by=["date"])

# Tick labels such as "27 Jan '20", taken from the pieces of the created_at text.
labels = [
    f"{parts[2]} {parts[1]} '{parts[5][2:]}"
    for parts in (created.split(" ") for created in res_for_plot["created_at"])
]

# "rolling_avg" is a rolling median over 14 consecutive rows of the daily median.
data2 = res_for_plot[["date", "dates"]].assign(
    rolling_avg=lambda frame: frame.dates.rolling(14).median()
)

fig, ax = plt.subplots(figsize=(25, 10))
sns.lineplot(x='date', y='rolling_avg', data=data2, color="red")

plt.xlabel('Days')

pos = list(data2["date"])
lab = labels
plt.xticks(pos, lab)
plt.xticks(rotation=45, fontsize=10)

# Vertical markers for the policy events (the number in parentheses looks like the
# actual day of the event - confirm).
color = "grey"
for event_day in (
    "2020-03-06",  # lockdown (phase 1, 9)
    "2020-05-04",  # easing of measures (phase 2, 4)
    "2020-06-15",  # coexistence (phase 3)
    "2020-08-11",  # new restrictions (10)
    "2020-09-07",  # schools reopen (6)
    "2020-11-05",  # curfew and colour-coded zones (6)
    "2020-12-21",  # AIFA approval of the vaccine and first administrations
    "2021-03-15",  # AZ suspension
):
    plt.axvline(x=event_day, color=color)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
plt.ylabel('Median toxicity')
plt.show()

In [114]:
# Read only the three needed columns of the tweet dump, 500000 rows at a time.
# The chunks are collected in a list and concatenated once: DataFrame.append copied
# the accumulated frame on every call and no longer exists in pandas 2.x.
# NOTE(review): this absolute Windows path differs from the base_path-based location
# used by the other cells - confirm which file is meant.
chunks = []
for chunk in pd.read_csv(r"C:\Users\gianl\Desktop\Gi\Supsi\Vaccines_Discussion_Italy\Italian\files\tweets\tweets.csv",
                         lineterminator="\n", encoding="utf-8", low_memory=False,
                         usecols=["id","rt_user_screen_name","user_screen_name"],
                         chunksize=500000):
    chunks.append(chunk)
    print("done")
# pd.concat rejects an empty list; keep the empty frame for an empty file.
df = pd.concat(chunks) if chunks else pd.DataFrame()

done
done
done
done
done
done
done
done
done


In [115]:
# Left-join the scored table `tox` onto the id/user columns read above.
# NOTE(review): `tox` is not created anywhere in the visible cells; judging by the
# printed result it carries text, toxicity, user_screen_name, date and count_status
# per tweet id - confirm where it comes from.
merged_df = pd.merge(df, tox, how="left", on="id")
merged_df

Unnamed: 0,id,user_screen_name_x,rt_user_screen_name,text,toxicity,user_screen_name_y,urls,hashtags,created_at,in_reply_to_screen_name,rt_created_at,quoted_status_id,in_reply_to_user_id,date,count_status
0,1221739084449832960,planetpaul65,isentinelli,La Memoria è l’unico vaccino contro l’indiffer...,0.006942,planetpaul65,[],"[{'text': 'GiornatadellaMemoria2020', 'indices...",Mon Jan 27 10:17:51 +0000 2020,,Mon Jan 27 08:15:15 +0000 2020,,,2020-01-27,238.0
1,1221737155380293640,Grace_1207,LidaSezOlbia,"Oggi sono arrivati loro, 3 femminucce e 1 masc...",0.503896,Grace_1207,[],[],Mon Jan 27 10:10:11 +0000 2020,,Sat Jan 25 21:19:30 +0000 2020,,,2020-01-27,286.0
2,1221730009989120001,MarisaMinervini,milio967,L'indifferenza è più colpevole della violenza ...,0.037881,MarisaMinervini,[],[],Mon Jan 27 09:41:47 +0000 2020,,Mon Jan 27 05:00:04 +0000 2020,,,2020-01-27,1.0
3,1221737654909382662,LecceSette,,"Minaccia morbillo nel Salento, Asl: ""Vaccinars...",0.065322,LecceSette,"[{'url': 'https://t.co/CY0qiQFk6s', 'expanded_...",[],Mon Jan 27 10:12:10 +0000 2020,,,,,2020-01-27,90.0
4,1221728980203724801,arual812,Annalisa3073,"""Coltivare la Memoria è ancora oggi un vaccino...",0.004234,arual812,[],[],Mon Jan 27 09:37:42 +0000 2020,,Mon Jan 27 07:45:58 +0000 2020,,,2020-01-27,616.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4053196,1394609996197937159,CiaoGrosso,borghi_claudio,Il medico testimonial pro vaccino a Ariachetir...,0.043032,CiaoGrosso,[],[],Tue May 18 11:05:08 +0000 2021,,Tue May 18 10:45:35 +0000 2021,,,2021-05-18,506.0
4053197,1394610015823147011,Rosskitty77,,"""...«Ha lividi e piastrine basse». E ha richie...",0.045092,Rosskitty77,"[{'url': 'https://t.co/5UT12c2sur', 'expanded_...",[],Tue May 18 11:05:13 +0000 2021,,,1.394602e+18,,2021-05-18,1095.0
4053198,1394609887641022464,GuidoIafrate,,"Vaccinazioni nei luoghi di lavoro, fondamental...",0.007983,GuidoIafrate,"[{'url': 'https://t.co/D3j3phCsiC', 'expanded_...",[],Tue May 18 11:04:42 +0000 2021,,,,,2021-05-18,3.0
4053199,1394609899963830275,lucabattanta,valy_s,A lariachetira c’è un MEDICO DI BASE che ha ap...,0.317855,lucabattanta,[],"[{'text': 'lariachetira', 'indices': [14, 27]}]",Tue May 18 11:04:45 +0000 2021,,Tue May 18 10:36:02 +0000 2021,,,2021-05-18,3025.0


In [197]:
# Rows whose rt_user_screen_name is one of the two accounts of interest.
retweeted_account = merged_df["rt_user_screen_name"]
rt_minerva = merged_df[retweeted_account == "MinervaMcGrani1"]
rt_tommy = merged_df[retweeted_account == "TommyBrain"]

In [123]:
# Replace count_status with the number of rows per date, then keep a single row
# (the last one) for each date.
# NOTE(review): not re-runnable - a second execution counts the already
# de-duplicated rows.
rt_minerva = rt_minerva.assign(
    count_status=rt_minerva.groupby('date')['date'].transform('count')
).drop_duplicates(subset=["date"], keep="last")
rt_tommy = rt_tommy.assign(
    count_status=rt_tommy.groupby('date')['date'].transform('count')
).drop_duplicates(subset=["date"], keep="last")

In [149]:
np.nanmean(rt_tommy["count_status"])

1.489795918367347

In [150]:
np.nanmean(rt_minerva["count_status"])

80.15151515151516

In [141]:
# Print every date of `data` that is found among the known dates.
# NOTE(review): row[1] is the "date" value of `data` itself, so the membership test
# is always true as written; presumably another frame (e.g. data2) was meant - confirm.
known_dates = set(data["date"])  # built once instead of rebuilding a list per row
for row in data.itertuples():
    if row[1] in known_dates:
        print(row[1])

2020-01-03
2020-01-08
2020-01-13
2020-01-15
2020-01-18
2020-01-19
2020-01-20
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-31
2020-02-02
2020-02-03
2020-02-05
2020-02-06
2020-02-10
2020-02-11
2020-02-21
2020-02-22
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-03-01
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-10
2020-03-11
2020-03-12
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-21
2020-03-22
2020-03-24
2020-03-25
2020-03-26
2020-03-28
2020-03-29
2020-03-31
2020-04-02
2020-04-03
2020-04-07
2020-04-08
2020-04-09
2020-04-11
2020-04-12
2020-04-13
2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-20
2020-04-21
2020-04-25
2020-04-29
2020-04-30
2020-05-01
2020-05-02
2020-05-03
2020-05-04
2020-05-05
2020-05-06
2020-05-10
2020-05-12
2020-05-13
2020-05-14
2020-05-15
2020-05-16
2020-05-17
2020-05-18
2020-05-19
2020-05-20
2020-05-25
2020-05-26
2020-05-27
2020-05-30
2020-05-31
2020-06-14
2020-06-15
2020-06-17
2020-06-18
2020-06-20
2020-06-21
2020-06-22
2020-06-23

In [132]:
data2

Unnamed: 0,date,dates,rolling_avg
12775,2020-01-02,0.117249,
1986,2020-01-03,0.097633,
5536,2020-01-04,0.023975,
6379,2020-01-05,0.117297,
7481,2020-01-06,0.033074,
...,...,...,...
3884054,2021-05-17,0.099781,0.059529
3522741,2021-05-18,0.073713,0.059529
3554100,2021-05-20,0.057731,0.057531
3555707,2021-05-22,0.218925,0.059728


In [185]:
# tt2["count_status".groupby('date')['date'].transform('count')
np.nanmean(tt2["count_status"])

18.197791490743747

In [184]:
tt2[["date","count_status"]].sort_values(by=["date"],ascending=True)

Unnamed: 0,date,count_status
13002,2020-01-02,12
12895,2020-01-02,12
12881,2020-01-02,12
12872,2020-01-02,12
12775,2020-01-02,12
...,...,...
3561613,2021-05-23,8
3564471,2021-05-23,8
3564537,2021-05-23,8
3564838,2021-05-23,8


In [186]:
tt2[["toxicity"]]

104        0.150427
516        0.100855
566        0.157432
743        0.149567
1618       0.039942
             ...   
3899658    0.019714
3899699    0.259045
3899712    0.139697
3899749    0.073713
3899763    0.010135
Name: toxicity, Length: 3079, dtype: float64

In [198]:
rt_minerva

Unnamed: 0,id,user_screen_name_x,rt_user_screen_name,text,toxicity,user_screen_name_y,urls,hashtags,created_at,in_reply_to_screen_name,rt_created_at,quoted_status_id,in_reply_to_user_id,date,count_status
497,1222298088662093824,piersar62,MinervaMcGrani1,In Toscana si taglia la guardia medica notturn...,0.100855,piersar62,[],[],Tue Jan 28 23:19:08 +0000 2020,,Tue Jan 28 23:16:42 +0000 2020,1.222156e+18,,2020-01-28,1211.0
516,1222302972983508992,Peter_Italy,MinervaMcGrani1,In Toscana si taglia la guardia medica notturn...,0.100855,Peter_Italy,[],[],Tue Jan 28 23:38:32 +0000 2020,,Tue Jan 28 23:16:42 +0000 2020,1.222156e+18,,2020-01-28,474.0
542,1222304567339241473,IMoresi,MinervaMcGrani1,In Toscana si taglia la guardia medica notturn...,0.100855,IMoresi,[],[],Tue Jan 28 23:44:52 +0000 2020,,Tue Jan 28 23:16:42 +0000 2020,1.222156e+18,,2020-01-28,394.0
575,1222298456724779008,Graziel65255465,MinervaMcGrani1,In Toscana si taglia la guardia medica notturn...,0.100855,Graziel65255465,[],[],Tue Jan 28 23:20:35 +0000 2020,,Tue Jan 28 23:16:42 +0000 2020,1.222156e+18,,2020-01-28,1409.0
581,1222297930947858434,AncillottoL,MinervaMcGrani1,In Toscana si taglia la guardia medica notturn...,0.100855,AncillottoL,[],[],Tue Jan 28 23:18:30 +0000 2020,,Tue Jan 28 23:16:42 +0000 2020,1.222156e+18,,2020-01-28,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4052584,1394604748297363456,SemperAdamantes,MinervaMcGrani1,valy_s tommasomatic L’anno scorso non c’erano ...,0.259045,SemperAdamantes,[],[],Tue May 18 10:44:17 +0000 2021,,Mon May 17 23:45:52 +0000 2021,,,2021-05-18,1664.0
4052593,1394605005295038466,RinoRizzardi,MinervaMcGrani1,Lo sapete che le spese della sperimentazione ...,0.099781,RinoRizzardi,"[{'url': 'https://t.co/aW5w6pVMBv', 'expanded_...",[],Tue May 18 10:45:18 +0000 2021,,Mon May 17 18:57:57 +0000 2021,,,2021-05-18,177.0
4052761,1394606089342181380,MercPat,MinervaMcGrani1,Lo sapete che le spese della sperimentazione ...,0.099781,MercPat,"[{'url': 'https://t.co/aW5w6pVMBv', 'expanded_...",[],Tue May 18 10:49:36 +0000 2021,,Mon May 17 18:57:57 +0000 2021,,,2021-05-18,777.0
4052897,1394607696679157761,Loestestest,MinervaMcGrani1,valy_s tommasomatic L’anno scorso non c’erano ...,0.259045,Loestestest,[],[],Tue May 18 10:56:00 +0000 2021,,Mon May 17 23:45:52 +0000 2021,,,2021-05-18,742.0


In [200]:
data2

Unnamed: 0,date,dates,rolling_avg
12775,2020-01-02,0.117249,
1986,2020-01-03,0.097633,
5536,2020-01-04,0.023975,
6379,2020-01-05,0.117297,
7481,2020-01-06,0.033074,
...,...,...,...
3884054,2021-05-17,0.099781,0.059529
3522741,2021-05-18,0.073713,0.059529
3554100,2021-05-20,0.057731,0.057531
3555707,2021-05-22,0.218925,0.059728


In [196]:
# Recompute the rows-per-date count and show it next to the date, oldest first.
rt_minerva = rt_minerva.assign(count_status=rt_minerva.groupby('date')['date'].transform('count'))
rt_minerva[["date", "count_status"]].sort_values("date")

Unnamed: 0,date,count_status
13869,2020-01-02,89.0
13342,2020-01-02,89.0
13363,2020-01-02,89.0
13398,2020-01-02,89.0
13422,2020-01-02,89.0
...,...,...
3737612,,
3738032,,
3752277,,
3828950,,


In [None]:
import pandas as pd

In [None]:
# Rebuild tweets_toxic from the files (same steps as the loading cells at the top of
# the notebook).
tweets = pd.read_csv(
    base_path + r"/gianluca.nogara/Desktop/Repo/Vaccines_Discussion_Italy/Italian/files/tweets/tweets.csv",
    lineterminator="\n", low_memory=False, encoding='utf-8')

# Read every "per*.csv" score export and stack them with a single concat
# (concatenating inside the loop re-copies the accumulated frame on every iteration).
toxicity_files = glob.glob(base_path + r"/gianluca.nogara/switchdrive/Project VaccinItaly/Sentiment/per*.csv")
toxicity_parts = [
    pd.read_csv(file, low_memory=False, encoding='utf-8', lineterminator='\n')
    for file in toxicity_files
]
# pd.concat rejects an empty list, so keep the empty frame the loop used to produce.
toxicity = pd.concat(toxicity_parts) if toxicity_parts else pd.DataFrame()
df_clean = pd.read_csv(base_path + "/gianluca.nogara/switchdrive/Project VaccinItaly/Sentiment/df_cleaned.csv",
                       lineterminator="\n", low_memory=False, encoding="utf-8")

# Match scores to df_clean on the text, keep the score columns plus the id, then
# left-join the tweet metadata through that id.
toxicity = toxicity.merge(df_clean, on="text", how="left")
toxicity = toxicity[["SEVERE_TOXICITY", "LIKELY_TO_REJECT", "INSULT", "PROFANITY", "THREAT", "id"]]
tweets_toxic = toxicity.merge(tweets, how="left", on="id")
tweets_toxic

In [None]:
# Replies that are not quote tweets: a reply target is set and no quoted status is attached.
has_reply_target = tweets_toxic["in_reply_to_user_id"].notna()
has_quoted_status = tweets_toxic["quoted_status_id"].notna()
reply = tweets_toxic[has_reply_target & ~has_quoted_status]
reply

In [None]:
# Number of such replies per author, most active first.
reply["user_screen_name"].value_counts()