# Hashtag study - Novax users

## Import

In [None]:
import pandas as pd
import json
import numpy as np
from itertools import repeat

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

from concurrent.futures import wait as futures_wait
from concurrent.futures.process import ProcessPoolExecutor

import importlib
import hashtag_util as ut
import sys
sys.path.insert(0, '../')
import general_utils as gen_ut

## Dataset preparation

In [None]:
# Load only the three columns this notebook uses from tweets_novax.csv.
tweet_columns = ['user_screen_name', 'hashtags', 'created_at']
df = pd.read_csv('../tweets_novax.csv', usecols=tweet_columns, low_memory=False)

In [None]:
# Convert 'created_at' from text to datetimes. The expected layout is:
# abbreviated weekday, abbreviated month, day, time, UTC offset, year.
created_at_format = "%a %b %d %X %z %Y"
df['created_at'] = pd.to_datetime(df['created_at'], format=created_at_format)

In [None]:
# Build a table of every hashtag with the number of times it is used.

# Flatten the per-tweet hashtag texts into one list. A nested comprehension
# is used so no throwaway list of None values is built just for its side effect.
listHashtags = [
    tag
    for s in df['hashtags']
    for tag in gen_ut.get_string_json(s, 'text')
]

dfHashtags = pd.DataFrame({'hashtags': listHashtags})
# Placeholder column: groupby(...).count() below turns it into the per-hashtag
# number of occurrences.
dfHashtags['count'] = 0

dfHashtags = dfHashtags.groupby('hashtags').count()
# Most used hashtags first.
dfHashtags.sort_values(['count'], axis=0, inplace=True, ascending=False)

dfHashtags

In [None]:
# Reload hashtag_util so that edits made to the module since it was imported
# are picked up without restarting the kernel.
importlib.reload(ut)
# Build dfUse from the tweets with the project helper; its layout is defined
# in hashtag_util (not visible here).
dfUse = ut.process_dfUse(df)
dfUse

## General study

In [None]:
# Reload hashtag_util so the latest version of the plotting helpers is used.
importlib.reload(ut)
# Histogram of hashtag usage over the full hashtag table.
# NOTE(review): the meaning of 200 and 50 is defined by ut.visual_histogram — confirm there.
ut.visual_histogram(dfHashtags,200,50)

In [None]:
# Presumably plots hashtag usage over time on a single chart (see hashtag_util) — TODO confirm.
ut.visual_by_date_together(dfHashtags,dfUse)

In [None]:
# Presumably plots hashtag usage over time on separate charts (see hashtag_util) — TODO confirm.
ut.visual_by_date_split(dfHashtags,dfUse)

## Study without some hashtags

In [None]:
# Regex patterns of the generic vaccine/COVID hashtags to exclude. str.match
# anchors at the start of the hashtag only, so e.g. 'pfizer' also removes any
# hashtag that merely begins with "pfizer".
hastagRemove = ['vaccin.*','covid.*','corona.*','astrazeneca','pfizer','sarscov2','sputnikv','moderna']
# Filter a copy: a plain assignment would alias dfHashtags, and the in-place
# drop below would then also strip these rows from the full table (which later
# cells still use for overall totals) and make this cell non-repeatable.
dfHashtagFiltered = dfHashtags.copy()
for r in hastagRemove:
    # '== True' turns the match result into a strict boolean mask
    # (missing values become False).
    mask = dfHashtagFiltered.index.str.lower().str.match(r) == True
    dfHashtagFiltered.drop(dfHashtagFiltered[mask].index, inplace=True)

dfHashtagFiltered

In [None]:
# Same histogram helper as for the full table, applied to the filtered table.
# NOTE(review): the meaning of 100 and 50 is defined by ut.visual_histogram — confirm there.
ut.visual_histogram(dfHashtagFiltered,100,50)

In [None]:
# Usage-over-time view (single chart, presumably — see hashtag_util) for the filtered hashtags.
ut.visual_by_date_together(dfHashtagFiltered,dfUse)

In [None]:
# Usage-over-time view (separate charts, presumably — see hashtag_util) for the filtered hashtags.
ut.visual_by_date_split(dfHashtagFiltered,dfUse)

## Study without some hashtags and without political hashtags

In [None]:
# Start from a copy of the already-filtered table: a plain assignment would
# alias dfHashtagFiltered, so the in-place drop below would also remove the
# political hashtags from it (and from anything it aliases in turn).
dfMoreFiltered = dfHashtagFiltered.copy()
# Regex patterns of political hashtags to exclude. str.match anchors at the
# start of the hashtag only, hence the explicit leading '.*' where a match
# anywhere in the hashtag is wanted.
hastagRemove = ['.*lombardia.*','draghi','conte','m5s','mattarella','salvini','speranza','renzi','lega','.*governo.*',
           '.*moratti.*','zingaretti','scanzi','burioni','crisanti']
for r in hastagRemove:
    # '== True' turns the match result into a strict boolean mask
    # (missing values become False).
    mask = dfMoreFiltered.index.str.lower().str.match(r) == True
    dfMoreFiltered.drop(dfMoreFiltered[mask].index, inplace=True)

dfMoreFiltered

In [None]:
# Histogram helper applied to the table with political hashtags also removed.
# NOTE(review): the meaning of 100 and 50 is defined by ut.visual_histogram — confirm there.
ut.visual_histogram(dfMoreFiltered,100,50)

In [None]:
# Usage-over-time view (single chart, presumably — see hashtag_util) without the political hashtags.
ut.visual_by_date_together(dfMoreFiltered,dfUse)

In [None]:
# Usage-over-time view (separate charts, presumably — see hashtag_util) without the political hashtags.
ut.visual_by_date_split(dfMoreFiltered,dfUse)

## Use of 'suspect' hashtags

In [None]:
# Regex patterns of the hashtags treated as 'suspect'. They are matched with
# str.match against the lower-cased hashtag and are also used as row labels of
# dfSuspect.
# NOTE(review): 'dittatura*.' and 'byoblu*.' end in '*.' rather than '.*' —
# possibly a typo. As written they match "dittatur"/"byobl" followed by any
# single character; confirm the intended pattern before changing it.
listHashtagsStudy = ['5g','billgates','dittatura*.','disobbedisco','nessunacorrelazione','byoblu*.']

In [None]:
# One row per suspect pattern. 'count' is the summed usage of every hashtag in
# dfMoreFiltered whose lower-cased text matches the pattern from its start.
dfSuspect = pd.DataFrame(index=listHashtagsStudy)
lowered_tags = dfMoreFiltered.index.str.lower()
for pattern in listHashtagsStudy:
    is_match = lowered_tags.str.match(pattern) == True
    dfSuspect.loc[pattern, 'count'] = dfMoreFiltered.loc[is_match, 'count'].sum()
# Least used pattern first.
dfSuspect = dfSuspect.sort_values('count')

In [None]:
# Total number of hashtag uses currently held in dfHashtags, computed once and
# shared by both charts.
totalHashtagUses = dfHashtags['count'].sum()

# Chart 1: usage of each suspect pattern as a percentage of that total.
fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count']*100/totalHashtagUses, orientation='h')

# Titles: removed the stray ']' that followed the closing parenthesis.
fig.update_layout(title="Use of suspect hashtag (as a proportion of the total)")
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage percent")

fig.show()

# Chart 2: absolute usage of each suspect pattern, with the total in the title.
fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count'], orientation='h')

fig.update_layout(title="Use of suspect hashtag (total = %d)" % totalHashtagUses)
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage")

fig.show()

In [None]:
# Usage-over-time view (single chart, presumably — see hashtag_util) for the suspect patterns.
ut.visual_by_date_together(dfSuspect,dfUse)

In [None]:
# Build the helper's per-row table for the suspect patterns, then collapse it
# to one row per user: a value is True if it is True on any of that user's rows.
suspectPatterns = dfSuspect.index
dfUseSus = ut.process_df_uses_hashtags(df, suspectPatterns).groupby('user').any()

# Project helpers (semantics defined in hashtag_util): hashtagAND is applied
# once, then hashtagOR is called for each i with the label "OR<i>" —
# presumably adding a column of that name; TODO confirm.
dfUseSus = ut.hashtagAND(suspectPatterns, dfUseSus)

for threshold in range(1, len(listHashtagsStudy)):
    dfUseSus = ut.hashtagOR(suspectPatterns, dfUseSus, "OR%d" % threshold, threshold)

dfUseSus

In [None]:
# Report, for each i, the share of users whose "OR<i>" flag is set.
print("Number of account noVax (in dataframe noVax) that uses at least i hashtags")
# Derive the range from listHashtagsStudy instead of hard-coding 1..5, so it
# always matches the OR<i> labels generated from len(listHashtagsStudy).
totalAccounts = len(dfUseSus)
for i in range(1, len(listHashtagsStudy)):
    or_i = "OR%d" % i
    dfUseHashtagNovax = dfUseSus[dfUseSus[or_i]]
    # Guard against an empty table so the report does not fail with a
    # ZeroDivisionError.
    percent = (len(dfUseHashtagNovax) / totalAccounts) * 100 if totalAccounts else 0.0
    print("\ti =", i, ":\t", percent, "%")

# Save as HTML

In [None]:
%%javascript
// Ask the Python kernel to store this notebook's file name in the variable nb_name.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
import os
import subprocess

# Convert this notebook to HTML. The command is passed as an argument list
# (no shell), so a notebook name containing spaces or shell metacharacters is
# handed to nbconvert as a single, literal argument.
try:
    converted = subprocess.run(["jupyter", "nbconvert", nb_name, "--to", "html"]).returncode == 0
except OSError:
    # The jupyter executable could not be started; report it as a failed conversion.
    converted = False

if converted:
    print("Notebook converted correctly")
else:
    print("Notebook convertion had an error")