# Study of the usage of suspect hashtags

## Import

In [None]:
import pandas as pd
import json
import numpy as np
from itertools import repeat

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

from concurrent.futures import wait as futures_wait
from concurrent.futures.process import ProcessPoolExecutor

import importlib
import hashtag_util as ut

## Dataset preparation

In [None]:
# Read only the columns this study needs from the tweets dump.
tweetColumns = [
    'hashtags',
    'user_screen_name',
    'created_at',
    'user_created_at',
    'user_code_error',
]
df = pd.read_csv('../tweets.csv', usecols=tweetColumns, low_memory=False)

# Show the loaded frame as the cell output.
df

In [None]:
# Parse the tweet timestamp strings into datetimes; the expected layout is
# "<weekday abbr> <month abbr> <day> <time> <UTC offset> <year>".
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")

In [None]:
# Hashtag patterns under study. A later cell applies them with re.match, so
# they are treated as regular expressions there.
# NOTE(review): 'dittatura*.' and 'byoblu*.' end in '*.' (zero or more of the
# preceding letter, then any one character), whereas '.*disobbedisco' uses
# '.*' — confirm whether 'dittatura.*' / 'byoblu.*' was intended.
listHashtagsStudy = ['5g','billgates','dittatura*.','.*disobbedisco','nessunacorrelazione','byoblu*.']

In [None]:
# Reload hashtag_util so the current version of the module on disk is used.
importlib.reload(ut)

# Build the hashtag-usage table from the tweets and the studied patterns;
# the next cell groups this table by 'user'.
dfUse = ut.process_df_uses_hashtags(df,listHashtagsStudy)
dfUse

In [None]:
# One row per user: each column becomes True when any of that user's rows in
# dfUse is truthy for it.
dfUseHashtag = dfUse.groupby('user').any()

# First non-null account-creation date and error code per screen name.
# Grouping the whole tweets frame is expensive, so it is done once and only
# on the two columns that are actually read.
firstPerUser = df.groupby('user_screen_name')[['user_created_at', 'user_code_error']].first()

# Both assignments align on the index (the user name).
dfUseHashtag['user_created_at'] = pd.to_datetime(firstPerUser['user_created_at'],
                                                 format="%a %b %d %X %z %Y")
dfUseHashtag['user_error'] = firstPerUser['user_code_error']

In [None]:
# Pick up the current hashtag_util code before calling its helpers.
importlib.reload(ut)

# hashtagAND runs first; hashtagOR is then called once per threshold
# k = 1 .. len(listHashtagsStudy) - 1 with the column name 'OR<k>'.
dfUseHashtag = ut.hashtagAND(listHashtagsStudy, dfUseHashtag)
for threshold in range(1, len(listHashtagsStudy)):
    orColumn = 'OR' + str(threshold)
    dfUseHashtag = ut.hashtagOR(listHashtagsStudy, dfUseHashtag, orColumn, threshold)

In [None]:
# Display the per-user table after the AND / OR<i> helpers have been applied.
dfUseHashtag

## General study of the hashtags

In [None]:
# For every threshold, print how many users are flagged in the matching
# OR<threshold> column, followed by one indented line per flagged user.
for threshold in range(1, len(listHashtagsStudy)):
    flags = dfUseHashtag['OR' + str(threshold)]
    print("User that have used at least %d hashtags (%i)"%(threshold, sum(flags)))
    for user, used in flags.items():
        if used:
            print("\t%s"%user)


In [None]:
# Two side-by-side histograms of account creation dates: every account on the
# left, only the accounts flagged in 'OR1' on the right.
createdAt = dfUseHashtag['user_created_at']
usedAny = dfUseHashtag['OR1']

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("All account", "Account that used at least one hashtag"),
)

allAccounts = go.Histogram(x=createdAt, y=usedAny, nbinsx=100)
fig.add_trace(allAccounts, row=1, col=1)

flaggedAccounts = go.Histogram(x=createdAt[usedAny], y=usedAny[usedAny], nbinsx=100)
fig.add_trace(flaggedAccounts, row=1, col=2)

fig.update_layout(
    title="Relation between use of Hashtag and the date of creation of the account",
    xaxis_title='Date',
    yaxis_title='user count',
    showlegend=False,
)
fig.show()

In [None]:
# Two Yes/No pies: share of users flagged in 'AND' (left) and in 'OR1' (right).
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

names = ['Yes', 'No']
pieSpecs = (
    ('AND', "Uses of the hashtag respect all hastags (AND)"),
    ('OR1', "Uses of the hashtag respect all hastags (at least one)"),
)

for position, (flagColumn, pieTitle) in enumerate(pieSpecs, start=1):
    usedCount = sum(dfUseHashtag[flagColumn])
    # "No" is everyone who is not flagged in this column.
    values = [usedCount, len(dfUseHashtag) - usedCount]
    pie = go.Pie(labels=names, values=values, title=pieTitle, textposition='inside')
    fig.add_trace(pie, row=1, col=position)

fig.show()

In [None]:
# Build a small name/count frame for plotting: row 0 is the 'AND' count and
# rows 1.. hold the count of users flagged in each OR<i> column.
df1 = pd.DataFrame()
df1.loc[0,'name'] = 'AND'
df1.loc[0,'numHashtags'] = sum(dfUseHashtag['AND'])
for i in range(1,len(listHashtagsStudy)):
    df1.loc[i,'name'] = 'OR'+str(i) 
    df1.loc[i,'numHashtags'] = sum(dfUseHashtag['OR'+str(i)])


# First pie: AND plus every OR<i> count.
fig = px.pie(df1, values='numHashtags', names='name', title='Uses of suspicious hashtags')
fig.show()

# Second pie: row 0 is overwritten in place so that it now holds the users
# not flagged in 'OR1' (labelled 'None'); the OR<i> rows are unchanged.
df1.loc[0,'name'] = 'None'
df1.loc[0,'numHashtags'] = len(dfUseHashtag) - sum(dfUseHashtag['OR1'])

fig = px.pie(df1, values='numHashtags', names='name', title='Uses of suspicious hashtags')
fig.show()

In [None]:
# Count, for every studied hashtag pattern, the users flagged in its column.
# The frame is built from scratch so the cell does not depend on how many rows
# (or which columns) a previously defined df1 happens to have, and can be
# re-run in any order.
df1 = pd.DataFrame({
    'name': listHashtagsStudy,
    'numHashtags': [sum(dfUseHashtag[h]) for h in listHashtagsStudy],
})

# One slice per hashtag pattern, labelled with both percentage and raw count.
fig = px.pie(df1, values='numHashtags', names='name', title='Uses of suspicious hashtags')
fig.update_traces(textinfo='percent+value')
fig.show()

In [None]:
# Users whose stored error code is 63 (reported below as suspended) and who
# are flagged in 'OR1'.
hasCode63 = dfUseHashtag['user_error'] == 63
usedAny = dfUseHashtag['OR1'] == True
suspendedUsers = dfUseHashtag.loc[hasCode63 & usedAny]

print("%d users have used a suspicious hashtag and have been suspended" % len(suspendedUsers))
suspendedUsers

## Comparison with noVax users

In [None]:
# Load just the screen-name column of the noVax tweets and keep one entry per
# distinct screen name (the group keys of a groupby on that column).
dfNovax = pd.read_csv(
    '../tweets_novax.csv',
    usecols=['user_screen_name'],
    low_memory=False,
)
novaxGroups = dfNovax.groupby('user_screen_name').first()
user_novax = novaxGroups.index.tolist()

In [None]:
print("Number of account noVax (in dataframe noVax) that uses at least i hashtags")
# One line per threshold i. The range is derived from listHashtagsStudy so it
# always matches the OR<i> columns created earlier in the notebook.
for i in range(1, len(listHashtagsStudy)):
    usersAtLeastI = dfUseHashtag[dfUseHashtag["OR%d" % i]]
    # Subset of those users that also appear in the noVax user list.
    dfUseHashtagNovax = usersAtLeastI[usersAtLeastI.index.isin(user_novax)]
    if len(usersAtLeastI) == 0:
        # Nobody reached this threshold, so the percentage is undefined.
        print("\ti =",i,":\t","n/a (no account used at least %d hashtags)"%i)
        continue
    print("\ti =",i,":\t",(len(dfUseHashtagNovax) / len(usersAtLeastI))*100,"%")

In [None]:
# Build the table used by the date plot below; the next cell reads its
# 'hashtag' column.
dfUseByDate = ut.process_dfUse(df)
dfUseByDate

In [None]:
import re

# Distinct hashtags from dfUseByDate whose beginning matches at least one of
# the studied patterns (re.match anchors at the start of the string only).
l = list({
    tag
    for tag in dfUseByDate['hashtag'].unique()
    for pattern in listHashtagsStudy
    if re.match(pattern, tag)
})

In [None]:
# pd.DataFrame(l).groupby(0).count() turns the matched hashtag list into a
# frame indexed by hashtag; it is passed to the helper together with the
# usage table.
# NOTE(review): the meaning of the trailing 50 is defined in hashtag_util — confirm.
ut.visual_by_date_together(pd.DataFrame(l).groupby(0).count(),dfUseByDate,50)

# Save as HTML

In [None]:
%%javascript
// Ask the kernel to run a Python assignment that stores this notebook's
// file name in the variable nb_name.
// NOTE(review): relies on the front end exposing the global IPython.notebook
// object; confirm it exists in the environment used.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
import os
import subprocess

# Convert this notebook to HTML with nbconvert. nb_name is expected to have
# been set in the kernel by the %%javascript cell above.
try:
    # Pass an argument list (no shell) so a notebook name containing spaces or
    # shell metacharacters reaches nbconvert verbatim.
    converted = subprocess.run(["jupyter", "nbconvert", nb_name, "--to", "html"]).returncode == 0
except OSError:
    # The jupyter executable could not be started (e.g. not on PATH).
    converted = False

if converted:
    print("Notebook converted correctly")
else:
    print("Notebook convertion had an error")