## Import

In [None]:
import pandas as pd
import glob
import os

from itertools import takewhile, dropwhile
from collections import Counter

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import importlib
import dataframes_util as ut

## General information definition

In [None]:
# Re-import the local helper module so that edits made to dataframes_util
# are picked up without restarting the kernel.
importlib.reload(ut)
# Prefix of the glob patterns used below to read the raw input CSV files.
datapath_get = "../../data"
# Prefix used further down when the merged tweets.csv is read back.
datapath_set = ".."

# Post dataframe

## Concatenate all dataframes

In [None]:
# Load the tweets.csv files found under the data root into one dataframe.
# NOTE(review): presumably read_all_csv concatenates every file matching the
# glob pattern -- confirm against dataframes_util.
dfPosts = ut.read_all_csv(datapath_get+"/**/tweets.csv")

In [None]:
# Print a summary of the merged posts dataframe (columns, dtypes, non-null counts).
dfPosts.info()

## Adding features to post dataframe

### Adding info about user error

In [None]:
# Attach to every post the error code recorded for its author. Posts whose
# author has no entry in the inspected "not found" files get NaN.
dfUsersErrors = ut.read_all_csv(datapath_get+"/**/not_found_users/users_not_found_inspected.csv")

# A single vectorised lookup replaces one full scan of dfPosts per user row.
# It also avoids iterrows(), which does not preserve column dtypes and may
# therefore alter the ids before they are compared.
#   * rows without an id are dropped: they could never match a post;
#   * keep='last' reproduces the loop's "last duplicate wins" behaviour.
codeByUser = (dfUsersErrors.dropna(subset=['id'])
                           .drop_duplicates(subset=['id'], keep='last')
                           .set_index('id')['code'])
dfPosts['user_code_error'] = dfPosts['user_id'].map(codeByUser)

In [None]:
# Display the posts dataframe to inspect the new user_code_error column.
dfPosts

### Adding info about self-retweet

In [None]:
# Flag posts whose author id equals the id of the retweeted user.
# Rows where either id is NaN compare as False.
dfPosts['is_self_rt'] = dfPosts['user_id'] == dfPosts['rt_user_id']

### Adding info about url credibility 

In [None]:
# Per-user credibility derived from the conspiracy score:
#     user_credibility = 1 - conspiracy_score
# Users without a score keep NaN.
#
# datapath_get has no trailing slash, so the separator is added explicitly;
# the previous concatenation produced "../../data../result_cred_...".
dfUsersCredibility = pd.read_csv(datapath_get+"/../result_cred_score_thresh_10_user.csv",
                                 usecols=['user_id','conspiracy_score'])

# Vectorised lookup instead of one scan of dfPosts per user. Both loaded
# columns are numeric, so iterrows() would hand back float64 rows and the
# 64-bit user ids could lose precision before being compared.
#   * rows without a user_id are dropped: they could never match a post;
#   * keep='last' reproduces the loop's "last duplicate wins" behaviour.
scoreByUser = (dfUsersCredibility.dropna(subset=['user_id'])
                                 .drop_duplicates(subset=['user_id'], keep='last')
                                 .set_index('user_id')['conspiracy_score'])
dfPosts['user_credibility'] = 1 - dfPosts['user_id'].map(scoreByUser)

In [None]:
# Display the posts dataframe to inspect the new user_credibility column.
dfPosts

# Users dataframe

In [None]:
# Load the "not found" user files and keep a single row per user id
# (the first occurrence wins).
allNotFoundUsers = ut.read_all_csv(datapath_get+"/**/not_found_users/users_not_found.csv")
dfUsers = allNotFoundUsers.drop_duplicates(subset=['id'])

In [None]:
# Print a summary of the de-duplicated users dataframe.
dfUsers.info()

## Control user

### Check that every user has an error code

In [None]:
# Run the user inspection on every entry directly under the data root and
# report the entries for which something was left unmatched.
# `datapath` was never defined in this notebook (NameError); the data root
# used everywhere else is `datapath_get`.
data_folders = glob.glob(datapath_get + "/**")
for folder in data_folders:
    # NOTE(review): presumably the users of `folder` that have no error
    # code -- confirm against ut.inspect_users.
    not_matched = ut.inspect_users(folder)
    if len(not_matched) > 0:
        print(not_matched)
    else:
        print(f"{folder} is ok")

### Check that the users in post.csv and user.csv are the same

# Write the complete dataframes to .csv files

In [None]:
# Persist the enriched posts dataframe (without the index) in the current
# working directory.
# NOTE(review): a later cell re-reads tweets.csv through a path built from
# datapath_set -- confirm that both locations refer to the same file.
dfPosts.to_csv('tweets.csv', index=False)

In [None]:
# Persist the de-duplicated users dataframe (without the index) in the
# current working directory.
dfUsers.to_csv('users.csv', index=False)

# Concatenate only specific files into a .csv

**Use example**

In [None]:
#concatBaseFileToFiles('tweets.csv',['data/2020-10/tweets.csv'])                           # One file
#concatBaseFileToFiles('tweets.csv',['data/2020-10/tweets.csv','data/2020-11/tweets.csv']) # Multiple file

# Create different groups

In [None]:
# Re-load the merged posts file. low_memory=False makes pandas read the file
# in one pass so that column dtypes are inferred consistently.
# datapath_set has no trailing slash, so the separator is added explicitly;
# the previous concatenation produced the path "..tweets.csv".
# NOTE(review): the cell above writes tweets.csv to the working directory --
# confirm that datapath_set points at the directory holding that file.
dfPosts = pd.read_csv(datapath_set+'/tweets.csv',low_memory=False)

## Info from retweet network

In [None]:
# Load the two group columns (high_cred, no_vax) of the retweet-network info file.
dfInfoRtNetwork = pd.read_csv('Group_study/retweet_network_info.csv',usecols=['high_cred','no_vax'])

### High credibility

In [None]:
# Posts written by the accounts listed in the high_cred column of the
# retweet-network info file.
# Columns of unequal length are padded with NaN when the CSV is read; the
# padding is dropped so that posts with a missing screen name are not
# selected by accident (isin treats NaN as matching NaN).
listHighRtNt = list(dfInfoRtNetwork['high_cred'].dropna())
# Test only the screen-name column instead of running isin() over every
# column of dfPosts and then discarding all results but one.
dfHighRtNt = dfPosts[dfPosts['user_screen_name'].isin(listHighRtNt)]

### Novax

In [None]:
# Posts written by the accounts listed in the no_vax column of the
# retweet-network info file.
# Columns of unequal length are padded with NaN when the CSV is read; the
# padding is dropped so that posts with a missing screen name are not
# selected by accident (isin treats NaN as matching NaN).
listNovaxRtNt = list(dfInfoRtNetwork['no_vax'].dropna())
# Test only the screen-name column instead of running isin() over every
# column of dfPosts and then discarding all results but one.
dfNovaxRtNt = dfPosts[dfPosts['user_screen_name'].isin(listNovaxRtNt)]

## Filtered by retweet

In [None]:
# Seed screen names for the no-vax group.
# A comma was missing after 'd_essere', so Python joined it with the next
# literal into the single bogus name 'd_essereanto_galli4' and both real
# accounts were silently lost.
listSuspect = ['IacobellisT','Piero42395724', 'TommyBrain','xenonian1', 'Z3r0Rules','Pietro_Otto',
               'MarySpes','manu_etoile','CarpaneseSilva1','il_brigante07','Sakurauchi_Hime','MinervaMcGrani1',
               'marchesaangeli','lucabattanta',
               'gael99','LPinicia','Mariang47614228','SoniaLaVera','valy_s','Samira1577','daniele19921','d_essere',
               'anto_galli4','LuigiF97101292','EureosCriss','vaniacavi',
               'markred17','RenzoCianchetti','12qbert','ManuQ24916888','bisagnino','thewaterflea',
               'BarbaraRaval','noitre32','intuslegens','pbecchi','miia_2018','ladyonorato','cris_cersei','RadioSavana',
               'lameduck1960','a_meluzzi','francescatotolo','Mr_Ozymandias','FmMosca','LaVeritaWeb','Bluefidel47',
               'fdragoni','byoblu','MinutemanItaly','TarroGiulio','NicolaPorro','25O319','sabrina__sf','liliaragnar',
               'FabioFranchi1','EliseiNicole']

In [None]:
# Seed screen names for the high-credibility group.
listHighCred = [
    'tagadala7', 'La7tv', 'Corriere', 'tg2rai', 'TgrRaiPuglia',
    'Linkiesta', 'ilfoglio_it', 'fanpage', 'LaStampa', 'RaiNews',
    'fattoquotidiano', 'TgrRaiVeneto', 'agorarai', 'TgrRaiSicilia', 'UnioneSarda',
    'SkyTG24', 'repubblica', 'TgrRai', 'Agenzia_Ansa', 'Tg3web',
    'TgrRaiToscana', 'Affaritaliani', 'TgrRaiTrentino', 'PiazzapulitaLA7', 'Adnkronos',
    'agerpres', 'RepubblicaTv', 'TgrRaiFVG', 'ilpost', 'Open_gol',
    'Radio1Rai', 'MediasetTgcom24', 'TgLa7', 'RaiStudio24', 'TgrRaiMolise',
    'sole24ore', 'Ticinonline', 'ricpuglisi', 'SimoneCosimi', 'giusmo1',
    'RobertoBurioni', 'rtl1025', 'messveneto', 'HuffPostItalia', 'Agenzia_Italia',
    'ItaliaViva', 'iltirreno', 'Agenzia_Dire', 'stebellentani', 'eziomauro',
    'QRepubblica', 'lucianocapone', 'robersperanza', 'vocedelpatriota', 'GiovaQuez',
    'Cartabellotta', 'istsupsan', 'christianrocca', 'Zeta_Luiss', 'radioanchio',
    '24Mattino', 'sottoinchiesta', 'riotta',
]

In [None]:
# Load the fake_high / fake_novax columns of the retweet-network info file;
# they are used below to remove accounts from the two groups.
# Note that this rebinds dfInfoRtNetwork, replacing the frame that held the
# high_cred / no_vax columns.
fakeColumns = ['fake_high','fake_novax']
dfInfoRtNetwork = pd.read_csv('Group_study/retweet_network_info.csv',usecols=fakeColumns)
listFakeHigh, listFakeNovax = (list(dfInfoRtNetwork[column]) for column in fakeColumns)

### High credibility

#### Retweeted by a high-credibility user

#### Retweets a high-credibility user

In [None]:
# Grow the high-credibility seed list with the accounts whose rt_rate towards
# the current list is at least 20. A single expansion round is performed.
# NOTE(review): presumably ut.retweet_a_suspect returns one row per account,
# indexed by screen name -- confirm against dataframes_util.
userHighCred = listHighCred
dfRetweet = pd.DataFrame()
for generation in range(1):
    dfRetweet = ut.retweet_a_suspect(dfPosts, userHighCred)
    frequentRetweeters = dfRetweet.loc[dfRetweet['rt_rate'] >= 20.0].index
    # Building a new list leaves listHighCred itself untouched.
    userHighCred = list(set(userHighCred + list(frequentRetweeters)))
dfRetweet

In [None]:
# Distribution (in percent) of the strictly positive retweet rates computed
# for the high-credibility seed list.
positiveRates = dfRetweet.loc[dfRetweet['rt_rate'] > 0, 'rt_rate']
fig = px.histogram(
    x=positiveRates,
    histnorm='percent',
    title="Retweet rate on follower (high credibility)",
)
fig.update_xaxes(title='retweet rate')
fig.show()

In [None]:
# Posts of the expanded high-credibility accounts, minus the accounts listed
# as fake high credibility by the retweet-network study.
# Only the screen-name column is tested; the previous version ran isin()
# over every column of the dataframe twice and kept one column each time.
isHighCredUser = dfPosts['user_screen_name'].isin(userHighCred)
isFakeHigh = dfPosts['user_screen_name'].isin(listFakeHigh)
dfHighRate = dfPosts[isHighCredUser & ~isFakeHigh]

### NoVax

#### Retweeted by a suspect
When a user classified as novax retweets another user, the retweeted user is automatically considered a novax as well.

#### Retweet a suspect

In [None]:
# Grow the no-vax seed list over two rounds: after each round, the accounts
# whose rt_rate towards the current list is at least 20 are added to it.
# NOTE(review): presumably ut.retweet_a_suspect returns one row per account,
# indexed by screen name -- confirm against dataframes_util.
userNovax = listSuspect
dfRetweet = pd.DataFrame()
for generation in range(2):
    print ("Generation ",generation, ": ",len(userNovax))
    dfRetweet = ut.retweet_a_suspect(dfPosts, userNovax)
    frequentRetweeters = dfRetweet.loc[dfRetweet['rt_rate'] >= 20.0].index
    # Building a new list leaves listSuspect itself untouched.
    userNovax = list(set(userNovax + list(frequentRetweeters)))
dfRetweet

In [None]:
# Distribution (in percent) of the strictly positive retweet rates computed
# for the no-vax seed list.
positiveRates = dfRetweet.loc[dfRetweet['rt_rate'] > 0, 'rt_rate']
fig = px.histogram(
    x=positiveRates,
    histnorm='percent',
    title="Suspect rate on follower (no vax)",
)
fig.update_xaxes(title='retweet rate')
fig.show()

In [None]:
# Posts of the expanded no-vax accounts, minus the accounts listed as fake
# no-vax by the retweet-network study.
# Only the screen-name column is tested; the previous version ran isin()
# over every column of the dataframe twice and kept one column each time.
isNovaxUser = dfPosts['user_screen_name'].isin(userNovax)
isFakeNovax = dfPosts['user_screen_name'].isin(listFakeNovax)
dfNovaxSus = dfPosts[isNovaxUser & ~isFakeNovax]

## Filtered by hashtag

In [None]:
# Hashtag patterns passed to ut.process_df_hashtags to select no-vax users.
# NOTE(review): 'dittatura*.' and 'byoblu*.' end in '*.'; if the helper treats
# these as regular expressions, '.*' was probably intended -- confirm against
# dataframes_util before changing.
hashtagsNoVax = ['5g','billgates','dittatura*.','disobbedisco','nessunacorrelazione','byoblu*.']

### NoVax

In [None]:
# Run the hashtag helper over all posts with the no-vax patterns and show
# the result.
# NOTE(review): presumably a collection of screen names (it is matched
# against user_screen_name in the next cell) -- confirm the return type.
userNoVaxHt = ut.process_df_hashtags(dfPosts,hashtagsNoVax)
userNoVaxHt

In [None]:
# Keep the posts whose user_screen_name is matched by userNoVaxHt.
# NOTE(review): isin() is evaluated over every column of dfPosts and only the
# user_screen_name result is used; if userNoVaxHt is a plain list-like this
# can be narrowed to dfPosts['user_screen_name'].isin(userNoVaxHt).
dfNovaxHt = dfPosts[dfPosts.isin(userNoVaxHt)['user_screen_name']]

## Filtered by URL

In [None]:
# One row per screen name holding that user's first non-null
# user_credibility value.
# The column is selected before aggregating: the previous version aggregated
# every column and then dropped all but one with a positional `axis`
# argument (`drop(labels, 1, ...)`), which pandas >= 2.0 rejects.
dfUrl = dfPosts.groupby('user_screen_name')[['user_credibility']].first()

In [None]:
# Distribution (in percent, 100 bins) of the strictly positive per-user
# credibility values. NaN never satisfies `> 0`, so missing values are
# excluded by the comparison itself.
credibility = dfUrl['user_credibility']
positiveCredibility = credibility[credibility > 0]
fig = px.histogram(
    x=positiveCredibility,
    nbins=100,
    histnorm='percent',
    title="Suspect rate on URL shared",
)
fig.show()

### NoVax

In [None]:
# Users whose URL-based credibility is known and below 0.3.
credibility = dfUrl['user_credibility']
hasLowCredibility = credibility.notna() & (credibility < 0.3)
dfUsernovaxURL = dfUrl[hasLowCredibility]
dfUsernovaxURL

#### Comparison of users found with retweets and with URLs

In [None]:
# Count in how many of the two no-vax user sets (URL-based, retweet-based)
# each screen name appears.
urlUsers = list(dfUsernovaxURL.index)
retweetUsers = list(dfNovaxSus['user_screen_name'].unique())
countSuspect = Counter(urlUsers + retweetUsers)

# most_common() sorts by decreasing count, so the names counted twice form
# the head of the ranking and everything after them is the remainder.
rankedSuspects = countSuspect.most_common()


def _seen_twice(entry):
    """Return True when the (name, count) pair has a count of exactly 2."""
    return entry[1] == 2


inTwoDf = dict(takewhile(_seen_twice, rankedSuspects))
inOneDf = dict(dropwhile(_seen_twice, rankedSuspects))

In [None]:
# Names from inOneDf that also appear in the URL-based set are counted
# twice here; report them as a percentage of the URL-based set.
c = Counter(list(inOneDf) + list(dfUsernovaxURL.index))
rankedUsers = c.most_common()
suspectInUrl = dict(takewhile(lambda pair: pair[1] == 2, rankedUsers)).keys()
len(suspectInUrl) / len(dfUsernovaxURL) * 100

### High credibility

In [None]:
# Users whose URL-based credibility is known and at most 0.2.
# NOTE(review): this selects LOW credibility values and is a subset of the
# no-vax selection (< 0.3); for a high-credibility group a lower bound
# would be expected -- confirm the intended threshold.
credibility = dfUrl['user_credibility']
isSelected = credibility.notna() & (credibility <= 0.2)
dfUserHighURL = dfUrl[isSelected]
dfUserHighURL

#### Comparison of users found with retweets and with URLs

In [None]:
# Count in how many of the two high-credibility user sets (URL-based,
# retweet-based) each screen name appears.
urlUsers = list(dfUserHighURL.index)
retweetUsers = list(dfHighRate['user_screen_name'].unique())
countUser = Counter(urlUsers + retweetUsers)

# most_common() sorts by decreasing count, so the names counted twice form
# the head of the ranking and everything after them is the remainder.
rankedUsers = countUser.most_common()


def _seen_twice(entry):
    """Return True when the (name, count) pair has a count of exactly 2."""
    return entry[1] == 2


inTwoDf = dict(takewhile(_seen_twice, rankedUsers))
inOneDf = dict(dropwhile(_seen_twice, rankedUsers))

# Percentage of all distinct names that were found by both methods.
len(inTwoDf) / len(countUser) * 100

In [None]:
# Names from inOneDf that also appear in the URL-based set are counted
# twice here; report them as a percentage of the URL-based set.
c = Counter(list(inOneDf.keys()) + list(dfUserHighURL.index))
userInUrl = dict(takewhile(lambda i: i[1] == 2, c.most_common())).keys()
# The denominator is the number of URL-based users, as in the no-vax
# comparison. The previous denominator, len(dfHighRate), is a number of
# posts and made the ratio meaningless.
(len(userInUrl) / len(dfUserHighURL)) * 100

## .CSV Creation 

### Novax

In [None]:
# Union of the retweet-expansion posts and the retweet-network posts for the
# no-vax group, keeping the first occurrence of every post id.
novaxParts = [dfNovaxSus, dfNovaxRtNt]
dfNovax = pd.concat(novaxParts).drop_duplicates(subset=['id'])
dfNovax

### High credibility

In [None]:
# Union of the retweet-expansion posts and the retweet-network posts for the
# high-credibility group, keeping the first occurrence of every post id.
highParts = [dfHighRate, dfHighRtNt]
dfHigh = pd.concat(highParts).drop_duplicates(subset=['id'])
dfHigh

### Other user

In [None]:
# Every post that belongs to neither the no-vax nor the high-credibility
# group, matched on post id.
idNovax = dfNovax.groupby('id').first().index
idHigh = dfHigh.groupby('id').first().index

postIds = dfPosts['id']
alreadyGrouped = postIds.isin(idNovax) | postIds.isin(idHigh)
dfOtherElem = dfPosts[~alreadyGrouped]

dfOtherElem

## Check that the dataframes are correctly created

In [None]:
# True when the three groups together contain exactly as many rows as
# dfPosts. A post id present in both dfHigh and dfNovax is counted twice on
# the right-hand side, so the check fails when the two groups overlap.
len(dfPosts) == len(dfHigh) + len(dfNovax) + len(dfOtherElem)

In [None]:
def _count_users(posts):
    """Return the number of distinct screen names that appear in *posts*."""
    return len(posts.groupby('user_screen_name').first().index)


# True when the distinct users of the three groups add up to the distinct
# users of the full posts dataframe.
_count_users(dfPosts) == _count_users(dfHigh) + _count_users(dfNovax) + _count_users(dfOtherElem)

### Write dataframe to .csv

In [None]:
# Write the posts that belong to neither group (without the index).
dfOtherElem.to_csv('tweets_otherUser.csv', index=False)

In [None]:
# Write the posts of the high-credibility group (without the index).
dfHigh.to_csv('tweets_highCredibility.csv', index=False)

In [None]:
# Write the posts of the no-vax group (without the index).
dfNovax.to_csv('tweets_novax.csv', index=False)

# Handling of doubly classified suspects

In [None]:
# Accounts that ended up in both expanded seed lists, in the order in which
# they appear in userNovax. A plain comprehension replaces the numpy-based
# selection: `np` is never imported in this notebook (NameError).
highCredLookup = set(userHighCred)
doubled_class = [user for user in userNovax if user in highCredLookup]

#Todo with networks
# Remove the doubly classified accounts from both groups.
# NOTE(review): the second frame was previously called dfProvax, a name that
# is never defined in this notebook; dfHigh is the high-credibility
# counterpart of dfNovax built above -- confirm this is the intended frame.
dfNovax = dfNovax[~dfNovax['user_screen_name'].isin(doubled_class)]
dfHigh = dfHigh[~dfHigh['user_screen_name'].isin(doubled_class)]

In [None]:
# Write the distinct screen names of each retweet-expanded group to its own
# one-column CSV file.
endorsementTargets = (
    (dfNovaxSus, 'user_novax_endorsment.csv'),
    (dfHighRate, 'user_high_endorsment.csv'),
)
for groupPosts, targetFile in endorsementTargets:
    groupUsers = groupPosts.groupby('user_screen_name').count().index
    pd.DataFrame(groupUsers).to_csv(targetFile,index=False)