# Test Analysis

In [1]:
import pandas as pd
import numpy as np
import json

from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# from deepmoji.sentence_tokenizer import SentenceTokenizer
# from deepmoji.model_def import deepmoji_emojis
# from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

import re
import urllib.parse

In [4]:
def dateparse(x):
    """Parse timestamp strings such as ``2020-12-20 01:02:03+0000`` into datetimes.

    Thin wrapper around ``pd.to_datetime`` with the fixed format
    ``"%Y-%m-%d %H:%M:%S%z"``; accepts whatever ``pd.to_datetime`` accepts
    (a scalar, a list, a Series) and returns the corresponding parsed object.
    """
    return pd.to_datetime(x, format="%Y-%m-%d %H:%M:%S%z")


# Load the tweets. The ID columns go through `str` so they are kept as text
# rather than being parsed as numbers.
# `read_csv(date_parser=...)` is deprecated since pandas 2.0, so 'created_at'
# is read as plain text and converted right after with the same format.
tweets = pd.read_csv("../data/sample_tweets.csv",  # path/to/tweets_file
                     # nrows = 400000,
                     converters={"user_id": str,
                                 "tweet_id": str},
                     lineterminator='\n')
tweets['created_at'] = dateparse(tweets['created_at'])

newsguard_scores = pd.read_csv("../data/newsguard_scores.csv",  # path/to/newsguard_scores_file
                               converters={"name": str},
                               lineterminator='\n')

# Convert the 'score' column to numeric; values that cannot be parsed become NaN
newsguard_scores['score'] = pd.to_numeric(newsguard_scores['score'], errors='coerce')

  tweets = pd.read_csv("../data/sample_tweets.csv", #path/to/tweets_file


In [5]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
tweets_copy = tweets.copy(deep=True)

## Preprocessing

In [8]:
# The three columns that can carry tweet text
text_columns = ['text', 'retweeted_text', 'quoted_text']

# Replace missing values with '' and make sure every value is a string
for column in text_columns:
    tweets_copy[column] = tweets_copy[column].fillna('').astype(str)

# One (tweet_id, all_text) frame per source column, all sharing the same column name
text_df, retweeted_text_df, quoted_text_df = (
    tweets_copy[['tweet_id', column]].rename(columns={column: 'all_text'})
    for column in text_columns
)

# Stack the three frames on top of each other (original row labels are kept,
# so a label can appear up to three times)
all_text_df = pd.concat([text_df, retweeted_text_df, quoted_text_df])

# Keep only the rows that actually contain some text
has_text = all_text_df['all_text'].notna() & all_text_df['all_text'].ne('')
all_text_df = all_text_df[has_text]


## Sentiment Analysis

### TextBlob

In [11]:
# Score every text in 'all_text' with TextBlob.
# Each TextBlob is built and analysed once per row; polarity and subjectivity
# are both read from that single sentiment result instead of running the
# analysis twice per text.
# NOTE(review): the rendered outputs show 0.0 for both scores on every visible
# (Italian) row — TextBlob's default analyzer looks English-oriented; confirm
# it is appropriate for this corpus.
textblob_sentiments = [TextBlob(tweet).sentiment for tweet in all_text_df['all_text']]
all_text_df['polarity'] = [sentiment.polarity for sentiment in textblob_sentiments]
all_text_df['subjectivity'] = [sentiment.subjectivity for sentiment in textblob_sentiments]
all_text_df


Unnamed: 0,tweet_id,all_text,polarity,subjectivity
5,1340466213240135682,@maxdantoni E questo è quello che dice il Brit...,0.0,0.0
7,1340466401111388162,🚫💉\n\n#vaccination #vaccinations #vaccinatie...,0.0,0.0
8,1340466489401503749,"Quanta confusione con i vaccini,ci sono gli ef...",0.0,0.0
11,1340466830633267205,"Covid-20, datemi un vaccino nel giro di un qua...",0.0,0.0
12,1340466917975461893,Quando stava per arrivare il vaccino ma lui va...,0.0,0.0
...,...,...,...,...
3999982,1375753714338828289,"Questa risposta da sola, all'interno delle FAQ...",0.0,0.0
3999995,1375753786413694978,NOVO MENE DAS VACINAS BRASILEIRAS https://t.co...,0.0,0.0
3999997,1375753788640931840,"""Brusaferro"":\nPerché in un'intervista al @Cor...",0.0,0.0
3999998,1375753790788362240,Draghi e Speranza motivano la scelta sulle scu...,0.0,0.0


Unnamed: 0,tweet_id,all_text,polarity,subjectivity
5,1340466213240135682,@maxdantoni E questo è quello che dice il Brit...,0.0,0.0
7,1340466401111388162,🚫💉\n\n#vaccination #vaccinations #vaccinatie...,0.0,0.0
8,1340466489401503749,"Quanta confusione con i vaccini,ci sono gli ef...",0.0,0.0
11,1340466830633267205,"Covid-20, datemi un vaccino nel giro di un qua...",0.0,0.0
12,1340466917975461893,Quando stava per arrivare il vaccino ma lui va...,0.0,0.0
...,...,...,...,...
3999982,1375753714338828289,"Questa risposta da sola, all'interno delle FAQ...",0.0,0.0
3999995,1375753786413694978,NOVO MENE DAS VACINAS BRASILEIRAS https://t.co...,0.0,0.0
3999997,1375753788640931840,"""Brusaferro"":\nPerché in un'intervista al @Cor...",0.0,0.0
3999998,1375753790788362240,Draghi e Speranza motivano la scelta sulle scu...,0.0,0.0


### Vader

In [12]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Compute the VADER scores for every text; each result is a dict of scores
sentiment_scores = all_text_df['all_text'].apply(sia.polarity_scores)

# Expand the dicts into one column per score. Building the frame once from the
# list of dicts avoids `.apply(pd.Series)`, which creates a Series per row and
# is very slow on millions of rows.
vader_df = pd.DataFrame(sentiment_scores.tolist(), index=sentiment_scores.index)

# Drop score columns left over from a previous run of this cell, so re-running
# it does not append duplicate columns.
all_text_df = pd.concat(
    [all_text_df.drop(columns=list(vader_df.columns), errors='ignore'), vader_df],
    axis=1,
)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\davis\AppData\Roaming\nltk_data...


In [14]:
all_text_df

Unnamed: 0,tweet_id,all_text,polarity,subjectivity,neg,neu,pos,compound
5,1340466213240135682,@maxdantoni E questo è quello che dice il Brit...,0.0,0.0,0.073,0.927,0.000,-0.4588
7,1340466401111388162,🚫💉\n\n#vaccination #vaccinations #vaccinatie...,0.0,0.0,0.000,1.000,0.000,0.0000
8,1340466489401503749,"Quanta confusione con i vaccini,ci sono gli ef...",0.0,0.0,0.000,1.000,0.000,0.0000
11,1340466830633267205,"Covid-20, datemi un vaccino nel giro di un qua...",0.0,0.0,0.000,1.000,0.000,0.0000
12,1340466917975461893,Quando stava per arrivare il vaccino ma lui va...,0.0,0.0,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...
3999982,1375753714338828289,"Questa risposta da sola, all'interno delle FAQ...",0.0,0.0,0.000,1.000,0.000,0.0000
3999995,1375753786413694978,NOVO MENE DAS VACINAS BRASILEIRAS https://t.co...,0.0,0.0,0.000,1.000,0.000,0.0000
3999997,1375753788640931840,"""Brusaferro"":\nPerché in un'intervista al @Cor...",0.0,0.0,0.000,0.936,0.064,0.3400
3999998,1375753790788362240,Draghi e Speranza motivano la scelta sulle scu...,0.0,0.0,0.000,1.000,0.000,0.0000


## Emotion Analysis

### DeepMoji

In [None]:
# Emoji prediction with DeepMoji for every row of all_text_df.
# NOTE(review): SentenceTokenizer, deepmoji_emojis, PRETRAINED_PATH and VOCAB_PATH
# are only mentioned in the commented-out deepmoji imports of the import cell, so
# this cell raises NameError on a fresh kernel until those imports are re-enabled.
maxlen = 180  # passed to both SentenceTokenizer and deepmoji_emojis below
batch_size = 32  # NOTE(review): assigned but never used in this cell

# Load the vocabulary (a JSON file) and build the tokenizer from it
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)

# Build the model from the pretrained path and print its summary
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_emojis(maxlen, PRETRAINED_PATH)
model.summary()

def deepmoji(texts):
    """Return, for each entry of `texts`, the index of its highest-scoring prediction.

    The texts are tokenized with the module-level tokenizer `st`, scored with the
    module-level `model`, and each row of the prediction is reduced with
    `np.argmax`.
    """
    # Only the first of the three values returned by the tokenizer is used
    tokenized, _, _ = st.tokenize_sentences(texts)
    prob = model.predict(tokenized)
    return [np.argmax(p) for p in prob]

# presumably each index identifies one emoji class of the pretrained model — TODO confirm
all_text_df['emoji'] = deepmoji(all_text_df['all_text'])

## Toxicity Analysis

## Misinformation Analysis

In [16]:
# Helper that extracts the domain of the first link in a list of links
def extract_domain(link_list):
    """Return the normalised host of the first URL in `link_list`, or None.

    Parameters
    ----------
    link_list : list or tuple of str
        URLs found in one text. Only the first one is inspected.

    Returns
    -------
    str or None
        The host of the first URL, lowercased, without port or credentials and
        without a leading ``www.``. None when `link_list` is empty, is not a
        list/tuple (e.g. NaN or None), or when the first URL has no host or
        cannot be parsed.
    """
    # Missing values (None, NaN) and empty lists carry no link at all
    if not isinstance(link_list, (list, tuple)) or not link_list:
        return None

    try:
        # `.hostname` is already lowercased and excludes port and user info
        host = urllib.parse.urlparse(link_list[0]).hostname
    except ValueError:
        # urlparse rejects malformed hosts such as an unbalanced '[' (invalid IPv6)
        return None

    if not host:
        return None

    # 'www.example.com' and 'example.com' should map to the same domain
    if host.startswith('www.'):
        host = host[len('www.'):]

    return host or None

# Collect every http(s) link found in each text (an empty list when there is none)
all_text_df['link'] = all_text_df['all_text'].str.findall(r'(https?://[^\s]+)')

# Reduce each list of links to the domain of its first link
all_text_df['domain'] = all_text_df['link'].map(extract_domain)

# Attach the NewsGuard row whose 'name' equals the tweet's domain; texts without
# a matching domain are kept, with empty 'name' and 'score'.
# NOTE(review): every domain visible in the rendered output is 't.co' and none of
# those rows got a score — the shortened links probably need to be expanded
# before this join can match anything; confirm.
all_text_df = pd.merge(
    all_text_df,
    newsguard_scores,
    how='left',
    left_on='domain',
    right_on='name',
)

all_text_df


Unnamed: 0,tweet_id,all_text,polarity,subjectivity,neg,neu,pos,compound,link,domain,name,score
0,1340466213240135682,@maxdantoni E questo è quello che dice il Brit...,0.0,0.0,0.073,0.927,0.000,-0.4588,[https://t.co/XF1BegpbKd],t.co,,
1,1340466401111388162,🚫💉\n\n#vaccination #vaccinations #vaccinatie...,0.0,0.0,0.000,1.000,0.000,0.0000,[https://t.co/2DVh4RmwVX],t.co,,
2,1340466489401503749,"Quanta confusione con i vaccini,ci sono gli ef...",0.0,0.0,0.000,1.000,0.000,0.0000,[],,,
3,1340466830633267205,"Covid-20, datemi un vaccino nel giro di un qua...",0.0,0.0,0.000,1.000,0.000,0.0000,[https://t.co/u2egYIILPp],t.co,,
4,1340466917975461893,Quando stava per arrivare il vaccino ma lui va...,0.0,0.0,0.000,1.000,0.000,0.0000,[https://t.co/G9lVkvCIC2],t.co,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4640199,1375753714338828289,"Questa risposta da sola, all'interno delle FAQ...",0.0,0.0,0.000,1.000,0.000,0.0000,"[https://t.co/luwO2GBCfc, https://t.co/30NbbTF...",t.co,,
4640200,1375753786413694978,NOVO MENE DAS VACINAS BRASILEIRAS https://t.co...,0.0,0.0,0.000,1.000,0.000,0.0000,[https://t.co/5OihXD939x],t.co,,
4640201,1375753788640931840,"""Brusaferro"":\nPerché in un'intervista al @Cor...",0.0,0.0,0.000,0.936,0.064,0.3400,[https://t.co/n4sfzBojMv],t.co,,
4640202,1375753790788362240,Draghi e Speranza motivano la scelta sulle scu...,0.0,0.0,0.000,1.000,0.000,0.0000,[],,,
