# Importazione librerie e visualizzazione Dataset

In [36]:
import pandas as pd
import networkx as nx
from itertools import combinations
import matplotlib.pyplot as plt

In [37]:
#Lettura del dataset:
df_trump = pd.read_csv("/kaggle/input/us-election-2020-tweets/hashtag_donaldtrump.csv",lineterminator='\n')
df_biden = pd.read_csv("/kaggle/input/us-election-2020-tweets/hashtag_joebiden.csv",lineterminator='\n')

In [38]:
print(f"Tweet with Trump hashtag: {len(df_trump)}")
print(f"Tweet with Biden hashtag: {len(df_biden)}")

Tweet with Trump hashtag: 970919
Tweet with Biden hashtag: 776886


In [39]:
#Dataframe unito (eliminati i duplicati)
df_duplicated = pd.concat([df_trump,df_biden])
df = df_duplicated.drop_duplicates(subset="tweet")

print(f"Total tweets: {len(df_duplicated)}")
print(f"Total tweets: {len(df)}")

Total tweets: 1747805
Total tweets: 1507205


In [40]:
df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1.316529e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666500.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,...,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:01,1.316529e+18,"Usa 2020, Trump contro Facebook e Twitter: cop...",26.0,9.0,Social Mediaset,331617600.0,Tgcom24,MediasetTgcom24,Profilo ufficiale di Tgcom24: tutte le notizie...,...,1067661.0,,,,,,,,,2020-10-21 00:00:00.373216530
2,2020-10-15 00:00:02,1.316529e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",...,1185.0,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
3,2020-10-15 00:00:02,1.316529e+18,2 hours since last tweet from #Trump! Maybe he...,0.0,0.0,Trumpytweeter,8.283556e+17,Trumpytweeter,trumpytweeter,"If he doesn't tweet for some time, should we b...",...,32.0,,,,,,,,,2020-10-21 00:00:01.119649591
4,2020-10-15 00:00:08,1.316529e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413800.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",...,5393.0,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121


In [41]:
df.tail()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
776880,2020-11-08 23:59:37,1.325589e+18,Hypocrite!\n\n#Biden \n#Covid_19 https://t.co/...,2.0,0.0,Twitter Web App,9.583685e+17,van Lith de Jeude,LithJeude,"Stop this crazy and altruistic theory of ""We m...",...,541.0,Venus,,,,,,,,2020-11-09 18:32:45.743523
776882,2020-11-08 23:59:38,1.325589e+18,Ωχ ελπίζω να μη μας βγει σαν τους οπαδούς του...,0.0,0.0,Twitter for Android,403281900.0,οχι άλλο κάρβουνο 🇬🇷🗣🗣🗣,anapodoi,ακραία καιρικά φαινόμενα... ζήσαμε και απόψε,...,772.0,,,,,,,,,2020-11-09 18:32:45.947617
776883,2020-11-08 23:59:41,1.325589e+18,L'OTAN va sortir de sa léthargie et redevenir ...,48.0,14.0,Twitter for Android,7.819183e+17,🇫🇷 Alt-Droite (matricule 6921) ✝️ 🇬🇷 🇮🇹 🇦🇲,CtrlAltDroite,Fils de mineur. Libertés - Identité - Solidari...,...,15806.0,France,46.603354,1.888334,,France,Europe,,,2020-11-09 18:32:45.627335
776884,2020-11-08 23:59:52,1.325589e+18,🌎\n\n“#congiuntifuoriregione”\n\n‘Sono felice ...,1.0,1.0,Twitter for iPhone,529331500.0,Angelo Tani,AngeloTani,nato a casa dei nonni,...,5974.0,🌎,,,,,,,,2020-11-09 18:32:45.599846
776885,2020-11-08 23:59:58,1.325589e+18,"Ik moet zeggen dat ik #Biden ""the lesser of tw...",0.0,0.0,Twitter for Android,586386300.0,Job,_JobO__,-voeg hier uw interessante bio toe-,...,119.0,,,,,,,,,2020-11-09 18:32:45.747707


In [42]:
#Numero di utenti totali (potenziali nodi)
print(df["user_id"].value_counts())

user_id
7.426862e+07    1352
4.017365e+07    1324
1.244982e+18    1259
3.863951e+08    1223
8.742585e+08    1059
                ... 
1.318602e+18       1
1.207354e+18       1
4.701694e+08       1
1.028358e+18       1
1.295867e+18       1
Name: count, Length: 481068, dtype: int64


In [43]:
from collections import Counter
import re

def extract_hashtags(tweet):
    return re.findall(r'#\w+', tweet.lower())

df['hashtags'] = df['tweet'].apply(extract_hashtags)

all_hashtags = [hashtag for hashtags in df['hashtags'] for hashtag in hashtags]

hashtag_counts = Counter(all_hashtags)

sorted_hashtag_counts = hashtag_counts.most_common()

# Stampare la classifica degli hashtag
print("Classifica degli hashtag più usati:")
for hashtag, count in sorted_hashtag_counts[:50]:
    print(f"{hashtag}: {count}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtags'] = df['tweet'].apply(extract_hashtags)


Classifica degli hashtag più usati:
#trump: 863347
#biden: 500781
#joebiden: 295275
#election2020: 139924
#donaldtrump: 132085
#elections2020: 77590
#bidenharris2020: 69976
#trump2020: 66393
#vote: 58299
#electionday: 48413
#usa: 45016
#maga: 42982
#covid19: 38383
#kamalaharris: 37605
#biden2020: 29940
#electionnight: 27977
#uselection2020: 27542
#bidenharris: 26712
#america: 25089
#elecciones2020: 22864
#electionresults2020: 22861
#trumpmeltdown: 20640
#usaelections2020: 19958
#bidenharis2020: 19367
#debates2020: 19012
#democrats: 18316
#vote2020: 17682
#gop: 17331
#election: 16943
#coronavirus: 16796
#trumpvsbiden: 16670
#election2020results: 16606
#elections: 16043
#usaelection2020: 15554
#maga2020: 14578
#covid: 14239
#pennsylvania: 14156
#hunterbiden: 14129
#uselections2020: 14014
#2020election: 13844
#uselection: 13477
#cnn: 13441
#trumpislosing: 12883
#obama: 12785
#potus: 12186
#byebyetrump: 12113
#joebiden2020: 12076
#joebidenkamalaharris2020: 12041
#votehimout: 11950
#foxnews

Osservazioni:
- Informazioni temporali che vanno dal 15 ottobre 2020 al 8 novembre 2020.
- 481.000 potenziali nodi (filtraggio sulla base di like/retweet?)
- Tweet scritti in diverse lingue (concentrarsi solo su quelli in inglese?)
- Diversi valori mancanti nelle aree geografiche

# Preprocessing (filtraggio tweet/utenti)

Probabilmente il primo filtraggio che occorre fare è quello sulla lingua. Potrebbe essere meglio considerare solo i tweet in inglese (?)

In [44]:
#Filtraggio sulla base dei like
df_like_5 = df[df["likes"]>=5]
df_like_10 = df[df["likes"]>=10]
df_like_20 = df[df["likes"]>=20]
df_like_50 = df[df["likes"]>=50]

print(f"Total tweets: {len(df_like_5)}")
print(f"Total tweets: {len(df_like_10)}")
print(f"Total tweets: {len(df_like_20)}")
print(f"Total tweets: {len(df_like_50)}")
print(df_like_50["user_id"].value_counts())

Total tweets: 175404
Total tweets: 100234
Total tweets: 58783
Total tweets: 28596
user_id
1.232811e+08    338
7.042227e+17    245
3.968686e+08    241
2.783875e+09    234
3.924067e+07    199
               ... 
1.189810e+18      1
1.357710e+09      1
9.185330e+07      1
1.311773e+18      1
9.416288e+17      1
Name: count, Length: 10235, dtype: int64


In [45]:
#Filtraggio sulla base dei retweet
df_retweet_5 = df[df["retweet_count"]>=5]
df_retweet_10 = df[df["retweet_count"]>=10]
df_retweet_20 = df[df["retweet_count"]>=20]
df_retweet_50 = df[df["retweet_count"]>=50]

print(f"Total tweets: {len(df_retweet_5)}")
print(f"Total tweets: {len(df_retweet_10)}")
print(f"Total tweets: {len(df_retweet_20)}")
print(f"Total tweets: {len(df_retweet_50)}")
print(df_retweet_50["user_id"].value_counts())

Total tweets: 59557
Total tweets: 32206
Total tweets: 17720
Total tweets: 7765
user_id
1.214316e+18    149
2.909782e+07    134
1.232811e+08    105
1.824706e+07     99
4.990740e+08     78
               ... 
4.706692e+07      1
2.621748e+08      1
2.298251e+08      1
7.820675e+08      1
1.988165e+08      1
Name: count, Length: 2848, dtype: int64


In [46]:
#FILTRAGGIO BASATO SU paese=United states
df_country= df[df["country"]=="United States of America"]
print(f"Total tweets: {len(df_country)}")

print(df_country["user_id"].value_counts())
df_country.tail()

Total tweets: 297754
user_id
1.244982e+18    1259
8.742585e+08    1059
4.132841e+06     980
2.086079e+08     856
1.154952e+18     785
                ... 
2.171204e+08       1
1.406658e+07       1
1.446436e+08       1
3.845704e+07       1
1.071796e+18       1
Name: count, Length: 76160, dtype: int64


Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_location,lat,long,city,country,continent,state,state_code,collected_at,hashtags
776827,2020-11-08 23:54:14,1.325587e+18,George W. #Bush #Congratulates #Biden And Harr...,1.0,1.0,Twitter for iPhone,49388160.0,Carol Falk,CAFalk,https://t.co/uuyj7Dnata Activist: #Resistance ...,...,Wisconsin,44.430898,-89.688464,,United States of America,North America,Wisconsin,WI,2020-11-09 18:32:45.705803,"[#bush, #congratulates, #biden]"
776845,2020-11-08 23:56:15,1.325588e+18,Will #criticalRaceTheory become ubiquitous in ...,0.0,0.0,Twitter Web App,409571500.0,Howard Wachtel,mindovermath,Retired college #math professor. Single. Brid...,...,"Philadelphia, PA",39.952724,-75.163526,Philadelphia,United States of America,North America,Pennsylvania,PA,2020-11-09 18:32:45.773127,"[#criticalracetheory, #biden]"
776847,2020-11-08 23:56:21,1.325588e+18,You moving near #Biden 🤔 https://t.co/1F6i1YIJ2P,0.0,0.0,Twitter for iPhone,191460000.0,Sean Lassiter,IAmSeanLassiter,Sean Lassiter Photography,...,Philadelphia PA,39.952724,-75.163526,Philadelphia,United States of America,North America,Pennsylvania,PA,2020-11-09 18:32:45.731141,[#biden]
776865,2020-11-08 23:58:24,1.325589e+18,@FLOTUS I’m excited to have a FLOTUS whose vag...,0.0,0.0,Twitter for iPhone,55456250.0,Caroline Billinson,cbillinson,my love language is dismantling the patriarchy.,...,"Washington, DC",38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-11-09 18:32:45.841439,[#biden]
776870,2020-11-08 23:58:48,1.325589e+18,The man needs some help...#usa #biden\nWhen wi...,0.0,0.0,Twitter for Android,1.248047e+18,Dr J,DrJoeMcCarthy,Human. Free Thinker. Met Mandela. Personal. Fa...,...,Earth. 3rd Planet from Sun.,43.51963,-114.31532,,United States of America,North America,Idaho,ID,2020-11-09 18:32:45.641087,"[#usa, #biden]"


# Classificazione preferenze (pro-Trump or pro-Biden) con llama3

In [None]:
import subprocess
import threading

!pip install langchain-community
!pip install langchain-core

#istallazione di ollama
!curl -fsSL https://ollama.com/install.sh | sh
    
#Avvio del server locale di Ollama
t = threading.Thread(target=lambda: subprocess.run(["ollama", "serve"]),daemon=True)
t.start()

!ollama pull llama3

t2 = threading.Thread(target=lambda: subprocess.run(["ollama", "run", "llama3"]),daemon=True)
t2.start()

In [None]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt = "You are a political classifier over a list of tweets about USA election. Your role is to analyze the list of tweets of users and to establish if user is Pro-Biden or Pro-Trump. Each tweet start when you read: 'TWEET START'. You must assign TO EACH tweet a class (Pro-Biden or Pro-Trump). You have to give ONLY the class for EACH tweet, NOT ANYMORE. The class for each tweet must be separated by a comma. If a tweet has offensive language, ignore it and predict the class for this tweet as 'X'."

llm = Ollama(
    model="llama3"
)  # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3 `

template = ChatPromptTemplate.from_messages([
    ("system", prompt),
    ("user", "{input}"),
])

output_parser = StrOutputParser()


def preference_llama(tweet):   
    #chain = template | llm | output_parser
    
    #response = chain.invoke({"input": "Tweet 1:" +tweet1+ ". Tweet 2:" +tweet2})
    response = llm.invoke(prompt + "Tweet:" + tweet)
    
    return response

In [None]:
import json
import time

# Specifica il nome del file JSON
filename = '/kaggle/working/preferences.json'
records = []

for index, row in df_sampled.iterrows(): 
    resp = preference_llama(row.tweet)
    print(resp)
    record = {
        "user": row.user_screen_name,
        "class": resp
    }
    records.append(record)

"""
#prova con lista di tweet
tweet_list=[]
counter=0
max_list=4
for index, row in df_sampled.iterrows(): 
    #print(row)
    tweet_list.append(row)
    if counter<max_list-1:
        counter=counter+1
    else:
        counter2=0
        tweets="TWEET START: "
        for row in tweet_list:
            counter2=counter2+1
            if(counter2==counter):
                tweets=tweets+row.tweet+"."
            else:
                tweets=tweets+row.tweet+". TWEET START:"
        resp = preference_llama(tweets,prompt)
        print(resp)
        for row in tweet_list:
            record = {
                "user": row.user_screen_name,
                "class": resp
            }
            records.append(record)
        counter=0
        tweet_list.clear()
"""
    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(records, file)

# Costruisco la rete di similarità con gli hashtag

Si pone il seguente problema: potrebbe non essere la scelta giusta andare a escludere utenti per numero di followers. Da un lato potremmo escludere il comportamento tipico degli utenti meno popolari, che sono anche quelli più numerosi (le persone comuni, che poi di fatto vanno a votare), dall'altro potremmo escludere il ruolo di utenti più popolari in grado di influenzare maggiormente gli altri utenti. Potremmo pensare di effettuare un campionamento casuale dei nodi per ridurre la dimensione della rete. Oppure dovremmo pensare al filtraggio sotto altri metodi (numero di like o retweet?). Potremmo fare anche un campionamento che si basa sulla degree distribution. Probabilmente la cosa migliore è andare a fare un campionamento casuale direttamente sul dataset.

Provo invece a considerare i top 100 e i last 1000.

In [47]:
#raggruppo solo per followers
grouped_followers = df_country.groupby('user_screen_name').agg({'user_followers_count':'first'}).reset_index()
print(f"Total tweets after concate: {len(grouped_followers)}")
print(grouped_followers["user_screen_name"].value_counts())

#raggruppo per tweets
grouped_df = df_country.groupby('user_screen_name')['tweet'].apply(lambda tweets: ' '.join(tweets)).reset_index()
print(f"Total tweets after concate: {len(grouped_df)}")
print(grouped_df["user_screen_name"].value_counts())

#grouped_followers.head()
#grouped_df.head()

Total tweets after concate: 76279
user_screen_name
zzz_ooo_eee        1
000HMY             1
001Newway          1
007442008OB        1
007__NIL           1
                  ..
0amaam             1
0bzerve            1
0ch0a21            1
0fficiallyJoee_    1
0hGood4U           1
Name: count, Length: 76279, dtype: int64
Total tweets after concate: 76279
user_screen_name
zzz_ooo_eee        1
000HMY             1
001Newway          1
007442008OB        1
007__NIL           1
                  ..
0amaam             1
0bzerve            1
0ch0a21            1
0fficiallyJoee_    1
0hGood4U           1
Name: count, Length: 76279, dtype: int64


In [48]:
#faccio la join per tenere numero di followers e tweets
df_join = pd.merge(grouped_df, grouped_followers, on="user_screen_name", how="inner")
print(df_join["user_screen_name"].value_counts())
df_join.head()

user_screen_name
zzz_ooo_eee        1
000HMY             1
001Newway          1
007442008OB        1
007__NIL           1
                  ..
0amaam             1
0bzerve            1
0ch0a21            1
0fficiallyJoee_    1
0hGood4U           1
Name: count, Length: 76279, dtype: int64


Unnamed: 0,user_screen_name,tweet,user_followers_count
0,000HMY,#democracy is our lives\n#VOTE\nnot a #tRump r...,42.0
1,001Newway,#TheFix #Scarface #2020Election #bidenharis202...,22.0
2,007442008OB,@realDonaldTrump Report this tweet!! ⛔️WARNING...,56.0
3,007__NIL,@CatEyezGreen_ @nudog71 @pressec We need the ...,6196.0
4,007mdb,Joe Biden $ Hunter Biden: Biden lied about his...,50.0


In [49]:
#calcolo gli hashtags usati per ogni utente

# Funzione per estrarre gli hashtag da un tweet
def extract_hashtags(tweet):
    return re.findall(r'#\w+', tweet.lower())

# Aggiungere una colonna con gli hashtag estratti
user_hashtags = df_join.copy()
user_hashtags['hashtags'] = df_join['tweet'].apply(extract_hashtags)

# Trasformo la lista di tweet in un insieme (per non avere duplicati)
user_hashtags["hashtags"] = user_hashtags['hashtags'].apply(set)

print(len(user_hashtags))
print(user_hashtags["user_screen_name"].value_counts())
user_hashtags.head()

76279
user_screen_name
zzz_ooo_eee        1
000HMY             1
001Newway          1
007442008OB        1
007__NIL           1
                  ..
0amaam             1
0bzerve            1
0ch0a21            1
0fficiallyJoee_    1
0hGood4U           1
Name: count, Length: 76279, dtype: int64


Unnamed: 0,user_screen_name,tweet,user_followers_count,hashtags
0,000HMY,#democracy is our lives\n#VOTE\nnot a #tRump r...,42.0,"{#democracy, #trump, #covid, #vote, #president..."
1,001Newway,#TheFix #Scarface #2020Election #bidenharis202...,22.0,"{#bidenharis2020, #thefix, #2020election, #bid..."
2,007442008OB,@realDonaldTrump Report this tweet!! ⛔️WARNING...,56.0,"{#bidenharris2020, #corona, #bidenharris2020to..."
3,007__NIL,@CatEyezGreen_ @nudog71 @pressec We need the ...,6196.0,{#joebiden}
4,007mdb,Joe Biden $ Hunter Biden: Biden lied about his...,50.0,"{#voting, #biden, #realdonaldtrump}"


In [50]:
#prendo i top 100
df_sorted = user_hashtags.sort_values(by='user_followers_count', ascending=False)

top_100 = df_sorted.head(100)

#prendo tutti gli utenti con meno di 1000 followers
df_less_than = df_sorted[df_sorted["user_followers_count"]<1000]
print(df_less_than["user_screen_name"].value_counts())

#campiono 1000 utenti non popolari
last_1000 = df_less_than.sample(n=1000, random_state=42) 
print(last_1000["user_screen_name"].value_counts())

#dataframe uniti
total = pd.concat([top_100,last_1000],axis=0)
print(total["user_screen_name"].value_counts())

user_screen_name
ValTaube          1
MikeGRiggins      1
P1OTUS            1
Outdoorman402     1
surekels          1
                 ..
UICProfWatch      1
laabdog           1
firesign68        1
FekaduShibeshi    1
WeCanWeWill100    1
Name: count, Length: 56546, dtype: int64
user_screen_name
JulianneCalapa     1
betoenlaradio      1
esamboraey         1
FatFloridaJesu1    1
itsmegkent         1
                  ..
OMARIBRAHIIMMM     1
Scholar76974781    1
annakajidancer     1
rckinrbin          1
thriftbookish      1
Name: count, Length: 1000, dtype: int64
user_screen_name
JulianneCalapa    1
PerezHilton       1
common            1
danawhite         1
USATODAY          1
                 ..
GuyKawasaki       1
SaraCarterDC      1
HOT97             1
TheDeenShow       1
dumbassgenius     1
Name: count, Length: 1100, dtype: int64


### Calcolo della similarità tra gli hashtags usando Llama3

In [None]:
import subprocess
import threading

!pip install langchain-community
!pip install langchain-core

#istallazione di ollama
!curl -fsSL https://ollama.com/install.sh | sh
    
#Avvio del server locale di Ollama
t = threading.Thread(target=lambda: subprocess.run(["ollama", "serve"]),daemon=True)
t.start()

!ollama pull llama3

t2 = threading.Thread(target=lambda: subprocess.run(["ollama", "run", "llama3"]),daemon=True)
t2.start()

Valutare il miglior prompt.

In [None]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

#Va bene il prompt? Valuta
prompt = "You are an hashtags evaluator. Your role is to analyze two groups of hashtags and group classify the two groups as SIMILAR or NOT SIMILAR. You have to answer ONLY with the class (SIMILAR or NOT SIMILAR), not anymore."

llm = Ollama(
    model="llama3"
)  # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3 `

template = ChatPromptTemplate.from_messages([
    ("system", prompt),
    ("user", "{input}"),
])

output_parser = StrOutputParser()


def hashtags_to_llama(hashtags1,hashtags2):   
    #chain = template | llm | output_parser
    
    #response = chain.invoke({"input": "Tweet 1:" +tweet1+ ". Tweet 2:" +tweet2})
    response = llm.invoke(prompt + "Hashtags 1:" + hashtags1 + ". Hashtags 2:" + hashtags2)
    
    return response

Ora costruisco le 5 partizioni per calcolare le similarità tra gli utenti. Le 5 partizioni sono così costituite: ognuna include 20 nodi top, i quali necessitano il calcolo della similarità con i last_1000. 
Serve un'ultimo calcolo interno tra i 100 nodi top.

In [51]:
#costruisco partizione su top_100
import numpy as np
partitions = np.array_split(top_100, 5)

first_part = partitions[0]
second_part = partitions[1]
third_part = partitions[2]
fourth_part = partitions[3]
fifth_part = partitions[4]

first_part.head()

  return bound(*args, **kwds)


Unnamed: 0,user_screen_name,tweet,user_followers_count,hashtags
30861,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",5747472.0,"{#bradpitt, #joebiden, #donaldtrump, #kamalaha..."
49437,common,While we were campaigning across battleground ...,5477365.0,{#joebiden}
50178,danawhite,The big fight TODAY!!!!! @realDonaldTrump vs #...,5476752.0,"{#merica, #joebiden}"
41016,USATODAY,"President Donald #Trump’s youngest daughter, T...",4163175.0,"{#lgbtq, #trump}"
2293,Alyssa_Milano,Woah. Have you read this article?\n\nAll the p...,3750110.0,"{#followthemoney, #trump, #bidenharristosaveam..."


In [None]:
df = first_part.drop(columns=["user_followers_count", "tweet"]).merge(last_1000.drop(columns=["user_followers_count", "tweet"]), how='cross')
df.head()    

In [None]:
import json
from tqdm import tqdm

partition = first_part #second_part #third_part #fourth_part #fifth_part
# Specifica il nome del file JSON
filename = f'/kaggle/working/similarities_{partition}.json'
records = []

#aggiusta qui! non va bene df
df = partition.drop(columns=["user_followers_count", "tweet"]).merge(last_1000.drop(columns=["user_followers_count", "tweet"]), how='cross')
for index, row in tqdm(df.iterrows()):
    resp = hashtags_to_llama(str(row.hashtags_x),str(row.hashtags_y))
    #print(resp)
    record = {
        "user1": row.user_screen_name_x,
        "user2": row.user_screen_name_y,
        "similarity": resp
    }
    print(record)
    records.append(record)
    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(records, file, indent=4)

Costruisco la rete con NetworkX:

In [None]:
#Leggo il file json

# Specifica il nome del file JSON
filename = '/kaggle/working/similarities.json'

# Carica i dati dal file JSON
data = load_json(filename)

In [None]:
# Creare un grafo vuoto
G = nx.Graph()

# Itera su ogni record nel file JSON
for item in data:
    dictionary = dict(item.items())
    if not G.has_node(dictionary["user1"]): #se utente non presente, lo aggiungo alla rete
        G.add_node(dictionary["user1"])
    if not G.has_node(dictionary["user2"]): #se utente non presente, lo aggiungo alla rete
        G.add_node(dictionary["user2"])
    if float(dictionary["similarity"])>Threshold: #Oppure devo fare il confronto sulla classificazione (SIMILAR or NOT SIMILAR)
        G.add_edge(dictionary["user1"], dictionary["user2"], weight=float(dictionary["similarity"]))
    

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
# Disegnare il grafo (non si capisce niente, troppi nodi dentro la rete)

pos = nx.spring_layout(G)  # Posizionamento dei nodi
weights = nx.get_edge_attributes(G, 'weight').values()

nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=5, font_size=5, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f'{d["weight"]:.2f}' for u, v, d in G.edges(data=True)}, font_color='red')
nx.draw_networkx_edges(G, pos, width=list(weights))

plt.show()

In [None]:
# Plot della degree distribution

# Calcolare i gradi dei nodi
degrees = [degree for node, degree in G.degree()]

# Calcolare la distribuzione dei gradi
degree_count = Counter(degrees)
deg, cnt = zip(*degree_count.items())

# Fare il plot della distribuzione dei gradi
plt.figure(figsize=(8, 6))
plt.bar(deg, cnt, width=10, color='b')

plt.title("Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Frequency")

plt.show()

In [None]:
# Plot della weighted degree

# Calcolare il weighted degree dei nodi
weighted_degrees = dict(G.degree(weight='weight'))

# Calcolare la distribuzione del weighted degree
weighted_degree_count = Counter(weighted_degrees.values())
deg, cnt = zip(*weighted_degree_count.items())

# Fare il plot della distribuzione del weighted degree
plt.figure(figsize=(8, 6))
plt.bar(deg, cnt, width=10, color='b')

plt.title("Weighted Degree Distribution")
plt.xlabel("Weighted Degree")
plt.ylabel("Frequency")

plt.show()

In [None]:
# Degree Centrality
degree_centrality = nx.degree_centrality(G)

# Ordiniamo i nodi in base ai valori di degree centrality in ordine decrescente
sorted_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)

# Stampiamo i nodi con i valori più alti di degree centrality
for node, centrality in sorted_degree[:10]: #stampo solo i migliori 10
    print(f'Nodo: {node}, Degree Centrality: {centrality:.6f}')
    
    
#Closeness 
closeness_centrality = nx.closeness_centrality(G)

# Ordiniamo i nodi in base ai valori di degree centrality in ordine decrescente
sorted_degree = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)

# Stampiamo i nodi con i valori più alti di degree centrality
for node, centrality in sorted_degree[:10]: #stampo solo i migliori 10
    print(f'Nodo: {node}, Closeness Centrality: {centrality:.6f}')

### Community detection sulla rete di similarità per scoprire topic comuni

In [None]:
import community as community_louvain

# Eseguire la community detection usando l'algoritmo di Louvain
partition = community_louvain.best_partition(G)

"""
# Disegnare il grafo con le comunità
pos = nx.spring_layout(G)
cmap = plt.get_cmap('viridis')
nx.draw_networkx_nodes(G, pos, node_size=5, cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
"""

In [None]:
communities = {}
for node, community in partition.items():
    if community not in communities:
        communities[community] = []
    communities[community].append(node)

"""
for community, nodes in communities.items():
    print(f"Community {community}:")
    print(", ".join(nodes))
"""


#vedo le community che hanno almeno 10 nodi:

# Filtrare le comunità che hanno almeno 10 nodi
large_communities = {community: nodes for community, nodes in communities.items() if len(nodes) >= 10}

# Stampare il nome dei nodi di ogni comunità con almeno 10 nodi
for community, nodes in large_communities.items():
    print(f"Community {community} (size: {len(nodes)}):")
    print(", ".join(nodes))
    print()

In [None]:
#Studiamo gli hashtags più frequenti per ogni community più numerosa

for community, nodes in large_communities.items():
    print(f"Community: {community}")
    print(f"Num nodes: {len(nodes)}")

    df_comm = user_hashtags[user_hashtags["user_screen_name"].isin(nodes)]
    print(df_comm["user_screen_name"].value_counts())

    all_hashtags = [hashtag for hashtags in df_comm['hashtags'] for hashtag in hashtags]

    hashtag_counts = Counter(all_hashtags)

    sorted_hashtag_counts = hashtag_counts.most_common()

    # Stampare la classifica degli hashtag
    print("Classifica degli hashtag più usati:")
    for hashtag, count in sorted_hashtag_counts[:50]:
        print(f"{hashtag}: {count}")
    print()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Rimozione di URL, menzioni e hashtag
    text = re.sub(r"http\S+|@\S+|#\S+", "", text)
    # Rimozione di punteggiatura e numeri
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Convertire il testo in minuscolo
    text = text.lower()
    # Tokenizzazione
    tokens = word_tokenize(text)
    # Rimozione delle stopword
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

# Analizzare le comunità per determinare i topic
for community, nodes in large_communities.items():
    all_words = []
    for node in nodes:
        all_words.extend(preprocess_text(user_hashtags.loc[user_hashtags["user_screen_name"]==node, 'tweet'].values[0]))
    word_counts = Counter(all_words)
    most_common_words = word_counts.most_common(30)
    print(f"Community {community} (size: {len(nodes)}):")
    print("Most common words:", most_common_words)
    print()

In [None]:
# Effettuo un campionamento casuale del dataset (gli utenti sono troppi e non riusciremmo a costruire la rete)

"""
df_sampled = grouped_conc.sample(frac=0.2, random_state=42)
print(df_sampled["user_screen_name"].value_counts())

df_sampled.head()
# Idea di altro campionamento: 
# stimo i degree in modo parallelo (calcolo similarità dei primi 100 utenti con tutti gli altri)
# campiono seguendo la stima della distribuzione
"""

Calcolo classico della similarità con Jaccard non va bene. Quindi commento il codice:

In [None]:
"""
Threshold = 0.3

# Funzione per calcolare la similarità di Jaccard
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    if union == 0:
        return 0
    return intersection / union

df_final = user_hashtags.drop(columns=["tweet"])
# Calcolare la similarità di Jaccard tra ogni coppia di utenti
edges = []
for (user1, hashtags1), (user2, hashtags2) in combinations(user_hashtags.drop(columns=["tweet"]).itertuples(index=False), 2):
    similarity = jaccard_similarity(hashtags1, hashtags2)
    if similarity > Threshold:  # Aggiungere solo archi con similarità positiva
        edges.append((user1, user2, similarity))

# Creare un grafo vuoto
G = nx.Graph()

# Aggiungere nodi (utenti)
for user in df_final['user_screen_name']: 
    G.add_node(user)

# Aggiungere archi con pesi (similarità di Jaccard)
for user1, user2, weight in edges:
    G.add_edge(user1, user2, weight=weight)
    

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
"""

In [None]:
"""
#Prendiamo gli utenti che hanno degree 998.

desired_degree = 767 #950, #767, #998

# Filtrare i nodi che hanno il grado specificato
nodes_with_desired_degree = [node for node, degree in degree_dict.items() if degree == desired_degree]

df_giant = df_final[df_final["user_screen_name"].isin(nodes_with_desired_degree)]
# Stampare i nodi con il grado desiderato
print(f"Nodi con grado {df_giant}:")
print(df_giant)
"""

Si potrebbe effettuare un campionamento dei nodi tenendo conto della degree distribution dei nodi

In [None]:
# Definire la funzione di campionamento basato sui gradi
"""
def degree_based_sampling(graph, sample_size):
    # Calcolare i gradi dei nodi
    degrees = dict(graph.degree())
    nodes, degree_values = zip(*degrees.items())
    
    # Convertire i gradi in probabilità (più alto il grado, maggiore la probabilità di essere selezionato)
    total_degree = sum(degree_values)
    probabilities = [degree / total_degree for degree in degree_values]
    
    # Campionare i nodi in base alle probabilità
    sampled_nodes = np.random.choice(nodes, size=sample_size, replace=False, p=probabilities)
    
    # Restituire il sottografo campionato
    return graph.subgraph(sampled_nodes)

# Campionare il 20% dei nodi basato sui gradi
sample_size = int(len(G.nodes) * 0.2)
G_sampled = degree_based_sampling(G, sample_size)

# Calcolare la distribuzione dei gradi nel grafo campionato
sampled_degrees = [degree for node, degree in G_sampled.degree()]
sampled_degree_count = Counter(sampled_degrees)
sampled_deg, sampled_cnt = zip(*sampled_degree_count.items())

# Fare il plot della distribuzione dei gradi nel grafo campionato
plt.figure(figsize=(8, 6))
plt.bar(sampled_deg, sampled_cnt, width=0.80, color='b')

plt.title("Degree Distribution in Sampled Graph")
plt.xlabel("Degree")
plt.ylabel("Frequency")
plt.show()

# Fare il plot della distribuzione dei gradi nel grafo originale per confronto
original_degrees = [degree for node, degree in G.degree()]
original_degree_count = Counter(original_degrees)
orig_deg, orig_cnt = zip(*original_degree_count.items())

plt.figure(figsize=(8, 6))
plt.bar(orig_deg, orig_cnt, width=0.80, color='r')

plt.title("Degree Distribution in Original Graph")
plt.xlabel("Degree")
plt.ylabel("Frequency")
plt.show()
"""

# Costruisco la rete con similarità usando Sentence Bert

Uso i 100 utenti più popolari (in termini di followers) e campiono 1000 utenti non popolari (sotto i 1000 followers).

Utilizzo pipeline transformers per filtrare tutti i tweet che non sono in inglese. (Non funziona correttamente!)

In [None]:
"""
from transformers import pipeline
import torch
from tqdm import tqdm

device = 0 if torch.cuda.is_available() else -1

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

filename="/kaggle/working/user_to_filter.json"
user_to_filter = []

for row in tqdm(df_sampled.itertuples(index=True, name='Pandas')):
    candidate_labels = ['english language', 'not english language']
    resp = classifier(row.tweet, candidate_labels)["labels"][0]
    print(row.tweet)
    print(resp)
    if resp == "not english language":
        record = {
            "user": row.user_screen_name
        }
        user_to_filter.append(record)

    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(user_to_filter, file)

#crea nuovo df leggendo json con utenti da eliminare

# Funzione per caricare il contenuto di un file JSON
def load_json(filename):
    with open(filename, 'r') as file:
        return json.load(file)

# Specifica il nome del file JSON
filename="/kaggle/working/user_to_filter.json"

# Carica i dati dal file JSON
data = load_json(filename)
user_to_filter = []

# Itera su ogni record nel file JSON
for item in data:
    dictionary = dict(item.items())
    user_to_filter.append(dictionary["user"])

# Elimino da grouped_df gli utenti che non hanno tweet in inglese
indexes = grouped_df[gouped_df['user_screen_name'].isin(user_to_filter)].index

# Eliminare le righe usando il metodo drop
df_filtered = grouped_df.drop(indexes)

print(f"Users before filter: {len(grouped_df)}")
print(f"Users after filter: {len(df_filtered)}")

"""


### Summarization con t5

Utilizzo pipeline per la summarization per testi troppo lunghi. Problematica, alcuni testi sono eccessivamente lunghi e il modello va out of memory. Soluzione: tronco l'input.

In [None]:
from transformers import pipeline
import torch
from tqdm import tqdm
import json

device = 0 if torch.cuda.is_available() else -1

summarizer = pipeline(task="summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", device=device)

#3000 non va bene, probabilmente occorre abbassarla ulteriormente
Threshold = 3000 #soglia sul numero di caratteri, se viene superata questa soglia, il testo viene riassunto

filename="/kaggle/working/summarization.json"
summarized = []

for row in tqdm(total.itertuples(index=True, name='Pandas')): #df_filtered
    if (len(row.tweet)>Threshold):
        text = row.tweet
        if (len(row.tweet)>10000): #se il testo è oltre i 10.000 caratteri, lo tronco
            text = text[:10000]
        #print(text)
        resp = summarizer(text)
        #print(resp)
        record = {
            "user": row.user_screen_name,
            "summerized": resp[0]["summary_text"]
        }
        summarized.append(record)

    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(summarized, file, indent=4)

In [None]:
# Codice per sostituire i tweet con i riassunti
import json
Threshold=3000

def load_json(filename):
    with open(filename, 'r') as file:
        return json.load(file)

# Specifica il nome del file JSON
filename="/kaggle/working/summarization.json"

# Carica i dati dal file JSON
data = load_json(filename)

# Itera su ogni record nel file JSON
for item in data:
    dictionary = dict(item.items())
    total.loc[total['user_screen_name'] == dictionary["user"], 'tweet'] = dictionary["summerized"]
    top_100.loc[top_100['user_screen_name'] == dictionary["user"], 'tweet'] = dictionary["summerized"]
    last_1000.loc[last_1000['user_screen_name'] == dictionary["user"], 'tweet'] = dictionary["summerized"]

#controllo per vedere se sono rimasti tweet con più di 3000 caratteri    
for row in total.itertuples(index=True, name='Pandas'):
    if (len(row.tweet)>Threshold):
        print("Tweet con più di 3000 caratteri")

Ora costruisco le 5 partizioni per calcolare le similarità tra gli utenti. Le 5 partizioni sono così costituite: ognuna include 20 nodi top, i quali necessitano il calcolo della similarità con i last_1000. 
Serve un'ultimo calcolo interno tra i 100 nodi top.


In [None]:
#costruisco partizione su top_100
import numpy as np
partitions = np.array_split(top_100, 5)

first_part = partitions[0]
second_part = partitions[1]
third_part = partitions[2]
fourth_part = partitions[3]
fifth_part = partitions[4]

first_part.head()

In [53]:
"""
import itertools
lista = list(itertools.product(first_part.drop(columns=["user_followers_count"]).iterrows(), last_1000.drop(columns=["user_followers_count"]).iterrows()))
for i in lista:
    print(i[0][1])
    print(i[1])
    break
"""

partition = first_part #second_part #third_part #fourth_part #fifth_part

# Costruzione di tutte le combinazioni
df = partition.drop(columns=["user_followers_count","hashtags"]).merge(last_1000.drop(columns=["user_followers_count", "hashtags"]), how='cross')
df.head()    

Unnamed: 0,user_screen_name_x,tweet_x,user_screen_name_y,tweet_y
0,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",betoenlaradio,FALTAN 4 DIAS PARA LA #ELECCION2020 #TRUMP VS ...
1,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",esamboraey,"Inshallah, someday I can tell my kids #Trump l..."
2,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",FatFloridaJesu1,Sniffy Joe thinks he's in the Lincoln/Douglas ...
3,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",itsmegkent,Whoever is doing #DonaldTrump Spray Tan should...
4,PerezHilton,"Without a doubt, the #FourSeasons debacle is t...",dennis_hohman,https://t.co/adMdR7ZwH1 #HunterBiden #JoeBide...


### SentenceBert

In [55]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [56]:
from sentence_transformers import SentenceTransformer
import json
from tqdm import tqdm

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Specifica il nome del file JSON
filename = f'/kaggle/working/similarities_{partition}.json'
records = []

for index, row in tqdm(df.iterrows()):
    embeddings = model.encode([row.tweet_x,row.tweet_y])
    print(embeddings.shape)
    similarities = model.similarity(embeddings, embeddings)
    print(similarities)
    resp = similiraties[0][1][1]
    record = {
        "user1": row.user_screen_name_x,
        "user2": row.user_screen_name_y,
        "similarity": resp
    }
    print(record)
    records.append(record)
    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(records, file, indent=4)

  from tqdm.autonotebook import tqdm, trange
2024-07-15 13:50:36.862827: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 13:50:36.862957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 13:50:36.997458: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


KeyboardInterrupt: 

In [None]:
# Specifica il nome del file JSON
filename = '/kaggle/working/similarities.json'

# Carica i dati dal file JSON
data = load_json(filename)

In [None]:
# Creare un grafo vuoto
G = nx.Graph()

Threshold = 0 #threshold similarità

# Itera su ogni record nel file JSON
for item in data:
    dictionary = dict(item.items())
    if not G.has_node(dictionary["user1"]): #se utente non presente, lo aggiungo alla rete
        G.add_node(dictionary["user1"])
    if not G.has_node(dictionary["user2"]): #se utente non presente, lo aggiungo alla rete
        G.add_node(dictionary["user2"])
    if float(dictionary["similarity"])>Threshold:
        G.add_edge(dictionary["user1"], dictionary["user2"], weight=float(dictionary["similarity"]))
    

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
# Disegnare il grafo
pos = nx.spring_layout(G)  # Posizionamento dei nodi
weights = nx.get_edge_attributes(G, 'weight').values()

nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=5, font_size=5, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f'{d["weight"]:.2f}' for u, v, d in G.edges(data=True)}, font_color='red')
nx.draw_networkx_edges(G, pos, width=list(weights))

plt.show()

Questa parte del codice prevedeva l'uso di Llama3, ma abbiamo visto non essere efficace in questo senso, quindi iul codice è stato commentato.

In [None]:
"""
import subprocess
import threading

!pip install langchain-community
!pip install langchain-core

#istallazione di ollama
!curl -fsSL https://ollama.com/install.sh | sh
    
#Avvio del server locale di Ollama
t = threading.Thread(target=lambda: subprocess.run(["ollama", "serve"]),daemon=True)
t.start()

!ollama pull llama3

t2 = threading.Thread(target=lambda: subprocess.run(["ollama", "run", "llama3"]),daemon=True)
t2.start()
"""

In [None]:
"""
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt = "You are a text-similarity evaluator. Your role is to analyze all the couple of tweets of users and calculate the semantic similarity between them. You must assign to each couple a decimal score from 0 (if the tweets are not similar) to 1 (if the tweets are similar). You have to give ONLY the number score, NOT anymore. If a tweet has offensive language, ignore it and DON'T answer. Give me a fast solution."

llm = Ollama(
    model="llama3"
)  # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3 `

template = ChatPromptTemplate.from_messages([
    ("system", prompt),
    ("user", "{input}"),
])

output_parser = StrOutputParser()


def ask_to_llama(tweet1,tweet2):   
    #chain = template | llm | output_parser
    
    #response = chain.invoke({"input": "Tweet 1:" +tweet1+ ". Tweet 2:" +tweet2})
    response = llm.invoke(prompt + "Tweet 1:" +tweet1+ ". Tweet 2:" +tweet2)
    
    return response
"""

In [None]:
"""
import json

partition = first_part #second_part #third_part #fourth_part #fifth_part
# Specifica il nome del file JSON
filename = f'/kaggle/working/similarities_{partition}.json'
records = []

#aggiusta qui! non va bene df
df = pd.concat([partition.drop(columns=["user_followers_count"]),last_1000.drop(columns=["user_followers_count"])],axis=0)
for (user1, tweet1), (user2, tweet2) in tqdm(combinations(df.itertuples(index=False), 2)):
    resp = ask_to_llama(tweet1,tweet2)
    print(resp)
    record = {
        "user1": user1,
        "user2": user2,
        "similarity": resp
    }
    records.append(record)
    
# Scrivi i dati nel file JSON
with open(filename, 'w') as file:
    json.dump(records, file, indent=4)
"""