In [1]:
import pandas as pd
from src.preprocess_dataframe import clean_dataframe, process_tokens
import numpy as np
import torch

import networkx as nx
from networkx.algorithms.community.louvain import louvain_communities

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP
import random

  from .autonotebook import tqdm as notebook_tqdm


# Load the dataset:

In [2]:
# Choose which dataset to process: index 0 selects 'abortion', index 1 selects 'gun'.
available_datasets = ['abortion', 'gun']
data_name = available_datasets[0]

print(f'Loading data for {data_name}...')

# Read the tweets table for the chosen dataset from its feather file.
tweets_path = f'data/{data_name}/tweets.feather'
df = pd.read_feather(tweets_path)
df

Loading data for abortion...


Unnamed: 0,user_id,tweets
0,1511561411696148480,[@Mumblesloudly @charliekirk11 I feel the same...
1,1519024813561368576,"[Correction:\n\n""We caused the worst."" https:/..."
2,14549846,[@SaleemulHuq2's retweet inspired me to try an...
3,906108505513484289,"[@ClaudetteGGibs1 The letter E, @StellaBoyd23 ..."
4,478908708,"[@chrismattmann @BT @betty_nft SAME! 😅, @chris..."
...,...,...
5088,636000396,[@bibicosplays Crossing fingers for you and Ol...
5089,999149146429906944,"[y'all are killing me https://t.co/ralAvVKKn4,..."
5090,1346931516434239489,"[https://t.co/nmYSCxKZS3, @cspan @jonstewart I..."
5091,335060282,"[Ok, I gotta go to bed. Thanks for the enterta..."


In [3]:
# Read the graph that belongs to the selected dataset from its GML file.
G = nx.read_gml(f"data/{data_name}/graph.gml")

print(f"Number of nodes: {G.number_of_nodes()}, Number of edges: {G.number_of_edges()}")

# Directed? If so, continue with an undirected version of the graph.
if G.is_directed():
    print("Directed graph")
    G = G.to_undirected()
    print("Converted to undirected")

# Weighted?
print(f"Is weighted: {nx.is_weighted(G)}")

# Connected?
print(f"Is connected: {nx.is_connected(G)}")


users_ids = df['user_id'].to_numpy()
print(f"Number of users: {len(users_ids)}")
print(f"Number of unique users: {np.unique(users_ids).size}")


# Keep only the graph nodes that also appear as a user_id in the tweets table.
G = G.subgraph(users_ids)
print(f"Number of nodes: {G.number_of_nodes()}, Number of edges: {G.number_of_edges()}")
print(f"Is connected: {nx.is_connected(G)}")

# Report how many connected components remain after the restriction.
print(f"Number of connected components: {nx.number_connected_components(G)}")

print("The size of the connected components:")
# Materialise the components once; they are used for both the size report and the max below.
components = list(nx.connected_components(G))
for component_size in map(len, components):
    print(component_size)

# Continue with the largest connected component only.
lcc = max(components, key=len)
print(f"Size of the largest connected component: {len(lcc)}")
G = G.subgraph(lcc)
print(f"Number of nodes: {G.number_of_nodes()}, Number of edges: {G.number_of_edges()}")
print(f"Is connected: {nx.is_connected(G)}")

Number of nodes: 5556, Number of edges: 11697
Is weighted: False
Is connected: True
Number of users: 5093
Number of unique users: 5093
Number of nodes: 5093, Number of edges: 10585
Is connected: False
Number of connected components: 2
The size of the connected components:
5092
1
Size of the largest connected component: 5092
Number of nodes: 5092, Number of edges: 10585
Is connected: True


# Detecting communities:

In [4]:
from networkx.algorithms.community.louvain import louvain_communities

In [5]:
# Display (edge count, node count) of the graph that community detection will run on.
G.number_of_edges(), G.number_of_nodes()

(10585, 5092)

In [6]:
# Partition the graph into communities with the Louvain method; a fixed seed is passed
# so the partition is repeatable.
# NOTE(review): per the NetworkX docs a resolution below 1 favours larger communities,
# which is presumably why 0.05 is used here — confirm.
communities = louvain_communities(
    G,
    seed=42,
    resolution=0.05,
)

In [7]:
# Order the communities from largest to smallest so that index 0 is always the biggest
# one (easier to remember).
communities = sorted(communities, key=len, reverse=True)

print(f"Number of communities: {len(communities)}")
for rank, members in enumerate(communities):
    print(f"Community {rank}: {len(members)}")

Number of communities: 2
Community 0: 3945
Community 1: 1147


In [8]:
# users_labels = []
# for uid, _ in uid_to_index.items():
#     for i, community in enumerate(communities):
#         if uid in community:
#             users_labels.append(i)
#             break
# users_labels[:5]
# len(users_labels)
# users_labels = np.array(users_labels)
# # Save the users_labels
# np.save(f"temp/{data_name}_hetesim/users_labels_(Same_word_as_Tweets2Users).npy", users_labels)

In [9]:
assert len(communities) == 2, "Number of communities is not 2"

# Balance the two communities: the larger one is randomly down-sampled to the size of
# the smaller one.
com_0, com_1 = (list(members) for members in communities)

print(f"Before: Community 0: {len(com_0)}, Community 1: {len(com_1)}")

# Seed the global RNG so the drawn sample is repeatable.
random.seed(42)
target_size = min(len(com_0), len(com_1))
if len(com_0) > target_size:
    com_0 = random.sample(com_0, target_size)
else:
    # Also taken when both sizes are equal, in which case com_1 is merely re-ordered.
    com_1 = random.sample(com_1, target_size)

print(f"After: Community 0: {len(com_0)}, Community 1: {len(com_1)}")

Before: Community 0: 3945, Community 1: 1147
After: Community 0: 1147, Community 1: 1147


In [10]:
# Users kept for the rest of the analysis: the balanced community 0 followed by community 1.
all_users = [*com_0, *com_1]
print(f"Number of users: {len(all_users)}")

Number of users: 2294


In [11]:
# Show the tweets table as it is before restricting it to the balanced user sample.
df

Unnamed: 0,user_id,tweets
0,1511561411696148480,[@Mumblesloudly @charliekirk11 I feel the same...
1,1519024813561368576,"[Correction:\n\n""We caused the worst."" https:/..."
2,14549846,[@SaleemulHuq2's retweet inspired me to try an...
3,906108505513484289,"[@ClaudetteGGibs1 The letter E, @StellaBoyd23 ..."
4,478908708,"[@chrismattmann @BT @betty_nft SAME! 😅, @chris..."
...,...,...
5088,636000396,[@bibicosplays Crossing fingers for you and Ol...
5089,999149146429906944,"[y'all are killing me https://t.co/ralAvVKKn4,..."
5090,1346931516434239489,"[https://t.co/nmYSCxKZS3, @cspan @jonstewart I..."
5091,335060282,"[Ok, I gotta go to bed. Thanks for the enterta..."


In [12]:
# Restrict the table to the users that were kept in all_users.
in_balanced_sample = df['user_id'].isin(all_users)
df = df[in_balanced_sample]

df

Unnamed: 0,user_id,tweets
0,1511561411696148480,[@Mumblesloudly @charliekirk11 I feel the same...
1,1519024813561368576,"[Correction:\n\n""We caused the worst."" https:/..."
3,906108505513484289,"[@ClaudetteGGibs1 The letter E, @StellaBoyd23 ..."
5,1537105274757976065,[@somajoe69 @NicoleNaditz @trippindaisies @lav...
7,1462579612848951300,"[@joelpollak @DarnelSugarfoo Rosie’s, I just w..."
...,...,...
5084,1489362675889065985,[@iluminatibot Were the puppy leggings built i...
5085,1424448523244871688,"[😂😂😂 https://t.co/i7EkwHwR5l, https://t.co/v2L..."
5086,438584384,"[Us https://t.co/o5YP3poDpM, Donate any amount..."
5089,999149146429906944,"[y'all are killing me https://t.co/ralAvVKKn4,..."


In [13]:
# Clean the dataframe.
# The per-user list column 'tweets' is renamed to 'tweet' and exploded so that every
# row holds a single tweet; the result is then passed through the project's
# clean_dataframe helper.
df = clean_dataframe(
    df.rename(columns={'tweets': 'tweet'}).explode('tweet')
)

Number of tweets: 496714 before Processing
Start processing tweets:
    1. Removed 0 empty tweets (total = 496714)
    2. Removed 4400 duplicates tweets (total = 492314)
    3. Removed 289682 tweets that start with @ (total = 202632)
    4. Removed 563 tweets that start with 'Wordle' (total = 202069)
    5. Removing URLs:


100%|██████████| 202069/202069 [00:00<00:00, 665972.16it/s]


    6. Removing HTML entities:


100%|██████████| 202069/202069 [00:00<00:00, 2338702.62it/s]


    7. Replacing ' with space:
    8. Tokenizing tweets


100%|██████████| 202069/202069 [00:09<00:00, 20596.35it/s]
100%|██████████| 202069/202069 [00:00<00:00, 405096.87it/s]


    9. Removed 57288 tweets with less than 5 words (total = 144781)
    * Removing non english languages
    10. Removed 4011 non english languages (total = 140770)
    11. Removing white space:


100%|██████████| 140770/140770 [00:00<00:00, 240985.84it/s]


    12. Removing numbers:


100%|██████████| 140770/140770 [00:00<00:00, 527184.29it/s]
100%|██████████| 140770/140770 [00:01<00:00, 73079.15it/s]


    13. Removed 0 empty tweets and less than 5 words (total = 140770)
    14. Removing HTML entities again:


100%|██████████| 140770/140770 [00:00<00:00, 2463655.03it/s]


    15. Removed 2315 duplicates tweets (total = 138455)
Number of tweets: 138455 after processing


In [14]:
# Show the cleaned table (one tweet per row; the index still carries the pre-explode row label).
df

Unnamed: 0,user_id,tweet
0,1511561411696148480,The ex came by to wish me a Happy th. 🤣
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...
0,1511561411696148480,Possibly using sperm and the egg from unvaccin...
0,1511561411696148480,Hear me out first. Movies always ahead of time...
0,1511561411696148480,Honestly I don t even believe that Trump can w...
...,...,...
5092,987419369599676416,"As in the US, the power still lies in the peop..."
5092,987419369599676416,"""Here’s the truth and until a threshold of Ame..."
5092,987419369599676416,. percent of American voters don’t fully trust...
5092,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.


In [15]:
# # Process tokens
# tokens = df['tokens'].tolist()
# new_tweets = process_tokens(tokens)

# df['new_tweet'] = new_tweets
# del tokens, new_tweets

# df

# Topic modeling:

In [16]:
# Shell escape: print the GPU status before the embedding model is loaded.
!nvidia-smi

Wed Oct  2 22:53:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070        Off | 00000000:0B:00.0  On |                  N/A |
|  0%   54C    P8              26W / 220W |     76MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [17]:
# Sentence encoder used to embed the tweets (the smaller of the two candidate models).
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('sentence-transformers/sentence-t5-base')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [18]:
# Collect the tweet texts as a NumPy array and show how many there are.
tweet_series = df["tweet"]
all_tweets = tweet_series.to_numpy()
len(all_tweets)

138455

In [19]:
# Embed every tweet with the sentence encoder, in batches of 240, on the selected device.
embeddings = model.encode(
    all_tweets,
    batch_size=240,
    device=device,
    show_progress_bar=True,
)

Batches: 100%|██████████| 577/577 [00:23<00:00, 24.55it/s]


In [20]:
# Sanity check: shape of the embedding matrix (one row per tweet).
embeddings.shape

(138455, 384)

In [21]:
# Reduce the sentence embeddings to 5 dimensions with UMAP (cosine metric) before clustering.
# random_state pins UMAP's stochastic optimisation so that the reduced embeddings, and
# therefore the topics derived from them, can be reproduced after a kernel restart;
# the community detection and user sampling in this notebook are already seeded the same way.
# NOTE: per the umap-learn docs, fixing random_state disables some parallelism, so this
# cell may run slower than the unseeded version.
umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

umap_embeddings = umap.fit_transform(embeddings)

In [22]:
# Sanity check: the reduced embeddings keep one row per tweet with n_components columns.
umap_embeddings.shape

(138455, 5)

In [23]:
# BERTopic is later fitted on embeddings that were already reduced with UMAP, so it is
# given an "empty" dimensionality-reduction model instead of running its own reduction.
umap_model = BaseDimensionalityReduction()
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english") #, min_df=50)

# Bag-of-words vectoriser used for the topic representations.
vectorizer_model = CountVectorizer(
    min_df=10,  # originally 15
    stop_words="english",
    # ngram_range=(1, 2),
    # token_pattern=r"\b\w\w+\b|(?<!\w)@\w+|(?<!\w)#\w+",
)
# Note: adding `|[^\s]+` to the token pattern would also capture emoji, but it matches far
# more than emoji. The better way is the emoji library's own regular expression, see:
# https://carpedm20.github.io/emoji/docs/#regular-expression

# vectorizer_model = TfidfVectorizer(
#     ngram_range=(1, 2),
#     stop_words="english",
#     min_df=10, # originally 15
#     token_pattern=r"\b\w\w+\b|(?<!\w)@\w+|(?<!\w)#\w+",
# )


# ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Author's note: BERTopic preprocesses English text by default; a language value other
# than "english" is passed on purpose to avoid that preprocessing.
topic_model = BERTopic(
    language="dont_preprocess_please",
    min_topic_size=400,  # originally 400
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    verbose=True,
    # top_n_words=100,
)

In [24]:
# Documents given to BERTopic: the cleaned tweets (the token-processed variant is disabled).
# new_all_tweets = df["new_tweet"].to_numpy()
new_all_tweets = all_tweets

# Fit the topic model on the pre-computed, UMAP-reduced embeddings.
topics, probs = topic_model.fit_transform(new_all_tweets, embeddings=umap_embeddings)

2024-10-02 22:54:24,967 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-02 22:54:24,968 - BERTopic - Dimensionality - Completed ✓
2024-10-02 22:54:24,969 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

2024-10-02 22:54:43,767 - BERTopic - Cluster - Completed ✓
2024-10-02 22:54:43,780 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-02 22:54:44,721 - BERTopic - Representation - Completed ✓


In [25]:
# Overview of the fitted topics: id, size, name, top words and representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,69645,-1_people_like_just_don,"[people, like, just, don, time, new, trump, go...",[I ve been thinking about #Sanditon in readine...
1,0,11014,0_fbi_trump_documents_doj,"[fbi, trump, documents, doj, gun, mar, lago, p...","[The FBI fabricated all of this., Trump had mo..."
2,1,6716,1_climate_inflation_energy_gas,"[climate, inflation, energy, gas, water, tax, ...","[""There are powerful hurricanes being formed o..."
3,2,4810,2_vote_democrats_gop_republicans,"[vote, democrats, gop, republicans, party, rep...",[No split votes would give a win to the dems. ...
4,3,4409,3_know_people_just_don,"[know, people, just, don, really, like, think,...","[They really, really think we are stupid... 🤣 ..."
5,4,3314,4_abortion_roe_abortions_pro,"[abortion, roe, abortions, pro, women, babies,...","[Abortion Facts Precede Abortion Law, Abortion..."
6,5,3145,5_god_jesus_church_lord,"[god, jesus, church, lord, bible, christian, c...","[There is no loneliness where God is., Mary, m..."
7,6,3008,6_covid_vaccine_pandemic_virus,"[covid, vaccine, pandemic, virus, health, deat...","[Only days in iso for those ""without symptoms..."
8,7,2727,7_women_trans_gender_men,"[women, trans, gender, men, children, sex, tra...","[There is no such thing as a ""cis"" woman. Ther..."
9,8,2388,8_twitter_tweet_musk_elon,"[twitter, tweet, musk, elon, tweets, account, ...",[Can t wait to hear the response from all the ...


In [26]:
# topic_model.reduce_topics(new_all_tweets, nr_topics=30) # No need to reduce the topics

In [27]:
# First 20 rows of the topic overview.
topic_model.get_topic_info().head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,69645,-1_people_like_just_don,"[people, like, just, don, time, new, trump, go...",[I ve been thinking about #Sanditon in readine...
1,0,11014,0_fbi_trump_documents_doj,"[fbi, trump, documents, doj, gun, mar, lago, p...","[The FBI fabricated all of this., Trump had mo..."
2,1,6716,1_climate_inflation_energy_gas,"[climate, inflation, energy, gas, water, tax, ...","[""There are powerful hurricanes being formed o..."
3,2,4810,2_vote_democrats_gop_republicans,"[vote, democrats, gop, republicans, party, rep...",[No split votes would give a win to the dems. ...
4,3,4409,3_know_people_just_don,"[know, people, just, don, really, like, think,...","[They really, really think we are stupid... 🤣 ..."
5,4,3314,4_abortion_roe_abortions_pro,"[abortion, roe, abortions, pro, women, babies,...","[Abortion Facts Precede Abortion Law, Abortion..."
6,5,3145,5_god_jesus_church_lord,"[god, jesus, church, lord, bible, christian, c...","[There is no loneliness where God is., Mary, m..."
7,6,3008,6_covid_vaccine_pandemic_virus,"[covid, vaccine, pandemic, virus, health, deat...","[Only days in iso for those ""without symptoms..."
8,7,2727,7_women_trans_gender_men,"[women, trans, gender, men, children, sex, tra...","[There is no such thing as a ""cis"" woman. Ther..."
9,8,2388,8_twitter_tweet_musk_elon,"[twitter, tweet, musk, elon, tweets, account, ...",[Can t wait to hear the response from all the ...


In [28]:
# Size of the vocabulary held by the fitted vectorizer.
topic_model.vectorizer_model.get_feature_names_out().shape

(5890,)

In [29]:
# Store feature names into a file
# with open(f'feature_names.txt', 'w') as f:
#     for item in topic_model.vectorizer_model.get_feature_names_out():
#         f.write("%s\n" % item)
# the words of topic 3
# topic_model.get_topic(0)
# df

In [30]:
# Sanity check: there must be exactly one topic assignment per tweet row.
num_assignments, num_tweets = len(topics), len(df)
print(f"Number of topics: {num_assignments}")
print(f"Number of tweets: {num_tweets}")
assert num_assignments == num_tweets, "The number of topics is not equal to the number of tweets"

# Attach the topic id to every tweet.
df["topic"] = topics

# Eyeball ten random tweets that ended up in topic -1.
is_topic_minus_one = df["topic"].eq(-1)
df[is_topic_minus_one]["tweet"].sample(10).tolist()

Number of topics: 138455
Number of tweets: 138455


['Social Credit Score system coming soon. Just like in China or even worse. #SocialCredit',
 'Your mouth to God s ears 💙',
 'But, people *want* to think they are doing tabatas, and will brag about their tabata treadmills and tabata planks and tabata deadlifts or such. This is because it seems cool, and sexy.',
 'I just threw up in my mouth :(',
 'Very sad to learn about the passing of Congresswoman Jackie Walorski and two of her staff members. My heart is with their loved ones and the entire staff during this difficult time.',
 'Dear Presidents, Senators, Congress Men and Women, Governors, all politicians and representatives,',
 'Do you see what they are doing? They are going to get people killed.',
 'The first screenshot literally shows a video where I attack conservatives, speak at penn state and say conservative censorship is overblown, and debunking race realist talking points. But go off king!',
 'the only crime that matters in fiction is being boring',
 'From the criminal homeles

In [31]:
# Eyeball ten random tweets assigned to topic 2.
is_topic_two = df["topic"].eq(2)
df[is_topic_two]["tweet"].sample(10).tolist()

['Seems the #GOP wants to function like the #Taliban. They limit the #rights of women, restrict #voting rights & want to whitewash US #History. And they can t figure out why they are losing voters.',
 'TODAY!!! Get those ballots in or come in to ANY vote center to get a new one printed!!! Your voice matters 🇺🇸',
 'Let the fist-pumping COMMENCE: Kick-a*s thread lays out what the GOP agenda SHOULD be and BOO-FREAKIN’-YAH via @twitchyteam',
 'Republican leaders lie about being for freedom but they vote against everything good for Americans including Law and Order. This is the midterm message! Republicans are unAmerican.',
 'Democrat turnout could have been higher than Republican in NY- because they also had primaries and another special election to make it worth getting out and voting.',
 '🇺🇸🇺🇸🇺🇸Donald J. Trump🇺🇸🇺🇸🇺🇸 “We the peoples th President”',
 'He lives in DELUSIONAL AMERICA where he is president ! The men in the white coats are waiting to take him back to the white house !',
 'Plea

In [32]:
# (Re-)attach the topic id to every tweet.
# NOTE: this cell is not re-runnable on its own. Once the rows of topic -1 have been
# dropped below, len(df) no longer matches len(topics) and this assignment fails.
df["topic"] = topics

# Remove topic -1
has_real_topic = df["topic"].ne(-1)
df = df[has_real_topic]


df

Unnamed: 0,user_id,tweet,topic
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3
0,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6
0,1511561411696148480,Hear me out first. Movies always ahead of time...,6
0,1511561411696148480,I don t care if they get Brittany griner out o...,10
0,1511561411696148480,They would love to bankrupt our economy so the...,1
...,...,...,...
5092,987419369599676416,"If you read one thing today, this is it. What ...",3
5092,987419369599676416,Dis guy right here has the receipts.,0
5092,987419369599676416,So basically the three letter agencies were / ...,8
5092,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2


In [33]:
# Remove the tokens column (disabled)
# df = df.drop(columns=["tokens"])

# Replace the index with a fresh 0..n-1 range; the old labels are discarded (drop=True).
df = df.reset_index(drop=True)

df

Unnamed: 0,user_id,tweet,topic
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3
1,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6
2,1511561411696148480,Hear me out first. Movies always ahead of time...,6
3,1511561411696148480,I don t care if they get Brittany griner out o...,10
4,1511561411696148480,They would love to bankrupt our economy so the...,1
...,...,...,...
68805,987419369599676416,"If you read one thing today, this is it. What ...",3
68806,987419369599676416,Dis guy right here has the receipts.,0
68807,987419369599676416,So basically the three letter agencies were / ...,8
68808,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2


In [34]:
# Build the topic table: take BERTopic's topic overview, drop the row of topic -1,
# renumber the rows from 0 and discard the now-redundant "Topic" column.
df_topic = (
    topic_model.get_topic_info()
    .loc[lambda info: info["Topic"] != -1]
    .reset_index(drop=True)
    .drop(columns=["Topic"])
)

df_topic

Unnamed: 0,Count,Name,Representation,Representative_Docs
0,11014,0_fbi_trump_documents_doj,"[fbi, trump, documents, doj, gun, mar, lago, p...","[The FBI fabricated all of this., Trump had mo..."
1,6716,1_climate_inflation_energy_gas,"[climate, inflation, energy, gas, water, tax, ...","[""There are powerful hurricanes being formed o..."
2,4810,2_vote_democrats_gop_republicans,"[vote, democrats, gop, republicans, party, rep...",[No split votes would give a win to the dems. ...
3,4409,3_know_people_just_don,"[know, people, just, don, really, like, think,...","[They really, really think we are stupid... 🤣 ..."
4,3314,4_abortion_roe_abortions_pro,"[abortion, roe, abortions, pro, women, babies,...","[Abortion Facts Precede Abortion Law, Abortion..."
5,3145,5_god_jesus_church_lord,"[god, jesus, church, lord, bible, christian, c...","[There is no loneliness where God is., Mary, m..."
6,3008,6_covid_vaccine_pandemic_virus,"[covid, vaccine, pandemic, virus, health, deat...","[Only days in iso for those ""without symptoms..."
7,2727,7_women_trans_gender_men,"[women, trans, gender, men, children, sex, tra...","[There is no such thing as a ""cis"" woman. Ther..."
8,2388,8_twitter_tweet_musk_elon,"[twitter, tweet, musk, elon, tweets, account, ...",[Can t wait to hear the response from all the ...
9,2291,9_biden_joe_speech_president,"[biden, joe, speech, president, trump, maga, d...","[Joe Biden don’t care., From Joe Biden and the..."


In [35]:
# Persist the per-tweet table and the topic table as parquet files in the dataset's output folder.
tweets_out_path = f"output/{data_name}/tweets.parquet"
topics_out_path = f"output/{data_name}/topics.parquet"
df.to_parquet(tweets_out_path)
df_topic.to_parquet(topics_out_path)

In [36]:
# Vocabulary of the fitted vectorizer, in feature (column) order.
word_list = topic_model.vectorizer_model.get_feature_names_out()

# Wrap the vocabulary in a one-column table and store it next to the other outputs.
df_words = pd.DataFrame(word_list, columns=["word"])
words_out_path = f"output/{data_name}/list_of_words.parquet"
df_words.to_parquet(words_out_path)

df_words

Unnamed: 0,word
0,abandon
1,abandoned
2,abbott
3,abc
4,abcaustralia
...,...
5885,zealand
5886,zero
5887,zerohedge
5888,zip


# Building the HIN:

In [37]:
# Show the per-tweet table (user_id, tweet, topic) that the matrices below are built from.
df

Unnamed: 0,user_id,tweet,topic
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3
1,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6
2,1511561411696148480,Hear me out first. Movies always ahead of time...,6
3,1511561411696148480,I don t care if they get Brittany griner out o...,10
4,1511561411696148480,They would love to bankrupt our economy so the...,1
...,...,...,...
68805,987419369599676416,"If you read one thing today, this is it. What ...",3
68806,987419369599676416,Dis guy right here has the receipts.,0
68807,987419369599676416,So basically the three letter agencies were / ...,8
68808,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2


In [38]:
# TODO FIX me
# NOTE(review): G still holds every node of the largest connected component, while df has
# since been reduced to the balanced user sample and to tweets with an assigned topic.
# Presumably the TODO is about bringing the graph and the table back in sync; confirm
# with the author.
G.number_of_edges(), G.number_of_nodes()

(10585, 5092)

In [39]:
# Remove the users that are not in the graph
# df = df[df['user_id'].isin(G.nodes)]
# df

In [40]:
# Reuse the vectorizer held by the topic model so tweets are encoded with the same vocabulary.
vectorizer_model = topic_model.vectorizer_model
vectorizer_model

In [41]:
# Vectorize the remaining tweets with the topic model's vectorizer.
# The result is a sparse tweet-by-word matrix.
tweet_texts = df['tweet']
X_Tweets = vectorizer_model.transform(tweet_texts)

In [42]:
# Binarise the tweet-by-word matrix: every positive entry becomes 1.
# This is done unconditionally; the TfidfVectorizer-only guard below is disabled.
# if isinstance(vectorizer_model, TfidfVectorizer):
#     X_Tweets[X_Tweets > 0] = 1
#     print("Converted to binary")
positive_entries = X_Tweets > 0
X_Tweets[positive_entries] = 1
print("Converted to binary")

# Peek at columns 100..499 of the first tweet's row.
X_Tweets[0].toarray().ravel()[100:500]

Converted to binary


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [43]:
# X_document = X_document.toarray()
# X_document.shape

In [44]:
# Sanity check: (number of tweets, vocabulary size).
X_Tweets.shape

(68810, 5890)

In [45]:
# Persist the tweet-by-word matrix in SciPy's sparse .npz format.
from scipy.sparse import save_npz, load_npz

x_tweets_out_path = f"output/{data_name}/X_Tweets_sparse.npz"
save_npz(x_tweets_out_path, X_Tweets)

# Tweets to Topics matrix:

In [46]:
# Ensure df has a fresh 0..n-1 index, so that index labels coincide with row positions
# before the sparse matrices are built.
df = df.reset_index(drop=True)
df

Unnamed: 0,user_id,tweet,topic
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3
1,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6
2,1511561411696148480,Hear me out first. Movies always ahead of time...,6
3,1511561411696148480,I don t care if they get Brittany griner out o...,10
4,1511561411696148480,They would love to bankrupt our economy so the...,1
...,...,...,...
68805,987419369599676416,"If you read one thing today, this is it. What ...",3
68806,987419369599676416,Dis guy right here has the receipts.,0
68807,987419369599676416,So basically the three letter agencies were / ...,8
68808,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2


In [47]:
# Width of the tweet-to-topic matrix.
# Topic ids are used directly as column indices of that matrix, so the width has to be
# (largest topic id + 1). Counting distinct ids gives the same number only while every
# id from 0 to the maximum still has at least one tweet in df; if an id disappears
# (e.g. after additional filtering), the count is too small and building the matrix fails.
# An empty df yields 0, as before.
num_of_topics = int(df["topic"].max()) + 1 if len(df) else 0
num_of_topics

35

In [48]:
# Make a sparse matrix of tweets and topics
from scipy.sparse import csr_matrix

In [49]:
# One-hot tweet-to-topic matrix: row r belongs to the r-th row of df and holds a single 1
# in the column given by that tweet's topic id.
# Row coordinates are positions (0..len(df)-1) rather than df.index labels, so the rows
# stay aligned with X_Tweets (which is built positionally from df['tweet']) even when
# df's index is not a fresh 0..n-1 range.
tweet_rows = np.arange(len(df))
topic_cols = df["topic"].to_numpy()
Tweets2Topics = csr_matrix(
    (np.ones(len(df)), (tweet_rows, topic_cols)),
    shape=(len(df), num_of_topics),
)

In [50]:
# Dense view of the first tweet's row: a single 1 in the column of its topic.
Tweets2Topics[0].toarray()

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]])

In [51]:
# Persist the tweet-to-topic matrix in SciPy's sparse .npz format.
tweets2topics_out_path = f"output/{data_name}/Tweets2Topics_sparse.npz"
save_npz(tweets2topics_out_path, Tweets2Topics)

In [52]:
# Sanity check: (number of tweets, number of topics).
Tweets2Topics.shape

(68810, 35)

# Tweets2Users matrix:

In [53]:
# Show the per-tweet table again before the user index is added.
df

Unnamed: 0,user_id,tweet,topic
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3
1,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6
2,1511561411696148480,Hear me out first. Movies always ahead of time...,6
3,1511561411696148480,I don t care if they get Brittany griner out o...,10
4,1511561411696148480,They would love to bankrupt our economy so the...,1
...,...,...,...
68805,987419369599676416,"If you read one thing today, this is it. What ...",3
68806,987419369599676416,Dis guy right here has the receipts.,0
68807,987419369599676416,So basically the three letter agencies were / ...,8
68808,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2


In [54]:
# Give every user a consecutive integer index, in order of first appearance in df.
unique_user_ids = df['user_id'].unique()
uid_to_index = dict(zip(unique_user_ids, range(len(unique_user_ids))))

In [55]:
# Translate each tweet's user_id into that user's integer index and store it as a new column.
user_positions = df['user_id'].map(uid_to_index)
df['user_index'] = user_positions
df

Unnamed: 0,user_id,tweet,topic,user_index
0,1511561411696148480,Ok try not to Shit on yourself we forget your ...,3,0
1,1511561411696148480,Possibly using sperm and the egg from unvaccin...,6,0
2,1511561411696148480,Hear me out first. Movies always ahead of time...,6,0
3,1511561411696148480,I don t care if they get Brittany griner out o...,10,0
4,1511561411696148480,They would love to bankrupt our economy so the...,1,0
...,...,...,...,...
68805,987419369599676416,"If you read one thing today, this is it. What ...",3,2040
68806,987419369599676416,Dis guy right here has the receipts.,0,2040
68807,987419369599676416,So basically the three letter agencies were / ...,8,2040
68808,987419369599676416,Right up until that LAST DEM BALLOTT DUMP.,2,2040


In [56]:
# Create a sparse tweet-to-user matrix: row r belongs to the r-th row of df and holds a
# single 1 in the column of the user who wrote that tweet.
# Row coordinates are positions (0..len(df)-1) rather than df.index labels, so the rows
# stay aligned with the other tweet-indexed matrices even when df's index is not a fresh
# 0..n-1 range.
tweet_rows = np.arange(len(df))
user_cols = df["user_index"].to_numpy()
Tweets2Users = csr_matrix(
    (np.ones(len(df)), (tweet_rows, user_cols)),
    shape=(len(df), len(uid_to_index)),
)
Tweets2Users.shape

(68810, 2041)

In [57]:
# Persist the tweet-to-user matrix in SciPy's sparse .npz format.
tweets2users_out_path = f"output/{data_name}/Tweets2Users_sparse.npz"
save_npz(tweets2users_out_path, Tweets2Users)

In [58]:
# Store the user_id -> user_index mapping as a two-column parquet table.
uid_index_pairs = list(uid_to_index.items())
df_uid_to_index = pd.DataFrame(uid_index_pairs, columns=['user_id', 'user_index'])
uid_index_out_path = f"output/{data_name}/uid_to_index.parquet"
df_uid_to_index.to_parquet(uid_index_out_path)
df_uid_to_index

Unnamed: 0,user_id,user_index
0,1511561411696148480,0
1,1519024813561368576,1
2,1537105274757976065,2
3,438897163,3
4,55712928,4
...,...,...
2036,1489362675889065985,2036
2037,1424448523244871688,2037
2038,438584384,2038
2039,999149146429906944,2039


# Users Labels:

In [59]:
# Recap of the community detection (Louvain algorithm) and of the balancing step.
print(f"Number of communities: {len(communities)}")

for rank, members in enumerate(communities):
    print(f"    -> Community {rank}: {len(members)}")

print("We balanced the communities by randomly selecting number of users equal to the smaller community")
print(f"    -> Community 0: {len(com_0)}, Community 1: {len(com_1)}")

Number of communities: 2
    -> Community 0: 3945
    -> Community 1: 1147
We balanced the communities by randomly selecting number of users equal to the smaller community
    -> Community 0: 1147, Community 1: 1147


In [60]:
# Build one community label per indexed user: 0 if the user is in the balanced
# community 0 (com_0), 1 if in the balanced community 1 (com_1).
# Labels are appended in uid_to_index iteration order, so position k is the label of the
# user whose user_index is k.
# Raises ValueError if an indexed user belongs to neither balanced community.
#
# Membership is checked against sets (constant-time lookups) instead of the original
# lists. The former inner loop over `communities` is removed: it always finished on its
# first pass (append-and-break or raise) and never used its loop variables.
com_0_members, com_1_members = set(com_0), set(com_1)

users_labels = []
for uid in uid_to_index:
    if uid in com_0_members:
        users_labels.append(0)
    elif uid in com_1_members:
        users_labels.append(1)
    else:
        raise ValueError("User not in any community")

users_labels[:5]

[1, 1, 1, 0, 0]

In [61]:
# Report how many users received a label, then turn the label list into a NumPy array.
num_labeled_users = len(users_labels)
print(f"Number of users: {num_labeled_users}")

users_labels = np.array(users_labels)

Number of users: 2041


In [62]:
# Persist the label array; as the file name says, it is ordered like the user columns of Tweets2Users.
users_labels_out_path = f"output/{data_name}/users_labels_(Same_word_as_Tweets2Users).npy"
np.save(users_labels_out_path, users_labels)