In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import networkx as nx
from collections import Counter
from bertopic import BERTopic




In [2]:
# Cargar el dataset
df = pd.read_csv("tweets.csv", encoding="latin-1", header=None)

In [3]:
# Limitar el DataFrame a los primeros 10000 registros
df = df.head(20000)

In [4]:
# Extraer los tweets y usuarios
tweets = df[5].astype(str).tolist()  # Convertir a lista para TF-IDF
usuarios = df[4].astype(str)

In [5]:
# Preprocesamiento de texto: eliminar URLs y caracteres especiales
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Eliminar URLs
    text = re.sub(r"[^\w\s]", "", text)  # Eliminar caracteres especiales
    text = re.sub(r"\b\d+\b", "", text)  # Eliminar números sin texto
    return text.lower()

tweets_cleaned = [clean_text(tweet) for tweet in tweets]

In [6]:
tweets[:5]

["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
 '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]

In [7]:
topic_model = BERTopic(calculate_probabilities=True)  # No need to specify k!
len(tweets)
topics, probs = topic_model.fit_transform(tweets[:20000])

# See number of topics found
print(f"Number of topics detected: {len(set(topics))}")

Number of topics detected: 232


In [8]:
## for topic, words in topic_model.get_topics().items():
for topic_id, words in topic_model.get_topics().items():
    word_list = [word[0] for word in words]
    print(f"Topic {topic_id}: {', '.join(word_list)}")

Topic -1: to, you, the, and, it, in, that, me, but, of
Topic 0: sleep, tired, bed, awake, early, up, sleeping, wake, hours, night
Topic 1: food, hungry, eat, ate, chocolate, dinner, lunch, breakfast, eating, cheese
Topic 2: rain, raining, rainy, weather, its, outside, here, sunny, nice, today
Topic 3: phone, iphone, blackberry, cell, battery, my, mobile, cellphone, charger, sprint
Topic 4: laptop, macbook, windows, computer, mac, pc, virus, pro, install, vista
Topic 5: car, bike, bus, traffic, train, ride, stuck, wheel, driving, parking
Topic 6: snow, snowing, april, winter, ground, snowed, spring, its, cold, outside
Topic 7: miss, you, too, see, misses, missed, kalpenn, here, youuu, leaving
Topic 8: know, mandayyy, yeah, welcome, yes, 21stcenturyfox, pedos, paulknebel, owwwwwww, ortega
Topic 9: sunday, saturday, tuesday, friday, weekend, monday, week, weekends, tuesdays, saturdays
Topic 10: study, test, exam, exams, studying, tests, belajar, assessments, quiz, pass
Topic 11: followers

In [9]:
# test the topic classification using a custom tweet
new_text = "i got a gym trainer and i hug"
topic, prob = topic_model.transform([new_text])

if topic != [-1]:
        print(f"Assigned Topic: {topic[0]}")
        print(f"Probability: {prob[0]}")
        print(f"Probability of it belonging to topic -1: {prob[0][0]}")
        print(f"Assigned Topics: {partial_sort(prob[0],prob[0][0])}")

Assigned Topic: 120
Probability: [9.40774083e-06 4.73765786e-06 5.52122745e-06 4.24352971e-06
 4.22858882e-06 6.55150314e-06 5.42262290e-06 4.47701006e-06
 4.03597111e-06 7.05958909e-06 5.77890219e-06 3.71811339e-06
 7.51025555e-06 4.18310708e-06 5.68770432e-06 6.89705248e-06
 4.44330763e-06 4.79940961e-06 7.57793908e-06 3.99518632e-06
 4.88635666e-06 4.78890521e-06 6.21296849e-06 5.90037042e-06
 3.85228779e-06 4.62766786e-06 6.39280929e-06 4.61291695e-06
 4.65191221e-06 5.28544820e-06 3.97014257e-06 7.77438688e-06
 3.91155230e-06 4.55454077e-06 6.70912848e-06 5.08693173e-06
 6.07361971e-06 5.79684343e-06 4.05806015e-06 3.07127494e-06
 4.29354662e-06 4.14413072e-06 6.26799753e-06 5.55098034e-06
 5.75627654e-06 4.24868837e-06 6.39953791e-06 6.01090057e-06
 6.03287563e-06 4.65929099e-06 3.94150871e-06 7.52786172e-06
 4.40559053e-06 5.99613338e-06 7.30428995e-06 3.13581900e-06
 4.14736025e-06 6.26158177e-06 4.14908349e-06 4.25635309e-06
 2.45478969e-06 4.24912354e-06 4.59688410e-06 4.2408

NameError: name 'partial_sort' is not defined

In [21]:
len(df)

20000

In [11]:
def partial_sort(arr, threshold):
    total = sum(arr)  # Compute sum of all probabilities
    if total == 0:
        return []  # Avoid division by zero

    normalized = [x / total for x in arr]  # Normalize probabilities

    # Sort (index, normalized value) pairs in descending order
    sorted_indices = sorted(enumerate(normalized), key=lambda x: x[1], reverse=True)

    if not sorted_indices:  # Avoid indexing an empty list
        return []

    # Set dynamic threshold: must be at least `threshold` and at least 25% of max value
    #in case you wanna return the probability too, just write val in the return value of the next line
    return [(idx) for idx, val in sorted_indices if val >= threshold and val >= sorted_indices[0][1] * 0.5]

In [12]:
# array where each item is a tuple with the user and the tweet
user_tweet_pairs = list(zip(df[4].astype(str), df[5].astype(str)))

In [13]:
# this one prints each tweet and the probability that it belongs to each topic
# dont need to run this to run the subsequent function
for user, tweet in user_tweet_pairs[:100]:
    topic, prob = topic_model.transform([tweet])
    if topic != [-1]:
        print(tweet)
        print(f"Assigned Topic: {topic}")
        print(f"Probability: {prob[0]}")
        print(f"Probability of it belonging to topic -1: {prob[0][0]}")
        print(f"Assigned Topics: {partial_sort(prob[0],prob[0][0],)}")

my whole body feels itchy and like its on fire 
Assigned Topic: [87]
Probability: [0.00185982 0.00180174 0.00172623 0.00133441 0.00132647 0.00202964
 0.00178831 0.00145616 0.00135447 0.00165283 0.0014359  0.00120569
 0.00172472 0.00230802 0.00143084 0.00187098 0.00146912 0.00152275
 0.00225249 0.00145291 0.00152982 0.00151115 0.00193742 0.00375867
 0.00123476 0.00143384 0.0015302  0.00156374 0.00149509 0.0016747
 0.00128165 0.00264318 0.00125223 0.00150613 0.00180249 0.00187329
 0.0027227  0.00185287 0.00132422 0.00112957 0.00133743 0.00133734
 0.00243409 0.00142685 0.00168251 0.00140005 0.00175295 0.00180512
 0.00209122 0.00150263 0.00125361 0.00182117 0.00147622 0.00171395
 0.00168087 0.00103606 0.00130557 0.0020855  0.00129034 0.00142669
 0.00087155 0.0013431  0.00150824 0.00135163 0.00047668 0.00117265
 0.00131933 0.0020139  0.0014181  0.01236813 0.00098563 0.00137467
 0.0014819  0.00130613 0.00150043 0.00182091 0.00158953 0.00221803
 0.01569666 0.00152689 0.0007779  0.00175012 0.0

In [14]:
# this one asigns the tweet to several topics and stores it in an array. 
classificated_tweets = []  # Store results here

for user, tweet in user_tweet_pairs:
    topic, prob = topic_model.transform([tweet])

    if topic != [-1]:  # Only process valid topics
        assigned_topics = partial_sort(prob[0], prob[0][0])  # Get filtered topics
        
        # Store the result as a tuple (user, tweet, assigned_topics)
        classificated_tweets.append((user, tweet, assigned_topics))

# Example output
print(classificated_tweets[:100])  # Print first 5 results

[('mybirch', 'Need a hug ', [139]), ('coZZ', "@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?", [2]), ('mimismo', '@twittera que me muera ? ', [55, 89, 90, 188, 28, 193]), ('erinx3leannexo', "spring break in plain city... it's snowing ", [6]), ('robrobbierobert', '@octolinz16 It it counts, idk why I did either. you never talk to me anymore ', [132]), ('lovesongwriter', "Hollis' death scene will hurt me severely to watch on film  wry is directors cut not out now?", [79]), ('armotley', 'about to file taxes ', [148]), ('gi_gi_bee', '@FakerPattyPattz Oh dear. Were you drinking out of the forgotten table drinks? ', [16]), ('cooliodoc', '@angry_barista I baked you a cake but I ated it ', [1]), ('Ljelli3166', 'blagh class at 8 tomorrow ', [26]), ('ChicagoCubbie', 'I hate when I have to call and wake people up ', [0]), ('KatieAngell', 'Just going to cry myself to sleep after watching Marley and Me.  ', [152]), ('EmCDL', '@alielayus I want to 

In [24]:
topic_combinations = {}

for user, tweet, tweet_topics in classificated_tweets:
    if len(tweet_topics) > 1:  # Ensure there are at least 2 topics
        for i in range(len(tweet_topics)):
            for j in range(i + 1, len(tweet_topics)):  # Proper iteration
                topic1, topic2 = tweet_topics[i], tweet_topics[j]
                key = tuple(sorted((topic1, topic2)))  # Ensure consistent ordering
                
                if key in topic_combinations:
                    topic_combinations[key] += 1  # Increment count
                else:
                    topic_combinations[key] = 1  # Initialize count

# Print topic combinations
for key, value in topic_combinations.items():
    print(key, value)

(55, 89) 29
(55, 90) 20
(55, 188) 8
(28, 55) 10
(55, 193) 16
(89, 90) 20
(89, 188) 8
(28, 89) 10
(89, 193) 16
(90, 188) 16
(28, 90) 27
(90, 193) 35
(28, 188) 22
(188, 193) 21
(28, 193) 38
(20, 158) 45
(76, 158) 34
(90, 158) 37
(158, 228) 28
(20, 76) 27
(20, 90) 42
(20, 228) 26
(76, 90) 25
(76, 228) 24
(90, 228) 22
(2, 15) 57
(15, 22) 55
(15, 57) 30
(6, 15) 31
(2, 22) 60
(2, 57) 29
(2, 6) 32
(22, 57) 41
(6, 22) 42
(6, 57) 31
(11, 30) 26
(11, 143) 25
(11, 24) 25
(11, 50) 25
(11, 214) 24
(11, 202) 18
(11, 132) 15
(11, 32) 24
(30, 143) 36
(24, 30) 35
(30, 50) 35
(30, 214) 35
(30, 202) 27
(30, 132) 17
(30, 32) 34
(24, 143) 34
(50, 143) 34
(143, 214) 36
(143, 202) 27
(132, 143) 17
(32, 143) 34
(24, 50) 45
(24, 214) 33
(24, 202) 25
(24, 132) 17
(24, 32) 35
(50, 214) 34
(50, 202) 25
(50, 132) 17
(32, 50) 35
(202, 214) 28
(132, 214) 18
(32, 214) 36
(132, 202) 18
(32, 202) 26
(32, 132) 17
(113, 118) 13
(118, 216) 12
(33, 118) 5
(118, 206) 12
(118, 190) 10
(62, 118) 9
(113, 216) 20
(33, 113) 11
(

In [25]:
user_topics = {}
for user, tweet, tweet_topics in classificated_tweets:
    if user in user_topics:
        user_topics[user] = list(set(user_topics[user] + tweet_topics))
    else:
        user_topics[user] = tweet_topics
for key,value in user_topics.items():
    print(key,value)

mybirch [139]
coZZ [2, 22, 15]
mimismo [55, 89, 90, 188, 28, 193]
erinx3leannexo [6]
robrobbierobert [132]
lovesongwriter [79]
armotley [148]
gi_gi_bee [16, 8]
cooliodoc [1]
Ljelli3166 [26]
ChicagoCubbie [0]
KatieAngell [152]
EmCDL [158, 20, 76, 90, 228]
Pbearfox [7]
crosland_12 [21]
lionslamb [47]
kennypham [0]
hpfangirl94 [155]
labrt2004 [32]
tautao [56]
CiaraRenee [139]
deelau [17, 131]
gzacher [22]
xVivaLaJuicyx [66]
xpika [96]
ericg622 [15, 2, 22, 57, 6]
playboybacon [54]
emo_holic [47]
mscha [104]
calihonda2001 [36]
djwayneski [7]
Ceejison [108]
ItsBrigittaYo [1]
MissLaura317 [49, 205]
RoseMaryK [48]
perrohunter [47]
homeworld [11, 30, 143, 24, 50, 214, 202, 132, 32]
cityrat59 [56]
annette414 [128]
GetGary [4]
missannabanana [149]
Zella17 [126]
Jonas_Dreamgirl [8]
BreannaBonana [0]
becklyn13 [103]
sonyolmos [7]
BustaBusta [1]
MadameCrow [116]
bgoers [57]
MissPassion [2, 22, 15]
HiKeri [0]
R_Boucher [6]
josiahmcdermott [29]
varunkumar [118, 113, 216, 33, 206, 190, 62]
JenniOnTheBl

In [26]:
user_topic_combinations = {}
for user, tweet_topics in user_topics.items():
    if len(tweet_topics) > 1:
        for i in range(len(tweet_topics)):
            for j in range(i + 1, len(tweet_topics)):  # Proper iteration
                topic1, topic2 = tweet_topics[i], tweet_topics[j]
                key = tuple(sorted((topic1, topic2)))  # Ensure consistent ordering
                
                if key in user_topic_combinations:
                    user_topic_combinations[key] += 1  # Increment count
                else:
                    user_topic_combinations[key] = 1  # Initialize count
for key,value in user_topic_combinations.items():
    print(key,value)

(2, 22) 60
(2, 15) 59
(15, 22) 56
(55, 89) 27
(55, 90) 18
(55, 188) 7
(28, 55) 10
(55, 193) 15
(89, 90) 18
(89, 188) 7
(28, 89) 10
(89, 193) 15
(90, 188) 15
(28, 90) 27
(90, 193) 34
(28, 188) 22
(188, 193) 20
(28, 193) 38
(8, 16) 1
(20, 158) 45
(76, 158) 34
(90, 158) 37
(158, 228) 28
(20, 76) 27
(20, 90) 42
(20, 228) 26
(76, 90) 25
(76, 228) 24
(90, 228) 22
(17, 131) 2
(15, 57) 30
(6, 15) 31
(2, 57) 29
(2, 6) 32
(22, 57) 42
(6, 22) 42
(6, 57) 31
(49, 205) 1
(11, 30) 26
(11, 143) 25
(11, 24) 25
(11, 50) 26
(11, 214) 24
(11, 202) 18
(11, 132) 15
(11, 32) 24
(30, 143) 36
(24, 30) 35
(30, 50) 35
(30, 214) 35
(30, 202) 27
(30, 132) 18
(30, 32) 34
(24, 143) 34
(50, 143) 34
(143, 214) 36
(143, 202) 27
(132, 143) 17
(32, 143) 34
(24, 50) 45
(24, 214) 33
(24, 202) 25
(24, 132) 17
(24, 32) 36
(50, 214) 34
(50, 202) 25
(50, 132) 17
(32, 50) 35
(202, 214) 28
(132, 214) 18
(32, 214) 36
(132, 202) 18
(32, 202) 26
(32, 132) 17
(113, 118) 13
(118, 216) 12
(33, 118) 5
(118, 206) 12
(118, 190) 10
(62, 1

In [25]:
topic_popularity = {}
for user, tweet, tweet_topics in classificated_tweets:
    for t in tweet_topics:
        if t in topic_popularity:
            topic_popularity[t] += 1
        else: 
            topic_popularity[t] = 1
for key, value in topic_popularity.items():
    print(key,value)
with open("topic_popularity.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Topic", "Popularity"])  # Header
    for topic, weight in topic_popularity.items():
        writer.writerow([topic, weight])

print("Edges saved to topic_popularity.csv")

191 148
59 282
54 295
146 93
50 325
12 251
92 258
69 195
135 103
89 167
21 650
10 991
1 2189
13 1358
31 609
155 86
8 1001
4 1682
41 289
144 76
0 2327
68 330
2 1613
37 338
83 289
48 340
129 237
32 389
23 399
62 182
77 279
189 147
17 479
82 220
174 197
116 147
110 108
14 799
44 482
30 565
25 415
3 1440
177 212
125 184
187 233
145 214
78 395
151 139
120 320
169 312
46 482
179 185
28 405
175 170
170 99
197 92
218 126
97 178
113 153
16 552
122 229
199 230
193 193
7 877
198 147
154 111
107 79
137 96
9 861
100 237
205 158
101 305
51 218
106 149
52 219
22 386
190 95
182 180
5 1121
111 203
35 325
27 487
108 143
134 168
47 341
153 111
29 288
150 169
127 218
49 291
19 539
130 220
64 317
131 179
149 206
88 263
102 205
63 265
6 967
45 330
216 85
79 94
165 107
84 373
72 50
192 25
36 342
143 274
38 284
195 43
128 236
109 208
86 182
221 166
159 255
206 198
202 146
73 229
171 253
208 93
80 222
167 107
95 248
20 455
33 455
181 182
55 356
56 182
70 238
112 240
42 308
138 185
203 177
148 171
141 233
211 1

NameError: name 'csv' is not defined

In [17]:
import csv

In [27]:
# combine both arrays
combined = dict(user_topic_combinations)  # Start with the first dictionary

for key, value in topic_combinations.items():
    if key in combined:
        combined[key] += value * 2  # Add values if key exists
    else:
        combined[key] = value * 2  # Otherwise, add the new key-value pair

In [28]:
with open("graph_data.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Topic1", "Topic2", "Weight"])  # Header
    for (topic1, topic2), weight in combined.items():
        writer.writerow([topic1, topic2, weight])

print("Edges saved to graph_data.csv")

Edges saved to topics_edges.csv


In [25]:
#store topics in file
import json
topics_dict = {
    topic_id: [word[0] for word in words]
    for topic_id, words in topic_model.get_topics().items()
    if topic_id != -1  # Optional: skip outlier topic -1
}

# Save to JSON
with open("topics.json", "w", encoding="utf-8") as f:
    json.dump(topics_dict, f, indent=2, ensure_ascii=False)