In [37]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.stats import kendalltau
import matplotlib.pyplot as plt
import random

# from wordcloud import WordCloud

In [38]:
def U_normalize(Adj):
    """Return ``Adj`` with every row rescaled to unit L1 norm (axis=1)."""
    row_scaled = normalize(Adj, axis=1, norm='l1')
    return row_scaled

def V_normalize(Adj):
    """Return ``Adj`` with every column rescaled to unit L1 norm (axis=0)."""
    column_scaled = normalize(Adj, axis=0, norm='l1')
    return column_scaled

In [39]:
# Select the dataset to analyse by position: index 0 -> 'abortion', index 1 -> 'gun'.
dataset = ['abortion','gun'][1]

print(f"The dataset is {dataset}")

# Load the per-dataset tables. All paths are relative to the notebook's working directory.
# df_words_list: vocabulary table (its 'word' column is used further down).
df_words_list = pd.read_parquet(f'output/{dataset}/list_of_words.parquet')
# df_topics: topic table (its 'Name' column is used further down).
df_topics = pd.read_parquet(f'output/{dataset}/topics.parquet')
# df_tweets: one row per tweet, with 'user_id', 'tweet' and 'topic' columns.
df_tweets = pd.read_parquet(f'output/{dataset}/tweets.parquet')
# df_users: user table (its 'user_id' column is used further down).
df_users = pd.read_parquet(f'output/{dataset}/uid_to_index.parquet')
# df_label: loaded here but not referenced by the cells visible in this notebook section.
df_label = pd.read_feather(f'data/{dataset}/allsides.feather')

The dataset is gun


In [40]:
# Load the precomputed sparse matrices for the selected dataset.
# Going by the file names: TW2U links tweets to users and TW2T links tweets to topics.
# TW2W is presumably tweets x vocabulary words (its columns are later used to index words_list) - verify.
TW2U = sp.load_npz(f'output/{dataset}/Tweets2Users_sparse.npz')
TW2W = sp.load_npz(f'output/{dataset}/X_Tweets_sparse.npz')
TW2T = sp.load_npz(f'output/{dataset}/Tweets2Topics_sparse.npz')

# Sanity check: all three matrices should have the same number of rows (the tweet axis).
TW2U.shape, TW2W.shape, TW2T.shape

((117327, 4404), (117327, 8818), (117327, 61))

In [41]:
# One label per user. The file name suggests the order matches the user axis of Tweets2Users
# (i.e. the columns of TW2U) - presumably so; verify against the script that wrote the file.
Users_Labels = np.load(f'output/{dataset}/users_labels_(Same_word_as_Tweets2Users).npy')
Users_Labels.shape

(4404,)

In [42]:
# Peek at the raw label vector.
Users_Labels

array([1, 1, 1, ..., 0, 0, 0])

In [43]:
# Class balance: number of users labelled 0 and number of users labelled 1.
f"Num of 0s: {np.sum(Users_Labels == 0)}, Num of 1s: {np.sum(Users_Labels == 1)}"

'Num of 0s: 2237, Num of 1s: 2167'

In [44]:
# User ids as an array, in the row order of df_users.
user_ids = df_users['user_id'].values
user_ids.shape

(4404,)

In [45]:
# Map user_id -> label by pairing the two arrays positionally.
# NOTE(review): this assumes df_users rows are in the same order as Users_Labels - confirm.
# zip() stops at the shorter input, so a length mismatch would go unnoticed here.
uid2label = dict(zip(user_ids, Users_Labels))
len(uid2label)

4404

In [46]:
# Give every tweet the label of its author.
# This adds a 'label' column to df_tweets in place; user_ids missing from uid2label map to NaN.
df_tweets['label'] = df_tweets['user_id'].map(uid2label)
df_tweets['label'].value_counts()

label
0    60699
1    56628
Name: count, dtype: int64

In [47]:
# Per-tweet label array; used below to build boolean masks over the tweet axis of the sparse matrices.
tweet_label = df_tweets['label'].values
tweet_label.shape

(117327,)

In [48]:
# Vocabulary as an array, so that arrays of word indices can be turned into words by fancy indexing.
words_list = df_words_list['word'].values

In [49]:
def sample_tweets(topic, label, keyword: str, num: int = 10, tweets=None, random_state=None) -> pd.DataFrame:
    """
    Return up to ``num`` tweets that match the given topic, label and keyword.

    Args:
        topic: Value compared for equality against the ``topic`` column.
        label: Value compared for equality against the ``label`` column.
        keyword (str): Pattern searched for in the ``tweet`` column, ignoring case.
            It is passed to ``Series.str.contains`` and is therefore interpreted
            as a regular expression.
        num (int): Maximum number of rows to return.
        tweets (pd.DataFrame, optional): Frame to search; it needs ``topic``,
            ``label`` and ``tweet`` columns. Defaults to the notebook-level ``df_tweets``.
        random_state (optional): Forwarded to ``DataFrame.sample`` so the draw can
            be made reproducible. ``None`` keeps the draw random.

    Returns:
        pd.DataFrame: All matching rows when there are fewer than ``num`` of them,
        otherwise a random sample of ``num`` matching rows.
    """
    source = df_tweets if tweets is None else tweets

    # Keep only the tweets of the requested topic and label.
    in_group = (source["topic"] == topic) & (source["label"] == label)
    candidates = source[in_group]

    # Build the keyword mask from the filtered frame itself, so mask and frame share
    # one index (a mask taken from the full frame makes pandas reindex it and warn).
    # na=False treats missing tweet text as "no match" instead of propagating NaN.
    has_keyword = candidates["tweet"].str.contains(keyword, case=False, na=False)
    matching_tweets = candidates[has_keyword]

    if len(matching_tweets) < num:
        return matching_tweets
    return matching_tweets.sample(num, random_state=random_state)


# Keywords of Community:

In [50]:
# Topic-to-word similarity for community 0 (tweets whose author label is 0).
# T2TW is the transpose of the tweet->topic matrix, i.e. topics x tweets.
T2TW = TW2T.T


# Path followed: T -> TW -> W (topics to tweets to words), restricted to label-0 tweets.
T2TW_0 = T2TW[:, tweet_label == 0]
TW2W_0 = TW2W[tweet_label == 0, :]
# T2TW_0.shape, TW2W_0.shape
# PM_L: every topic row L1-normalised; PM_R: every word column L1-normalised.
# NOTE: the names PM_L / PM_R are reassigned by the following cells.
PM_L = U_normalize(T2TW_0)
PM_R = V_normalize(TW2W_0)

# One row per topic, one column per word: cosine similarity between a topic's
# vector over tweets and a word's vector over tweets.
# NOTE(review): cosine similarity already rescales each vector by its own norm, so the
# L1 normalisation above should not change the result beyond rounding - confirm it is intended.
HS_T2W_C0 = cosine_similarity(PM_L, PM_R.T)

In [51]:
# Topic-to-word similarity for community 1 (tweets whose author label is 1).
# Keep only the label-1 tweets on the tweet axis of both matrices.
T2TW_1 = T2TW[:, tweet_label == 1]
TW2W_1 = TW2W[tweet_label == 1, :]
# Row-normalise the topic side and column-normalise the word side (L1); PM_L / PM_R are overwritten here.
PM_L = U_normalize(T2TW_1)
PM_R = V_normalize(TW2W_1)

# Result has one row per topic and one column per word.
HS_T2W_C1 = cosine_similarity(PM_L, PM_R.T)

In [52]:
# All 
PM_L = U_normalize(T2TW)
PM_R = V_normalize(TW2W)

HS_T2W_ALL = cosine_similarity(PM_L, PM_R.T)

In [53]:
def get_topic_words(topic_id, print_out = False, limit = 10) -> tuple:
    """
    Return the top words of a topic overall and the words that separate the two communities.

    The function reads the notebook-level similarity matrices ``HS_T2W_ALL``,
    ``HS_T2W_C0`` and ``HS_T2W_C1`` (row ``topic_id`` of each), the vocabulary
    array ``words_list`` and the ``Name`` column of ``df_topics``.

    Args:
        topic_id: Row used in the similarity matrices and key used to look up
            the name in ``df_topics['Name']``.
        print_out (bool): When true, print the name, the words and the scores.
        limit (int): Number of words kept in each list (default 10).

    Returns:
        tuple: ``(topic_name, topic_words, (c0_words, c0_scores), (c1_words, c1_scores))``.
        ``c0_*`` rank words by ``HS_T2W_C0 - HS_T2W_C1`` and ``c1_*`` by
        ``HS_T2W_C1 - HS_T2W_C0``, both in descending order.
    """
    topic_name = df_topics['Name'][topic_id]

    # The topic words: highest similarity over all tweets.
    top_all = np.argsort(HS_T2W_ALL[topic_id])[::-1][:limit]
    topic_words = words_list[top_all]

    # The profile words: rank by the similarity gap between the two communities.
    # One argsort per score vector supplies both the words and their scores.
    score_c0_topic = HS_T2W_C0[topic_id] - HS_T2W_C1[topic_id]
    top_c0 = np.argsort(score_c0_topic)[::-1][:limit]
    c0_words = words_list[top_c0]
    c0_scores = score_c0_topic[top_c0]

    score_c1_topic = HS_T2W_C1[topic_id] - HS_T2W_C0[topic_id]
    top_c1 = np.argsort(score_c1_topic)[::-1][:limit]
    c1_words = words_list[top_c1]
    c1_scores = score_c1_topic[top_c1]

    if print_out:
        print(f"Topic name: {topic_name}")

        print(f"\t Topic Words: {topic_words}")
        print(f"\t C0 Words: {c0_words}")
        print(f"\t C0 Scores: {c0_scores}")
        print(f"\t C1 Words: {c1_words}")
        print(f"\t C1 Scores: {c1_scores}")

    return topic_name, topic_words, (c0_words, c0_scores), (c1_words, c1_scores)

In [54]:
# Print the word profile of topic 1; the returned tuple is assigned to `_` and not used.
_ = get_topic_words(1, print_out=True)

Topic name: 1_fbi_trump_doj_court
	 Topic Words: ['fbi' 'trump' 'doj' 'documents' 'court' 'garland' 'justice' 'mar'
 'classified' 'lago']
	 C0 Words: ['trump' 'classified' 'documents' 'donald' 'thomas' 'secret' 'indict'
 'espionage' 'garland' 'lawyers']
	 C0 Scores: [0.17126463 0.13147257 0.12955069 0.09710182 0.09000833 0.08846839
 0.08845118 0.08465685 0.08097294 0.07947844]
	 C1 Words: ['fbi' 'raid' 'dossier' 'whistleblowers' 'collusion' 'hoax' 'prisoners'
 'hillary' 'political' 'spying']
	 C1 Scores: [0.19060109 0.13710187 0.07292381 0.07126841 0.07099452 0.06496717
 0.06293037 0.05838423 0.05324517 0.04604267]


In [55]:
# Up to 10 tweets with topic 1 and label 0 whose text contains "shadow" (case-insensitive).
df_tmp = sample_tweets(1, 0, "shadow", 10)
df_tmp

  matching_tweets = filtered_tweets[df_tweets.tweet.str.contains(keyword, case=False)]


Unnamed: 0,user_id,tweet,topic,label
1828,50401467,"“Not a shadow of a doubt,” said his attorney, ...",1,0
7344,27386099,"Clarence Thomas, Lindsey Graham, shadow docket...",1,0
41298,1294339508687654914,New Trump special counsel launches investigati...,1,0
65421,1122907600465608706,"Trump is running a shadow presidency, threaten...",1,0
80421,48508105,"""Shadow docket use by the - conservative major...",1,0
88595,1345821572255379458,"GLENS FALLS, N.Y. — Matt Castelli has spent mu...",1,0


In [56]:
# Show the tweet texts as a raw array, so they are not shortened by the table display.
df_tmp.tweet.values

array(['“Not a shadow of a doubt,” said his attorney, probably',
       'Clarence Thomas, Lindsey Graham, shadow docket… yeah this turned out as expected.',
       'New Trump special counsel launches investigation in Mueller’s shadow The Justice Department hopes to avoid pitfalls of the last special counsel investigation targeting Trump.',
       'Trump is running a shadow presidency, threatening the Biden administration with disruptive protests like the Canadian trucking disruption. He needs to be charged with crimes against the United States immediately.',
       '"Shadow docket use by the - conservative majority court also became a political foil with Senate Democrats calling it a potential abuse." "...Approximately % of those emergency docket orders were unanimous, with no justice noting their dissent."',
       'GLENS FALLS, N.Y. — Matt Castelli has spent much of his career in the shadows. Over nearly  years at the CIA, he hunted down terrorists in one way or another...'],
      d

In [57]:
# Every tweet with topic 1 whose author carries label 0.
dfy = df_tweets.query("topic == 1 and label == 0")
dfy

Unnamed: 0,user_id,tweet,topic,label
114,1328055791769837569,One thing is clear: We must #ArrestTrump,1,0
133,1328055791769837569,"If we elect more Democratic Senators, we have ...",1,0
134,1328055791769837569,NEWS: Former Trump Chief of Staff Mark Meadows...,1,0
141,1328055791769837569,"President Biden s SCOTUS nominee, Judge Ketanj...",1,0
145,1328055791769837569,Who else agrees that Clarence Thomas is compro...,1,0
...,...,...,...,...
117311,1397638838,Did Garland ever have any intention to prosecu...,1,0
117313,1397638838,Unless Garland never had any intention to pros...,1,0
117314,1397638838,Biden MUST replace Garland since his letter to...,1,0
117316,1397638838,Biden MUST replace Garland since his letter to...,1,0


In [58]:
# Build a user x word matrix by chaining user -> tweet -> word:
# U2W = TW2U.T @ TW2W, so U2W[u, w] sums TW2W[t, w] over tweets t weighted by TW2U[t, u].
U2TW = TW2U.T
U2W = U2TW @ TW2W
U2W.shape

(4404, 8818)

In [60]:
# Binarise U2W on a copy: every entry > 0 becomes 1 (the user used the word at least once).
# Entries that are not > 0 are left as they are, so this is a true 0/1 matrix only if U2W has no negatives.
U2W_bin = U2W.copy()
U2W_bin[U2W_bin > 0] = 1
# Sanity check on the first user: the positive entries of that row should all be 1.
U2W_bin[0].toarray()[U2W_bin[0].toarray() > 0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [61]:
# Split the user rows by community label (0 vs 1); the word columns are untouched.
U2W_bin_0 = U2W_bin[Users_Labels == 0]
U2W_bin_1 = U2W_bin[Users_Labels == 1]
U2W_bin_0.shape, U2W_bin_1.shape

((2237, 8818), (2167, 8818))

In [62]:
# Per word: column sum of the binarised matrix divided by the number of users in that community,
# i.e. the fraction of the community's users that use the word.
# The results are still 2-D here, with a single row (see the shapes displayed by this cell).
ratio_0 = U2W_bin_0.sum(axis=0) / U2W_bin_0.shape[0]
ratio_1 = U2W_bin_1.sum(axis=0) / U2W_bin_1.shape[0]
ratio_0.shape, ratio_1.shape

((1, 8818), (1, 8818))

In [63]:
# Flatten the single-row ratio results into 1-D arrays.
# np.asarray(...).ravel() yields the same values as `.A1`, but it also accepts a plain
# ndarray, so re-running this cell after the names are already flat no longer fails.
ratio_0 = np.asarray(ratio_0).ravel()
ratio_1 = np.asarray(ratio_1).ravel()
ratio_0.shape, ratio_1.shape

((8818,), (8818,))

In [64]:
# Peek at the per-word usage ratios of community 0.
ratio_0

array([0.00849352, 0.00268216, 0.00625838, ..., 0.00402325, 0.00312919,
       0.00581135])

In [65]:
# Show the tweet table, which now includes the 'label' column added earlier.
df_tweets

Unnamed: 0,user_id,tweet,topic,label
0,19937569,See. Proof. That conservatives are colluding w...,2,1
1,19937569,What say about release of #DoctorOfDeath #DrOf...,2,1
2,19937569,ALERT! Senator @SenGillibrand calls on DOJ and...,2,1
3,19937569,EVERY Gun Control Law INFRINGES upon the Const...,2,1
4,19937569,Nicholas Sandmann asks @elonmusk to release hi...,0,1
...,...,...,...,...
117322,881967184636895238,The opposite of prolife is deathcon,10,0
117323,881967184636895238,"Twitter be like ""oh you re viewing this profil...",0,0
117324,881967184636895238,"They start with ""dress code"" and escalate to ""...",15,0
117325,881967184636895238,All this bitcoin farming generating runway mel...,36,0


In [66]:
# Per-tweet topic array; used to build boolean masks over the tweet axis of the sparse matrices.
tweet_topic = df_tweets['topic'].values

In [67]:
def get_topic_words_with_ratio(topic_id, print_out = False, limit = 10) -> tuple:
    """
    Like the plain topic-word ranking, but each community's score gap is weighted
    by the share of that community's users who used the word within the topic.

    The function reads the notebook-level objects ``TW2U``, ``TW2W``,
    ``tweet_topic``, ``Users_Labels``, ``df_topics``, ``words_list`` and the
    similarity matrices ``HS_T2W_ALL``, ``HS_T2W_C0`` and ``HS_T2W_C1``.

    Args:
        topic_id: Topic whose tweets are kept (``tweet_topic == topic_id``); also
            the row used in the similarity matrices and the key for ``df_topics['Name']``.
        print_out (bool): When true, print the name, the words and the scores.
        limit (int): Number of words kept in each list (default 10).

    Returns:
        tuple: ``(topic_name, topic_words, (c0_words, c0_scores), (c1_words, c1_scores))``.
        ``c0`` scores are ``(HS_T2W_C0 - HS_T2W_C1) * ratio_0`` and ``c1`` scores are
        ``(HS_T2W_C1 - HS_T2W_C0) * ratio_1``, both ranked in descending order.
    """
    # Keep only the tweets about the topic, then chain user -> tweet -> word.
    on_topic = tweet_topic == topic_id
    U2W_t = TW2U.T[:, on_topic] @ TW2W[on_topic, :]

    # 1.0 where the user used the word at least once in this topic, 0 elsewhere.
    # Built from a comparison instead of copying the matrix and assigning into it.
    U2W_bin_t = (U2W_t > 0).astype(np.float64)

    U2W_bin_0_t = U2W_bin_t[Users_Labels == 0, :]
    U2W_bin_1_t = U2W_bin_t[Users_Labels == 1, :]

    # Share of each community's users that use the word. The denominator is the whole
    # community, not only the users who tweeted about this topic.
    # np.asarray(...).ravel() flattens the single-row sum for np.matrix and ndarray results alike;
    # max(..., 1) avoids a division by zero when a community has no users (ratios are then all 0).
    ratio_0 = np.asarray(U2W_bin_0_t.sum(axis=0)).ravel() / max(U2W_bin_0_t.shape[0], 1)
    ratio_1 = np.asarray(U2W_bin_1_t.sum(axis=0)).ravel() / max(U2W_bin_1_t.shape[0], 1)

    topic_name = df_topics['Name'][topic_id]

    # The topic words: highest similarity over all tweets.
    top_all = np.argsort(HS_T2W_ALL[topic_id])[::-1][:limit]
    topic_words = words_list[top_all]

    # The profile words: community similarity gap weighted by the usage ratio.
    # One argsort per score vector supplies both the words and their scores.
    score_c0_topic = (HS_T2W_C0[topic_id] - HS_T2W_C1[topic_id]) * ratio_0
    top_c0 = np.argsort(score_c0_topic)[::-1][:limit]
    c0_words = words_list[top_c0]
    c0_scores = score_c0_topic[top_c0]

    score_c1_topic = (HS_T2W_C1[topic_id] - HS_T2W_C0[topic_id]) * ratio_1
    top_c1 = np.argsort(score_c1_topic)[::-1][:limit]
    c1_words = words_list[top_c1]
    c1_scores = score_c1_topic[top_c1]

    if print_out:
        print(f"Topic name: {topic_name}")

        print(f"\t Topic Words: {topic_words}")
        print(f"\t C0 Words: {c0_words}")
        print(f"\t C0 Scores: {c0_scores}")
        print(f"\t C1 Words: {c1_words}")
        print(f"\t C1 Scores: {c1_scores}")

    return topic_name, topic_words, (c0_words, c0_scores), (c1_words, c1_scores)

In [68]:
# Print the ratio-weighted word profile of topic 13; the returned tuple is assigned to `_` and not used.
_ = get_topic_words_with_ratio(13, print_out=True, limit=10)

Topic name: 13_border_illegal_migrants_southern
	 Topic Words: ['border' 'migrants' 'illegal' 'immigration' 'southern' 'illegals'
 'immigrants' 'fentanyl' 'patrol' 'mexico']
	 C0 Words: ['immigration' 'migrants' 'mexico' 'migrant' 'immigrant' 'desantis'
 'reform' 'administration' 'stunt' 'seeking']
	 C0 Scores: [0.00140815 0.00136187 0.00132275 0.00108773 0.00034341 0.00023977
 0.00023498 0.00022784 0.00017868 0.00015617]
	 C1 Words: ['border' 'illegal' 'illegals' 'biden' 'amnesty' 'aliens' 'open'
 'invasion' 'southern' 'borders']
	 C1 Scores: [0.02059171 0.01854172 0.00935637 0.00734464 0.00705632 0.00634225
 0.00483362 0.0043232  0.00411758 0.00355266]


In [69]:
def join_and_Capitalize(str_list : list[str]) -> str:
    """Capitalize every entry of ``str_list`` and join the results with ``', '``."""
    capitalized = []
    for item in str_list:
        capitalized.append(item.capitalize())
    return ', '.join(capitalized)

# Quick check of the helper; expected result: 'Hello, World'.
join_and_Capitalize(['hello', 'world'])

'Hello, World'

In [70]:

# Build one summary row per topic: the topic name plus its leading topic / community-0 / community-1 words.
records = []

# Topics are addressed by position 0 .. len(df_topics) - 1.
for t in range(len(df_topics)):
    topic_name, topic_words, (c0_words, c0_scores), (c1_words, c1_scores) = get_topic_words_with_ratio(t, limit=10)
    
    # Ten words are requested per list, but only the first 7 of each go into the table.
    r = {
        'topic': topic_name,
        'topic_words': join_and_Capitalize(topic_words[:7]),
        'c0_words': join_and_Capitalize(c0_words[:7]),
        'c1_words': join_and_Capitalize(c1_words[:7]),
    }
    
    records.append(r)

# Assemble the frame once from the list of records; `columns` fixes the column order.
df_profile_words = pd.DataFrame(records, columns=['topic', 'topic_words', 'c0_words', 'c1_words'])
df_profile_words

Unnamed: 0,topic,topic_words,c0_words,c1_words
0,0_twitter_musk_elon_elonmusk,"Twitter, Elon, Musk, Elonmusk, Tweet, Tweets, ...","Musk, Elon, Twitter, New, Follow, Platform, Site","Elonmusk, Speech, Free, Censorship, Files, Cen..."
1,1_fbi_trump_doj_court,"Fbi, Trump, Doj, Documents, Court, Garland, Ju...","Trump, Documents, Court, Classified, Donald, G...","Fbi, Raid, Political, Collusion, Hillary, Hoax..."
2,2_gun_guns_police_shooting,"Gun, Police, Guns, Shooting, Shot, Assault, We...","Shooting, Violence, Police, School, Mass, Kill...","Control, Gun, Owners, Rights, Carry, Ban, Fire..."
3,3_vote_election_republicans_gop,"Vote, Election, Gop, Republicans, Democrats, P...","Democracy, Republicans, Gop, Vote, Blue, Repub...","Democrats, Election, Democrat, Ballots, Electi..."
4,4_stupid_know_yes_don,"Stupid, Know, Lie, Yes, Don, Liar, True","Let, Hope, Right, Just, Mean, Need, Fucking","Don, Think, People, Lie, Lies, Truth, Know"
...,...,...,...,...
56,56_security_social_medicare_seniors,"Security, Social, Medicare, Seniors, Ss, Benef...","Security, Social, Medicare, Republicans, Gop, ...","Ss, Recipients, Income, Elimination, Increase,..."
57,57_mask_masks_wearing_wear,"Mask, Masks, Wear, Wearing, Masking, Mandate, ...","Wear, Masks, Covid, Mask, Mandate, Flu, Wearing","Masking, Work, Masked, La, Driving, Science, H..."
58,58_today_moment_currently_feel,"Moment, Moon, Leo, Today, Currently, Spiritual...","Leo, Currently, Universe, Feel, Usual, Yo, Ach...","Prospect, Peer, Rude, Appearance, Recognition,..."
59,59_left_right_leftists_leftist,"Left, Leftists, Leftist, Right, Wing, Radical,...","Right, Wing, Close, Far, Sides, Trying, Reacti...","Left, Leftists, Leftist, Hate, Destroy, Contro..."


In [71]:
# df_profile_words.to_latex("profile_words.tex", index=False)

In [72]:
# Save the profile-word table as CSV, one file per dataset, without the index column.
# NOTE(review): the 'analysis/' directory presumably has to exist already - confirm before a fresh run.
df_profile_words.to_csv(f'analysis/{dataset}_profile_words.csv', index=False)