In [1]:
import pandas as pd
from pathlib import Path 
import re
from collections import Counter
import nltk 
import string 
nltk.download('punkt')
from statistics import median
from statistics import mean
from lingua import Language, LanguageDetectorBuilder
import spacy
nlp = spacy.load("en_core_web_sm")
import torch
from transformers import BertTokenizer, BertModel

# need to downgrade numpy to before 2.0 
# on windows, need to enable long paths : https://www.microfocus.com/documentation/filr/filr-4/filr-desktop/t47bx2ogpfz7.html 
# also need to do through REGEDIT on windows
## if using windows 10, add gpedit.msc this way: https://www.reddit.com/r/AnnoyingTech/comments/ojru3t/adding_gpeditmsc_on_your_windows_home/

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emzou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#df = pd.read_csv("11_6_fulldataset.csv", index_col= 0)
df1 = pd.read_csv("11_6_fulldatapart1.csv")
df2 = pd.read_csv("11_6_fulldatapart2.csv")
df= pd.concat([df1, df2], ignore_index = True)           

In [14]:
## categorizing quoted by 
def process_quotes(s):
    if "Quoted By" in s: 
        return re.findall(r'>>(\d+)\n', s)
    else:
        modified_string = s  # no modification needed if "Quoted By" is not present
        return "No Quote"

df['quotedby'] = df['Identifier'].apply(process_quotes)

## removing it from the text 

def stripper (s): 
    if 'Quoted By' in s:
        cleaned_string = re.sub(r'Quoted By:|>>\d+\n', '', s)
        return cleaned_string.strip()
    else: 
        return s

df ['Text'] = df['Text'].apply(stripper)

In [15]:
## getting the reply-to out 

df['replyto'] = df['Text'].apply(lambda text: re.findall(r'>>(\d+)', text))

In [16]:
df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())

In [17]:
# strip website links from the text
# it means 'image of god' in latin 
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())
# strip 'imago dei' comments from the text
df = df[~df['Text'].str.contains('imago', case=False, na=False)]
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]

In [18]:
# there's so much more latin than i thought 
# lingua-py (https://github.com/pemistahl/lingua-py)
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [19]:
def latin_exterminator(s):
    confidence_value = detector.compute_language_confidence(s, Language.LATIN)
    cv = float(f"{confidence_value:.2f}") 
    if cv >= 0.5:
        return None
    else: 
        return s

In [20]:
#use the latin exterminator
df['Text'] = df['Text'].apply(latin_exterminator)
df = df[df['Text'].notnull()]

In [21]:
# drop duplicates by anon-id (this only refers to the post, not the account)
df = df.drop_duplicates(subset = 'anonid', keep = 'last')

In [22]:
# sentence detection based on punctuation
# convert string into list of strings, separated by sentences

def sentsplit (text):
    pattern = r'[^.!?]*[.!?]'
    sentences = [sentence.strip() for sentence in re.findall(pattern, text)]
    return sentences 

df['Text_Sent'] = df['Text'].apply(sentsplit)

In [32]:
nlp = spacy.load("en_core_web_sm", disable = ["ner", "tagger"])
def detect_sentences_spacy_pipe(text): 
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]



In [35]:
df ['Sent'] = df['Text'].apply(detect_sentences_spacy_pipe)



In [36]:
# get the differences between manual and spacy sentence split 
def you_a_mismatch(row):
    mismatches = sum(1 for m, s in zip(row['Text_Sent'], row['Sent']) if m != s)
    mismatches += abs(len(row['Text_Sent']) - len(row['Text_Sent']))
    return mismatches 

In [37]:
df['Mistmatch'] = df.apply(lambda r: you_a_mismatch(r), axis = 1)
df

Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto,Text_Sent,Sent,Mistmatch
16,:RqQXr/xt Sat 01 Oct 2022 13:20:28 No.397859125,Tumblr girls were the nerdy outcasts who went ...,:RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,No Quote,[],[Tumblr girls were the nerdy outcasts who went...,[Tumblr girls were the nerdy outcasts who went...,0
17,:+XF1CsQm Sat 01 Oct 2022 12:41:09 No.397853920,"what is the topic, then?\nYou're like the nigg...",:+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,No Quote,[397853706],"[what is the topic, then?]","[what is the topic, then?, You're like the nig...",0
19,:Ytn2j+6s Sat 01 Oct 2022 10:01:18 No.397834023,Lots of anons posting ITT about “making it” in...,:Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,No Quote,[397814529],[Lots of anons posting ITT about “making it” i...,[Lots of anons posting ITT about “making it” i...,15
31,:O8h7xH1H Sun 02 Oct 2022 16:31:37 No.39803282...,These are the three pillars of the US and west...,:O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,No Quote,[],[These are the three pillars of the US and wes...,[These are the three pillars of the US and wes...,0
33,:JsLtzO4W Sun 02 Oct 2022 11:32:05 No.397991514,"Haha, indeed. The DEI mind virus has infected ...",:JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,No Quote,[397984711],"[Haha, indeed., The DEI mind virus has infecte...","[Haha, indeed., The DEI mind virus has infecte...",0
...,...,...,...,...,...,...,...,...,...,...
70803,:kecsvdvI Wed 30 Oct 2024 22:14:55 No.48646562...,THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN ...,:kecsvdvI,Wed 30 Oct 2024 22:14:55,486465624,No Quote,[],[],[THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN...,0
70804,:EF5Mz2zI Wed 30 Oct 2024 22:02:24 No.48646490...,Here's my objective assessment of Stephen Mill...,:EF5Mz2zI,Wed 30 Oct 2024 22:02:24,486464907,No Quote,[],[Here's my objective assessment of Stephen Mil...,[Here's my objective assessment of Stephen Mil...,8
70805,:cUMz4T2R Wed 30 Oct 2024 21:55:11 No.48646451...,There are a bunch of reasons. There are many f...,:cUMz4T2R,Wed 30 Oct 2024 21:55:11,486464519,No Quote,[],"[There are a bunch of reasons., There are many...","[There are a bunch of reasons., There are many...",3
70806,:0egXPX4Z Wed 30 Oct 2024 21:34:59 No.48646320...,">But what does this mean?\nDEI, troons, wars, ...",:0egXPX4Z,Wed 30 Oct 2024 21:34:59,486463202,No Quote,[],"[>But what does this mean?, DEI, troons, wars,...","[>But what does this mean?, DEI, troons, wars,...",0


In [42]:
# refine mismatches later. let's see if we can see anything using this method

fdf = df[df['Mistmatch'] == 0]

In [45]:
def tokenize_list (listss): 
    list1 = [nltk.word_tokenize(m) for m in listss] # this gives us a list of tokenized lists
    def lower_case (m):
        return [s.lower() for s in m if s not in string.punctuation]
    list2 = [lower_case(m) for m in list1] 
    def fix_contractions(tokens):
        contraction_suffixes = ["'t", "'m", "'ve", "'ll", "'d", "'re", "'s", "n't"]
        fixed_tokens = []
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and tokens[i + 1] in contraction_suffixes:
                if tokens[i + 1] in ["'t", "n't"]:
                    fixed_tokens.append(tokens[i])  
                else:
                    fixed_tokens.append(tokens[i] + tokens[i + 1])
                i += 1
            else:
                fixed_tokens.append(tokens[i])
            i += 1
        return fixed_tokens
    list3 = [fix_contractions(m) for m in list2]
    return list3


In [48]:
fdf['Tokens'] = fdf['Sent'].apply(tokenize_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf['Tokens'] = fdf['Sent'].apply(tokenize_list)


In [60]:
#exploding the df to make each sentence its own entry 
# may need to see if there any trends in this ... 

fdf['Tokens_E'] = fdf['Tokens'].apply(lambda x: [m for m in x])
fdf_E = fdf.explode('Tokens_E').reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf['Tokens_E'] = fdf['Tokens'].apply(lambda x: [m for m in x])


In [96]:
da = fdf_E[fdf_E['Tokens_E'].apply(lambda x: "dei" in x)]
da

Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto,Text_Sent,Sent,Mistmatch,Tokens,Tokens_E
0,:RqQXr/xt Sat 01 Oct 2022 13:20:28 No.397859125,Tumblr girls were the nerdy outcasts who went ...,:RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,No Quote,[],[Tumblr girls were the nerdy outcasts who went...,[Tumblr girls were the nerdy outcasts who went...,0,"[[tumblr, girls, were, the, nerdy, outcasts, w...","[tumblr, girls, were, the, nerdy, outcasts, wh..."
6,:+XF1CsQm Sat 01 Oct 2022 12:41:09 No.397853920,"what is the topic, then?\nYou're like the nigg...",:+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,No Quote,[397853706],"[what is the topic, then?]","[what is the topic, then?, You're like the nig...",0,"[[what, is, the, topic, then], [you're, like, ...","[just, robbing, and, dispossesing, innocent, c..."
13,:JsLtzO4W Sun 02 Oct 2022 11:32:05 No.397991514,"Haha, indeed. The DEI mind virus has infected ...",:JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,No Quote,[397984711],"[Haha, indeed., The DEI mind virus has infecte...","[Haha, indeed., The DEI mind virus has infecte...",0,"[[haha, indeed], [the, dei, mind, virus, has, ...","[the, dei, mind, virus, has, infected, every, ..."
18,:H47Qry4L Sun 02 Oct 2022 10:40:47 No.397984711,"This is correct. Like anywhere else, the US ar...",:H47Qry4L,Sun 02 Oct 2022 10:40:47,397984711,No Quote,[],"[This is correct., Like anywhere else, the US ...","[This is correct., Like anywhere else, the US ...",0,"[[this, is, correct], [like, anywhere, else, t...","[our, job, is, to, just, be, filler, meat, so,..."
20,:9ILswUs7 Sun 02 Oct 2022 08:20:57 No.397967825,is he Opus dei? Our president is as well and g...,:9ILswUs7,Sun 02 Oct 2022 08:20:57,397967825,No Quote,[397967177],"[is he Opus dei?, Our president is as well and...","[is he Opus dei?, Our president is as well and...",0,"[[is, he, opus, dei], [our, president, is, as,...","[is, he, opus, dei]"
...,...,...,...,...,...,...,...,...,...,...,...,...
84529,:ThnMw6f1 Wed 30 Oct 2024 23:47:00 No.48647052...,The US army is dead because no one is going to...,:ThnMw6f1,Wed 30 Oct 2024 23:47:00,486470524,No Quote,[],[The US army is dead because no one is going t...,[The US army is dead because no one is going t...,0,"[[the, us, army, is, dead, because, no, one, i...","[milley, and, the, dei, minions, basically, se..."
84534,:jFsm+bis Wed 30 Oct 2024 22:54:11 No.48646772...,>We’ve mapped them for the first time and foun...,:jFsm+bis,Wed 30 Oct 2024 22:54:11,486467727,No Quote,[],[>We’ve mapped them for the first time and fou...,[>We’ve mapped them for the first time and fou...,0,"[[we, ’, ve, mapped, them, for, the, first, ti...","[he's, in, charge, of, dei, hiring, in, the, w..."
84537,:xjZZ6riN Wed 30 Oct 2024 22:51:37 No.48646758...,Lol. Kamala is a DEI hire.\n\nPost\nReport,:xjZZ6riN,Wed 30 Oct 2024 22:51:37,486467580,No Quote,[],"[Lol., Kamala is a DEI hire.]","[Lol., Kamala is a DEI hire., Post\nReport]",0,"[[lol], [kamala, is, a, dei, hire], [post, rep...","[kamala, is, a, dei, hire]"
84539,:kecsvdvI Wed 30 Oct 2024 22:14:55 No.48646562...,THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN ...,:kecsvdvI,Wed 30 Oct 2024 22:14:55,486465624,No Quote,[],[],[THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN...,0,"[[the, can, barely, gaslight, this, within, th...","[the, can, barely, gaslight, this, within, the..."


In [None]:
## https://www.ling.upenn.edu/~dringe/CorpStuff/Thesis/IntroSyntax.html maybe try a naive method, using POS tagging

In [103]:
da.to_csv("nov11.csv")

In [68]:
def capture_tokens(df, word, before, after):
    mentions = [
        (word, row['date'], 
         row['Tokens_E'][max(i - before, 0):i],  
         row['Tokens_E'][i + 1:i + 1 + after],  
         len(row['Tokens_E']))  
        for _, row in df.iterrows()  
        for i, token in enumerate(row['Tokens_E'])  
        if token == word  # Match the specific word
    ]
    return mentions

In [97]:
unigram = capture_tokens(da, 'dei', 1, 1)

In [98]:
# get the dictionaries for before and after 
# this should be integrated with the above function later
def abdict (m): 
    unilist_list = [x[m] for x in unigram]
    uniflat = [x for l in unilist_list for x in l] #flatten the list poggers
    unidict = dict(Counter(uniflat))
    unidict1 = {k:v for k,v in unidict.items() if v >1} # change v depending on what we want to exclude 
    sortubdict = dict(sorted(unidict1.items(), key=lambda item: item[1], reverse = True))
    return sortubdict 

In [99]:
candidate_words = list(abdict(3).keys()) 

In [100]:
candidate_words = list(set(candidate_words))

In [101]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
target_word = "dei"

inputs_target = tokenizer(target_word, return_tensors = 'pt')
inputs_candidates = tokenizer(candidate_words, padding = True, truncation = True, return_tensors = 'pt')

In [102]:
with torch.no_grad():
    target_embedding = model (**inputs_target).last_hidden_state.mean(dim = 1).squeeze().numpy()
    candidate_embeddings = model (**inputs_candidates).last_hidden_state.mean(dim= 1).numpy()

In [None]:
similarities = cosine_similarity([target_embedding], candidate_embeddings)

In [None]:
for idx, similarity in enumerate (similarities[0]): 
    print (f"Similarity with '{candidate_words[idx]}': {similarity:.4f}")

In [None]:
#lets test the hypothesis - lets see if theres any trend with 'hire' and 'hires' ... so NOUN NOUN 
# is 'dei hire' a bigram or is 'hire' interchangeable 
# which words are (significantly) semantically similar to 'hire' 
# do the above via cosine similarity (i think this already exists) 
# if we find 'rules' for how to use a word, how predictable are they?

In [89]:
from sentence_transformers import SentenceTransformer 
import numpy as np 

model = SentenceTransformer('all-MiniLM-L6-v2')

In [92]:
target_sequence = ['dei', 'hire']
target_df = fdf_E[fdf_E['Tokens_E'].apply(lambda x: isinstance(x, list) and target_sequence in [x[i:i+2] for i in range(len(x)-1)])]

sentences_hire = [m for m in target_df['Tokens_E']]
allsentences = [m for m in fdf_E['Tokens']]

sh = [' '.join(s) for s in sentences_hire]
ah = [' '.join(s) for m in allsentences for s in m]

embeddings_dei_hire = model.encode(["dei hire"])[0]
embeddings_dei = model.encode(ah)

In [93]:
similarities = cosine_similarity([embeddings_dei_hire], embeddings_dei)

In [None]:
for idx, similarity in enumerate (similarities[0]):
    print (f"similarity with sentence {idx+1}: {similarity:.4f} - {ah[idx]}")

In [None]:
def oneword_datecheck(word):
    return [m for m in unigram if word in m[3]]

#oneword_datecheck("hire")

In [73]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [77]:
target_sequence = ['dei', 'hire']
target_df = fdf_E[fdf_E['Tokens_E'].apply(lambda x: isinstance(x, list) and target_sequence in [x[i:i+2] for i in range(len(x)-1)])]


In [75]:
def get_all_word_embeddings(tokens):
    sentence = " ".join(tokens)
    inputs = tokenizer(sentence, return_tensors="pt", is_split_into_words=False)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # get bert embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # get embeddings for each word
    embeddings = outputs.last_hidden_state[0]
    bert_tokens = tokenizer.tokenize(sentence)

    # map tokens to embeddings
    word_embeddings = {}
    for i, token in enumerate(bert_tokens):
        word_embeddings[token] = embeddings[i]

    return word_embeddings

In [79]:
sentences_hire = [m for m in target_df['Tokens_E']]

In [81]:
all_embeddings = []

In [82]:
for tokens in sentences_hire:
    embeddings = get_all_word_embeddings(tokens)
    all_embeddings.append(embeddings)

In [84]:
from sklearn.cluster import KMeans
import numpy as np

dei_embeddings = []
other_embeddings = []

for em in all_embeddings: 
    for word, em in em.items():
        if word == "hire": 
            dei_embeddings.append(em.numpy())
        else: 
            other_embeddings.append((word, em.numpy())) 

In [85]:
dei_embeddings = np.array(dei_embeddings)
all_embeddings = np.array([emb for _, emb in other_embeddings])

In [87]:
kmeans = KMeans (n_clusters = 5)
kmeans.fit(all_embeddings) 
 
clusters = kmeans.predict(all_embeddings)
clustered_words = {}
for (word, emb), cluster in zip(other_embeddings, clusters):
    if cluster not in clustered_words: 
        clustered_words[cluster] = []
    clustered_words[cluster].append(word)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

avg_word_embedding = np.mean(dei_embeddings, axis = 0).reshape(1, -1)

similar_clusters = {}
for cluster_id, words in clustered_words.items():
    cluster_center = kmeans.cluster_centers_[cluster_id].reshape(1,-1)
    similiarity_score = cosine_similarity(avg_word_embedding, cluster_center).item()

    if similiarity_score > 0.7: 
        similar_clusters[cluster_id] = (words, similiarity_score)


print ("words in similar semantic space as 'hire'")
for cluster_id, (words, score) in similar_clusters.items(): 
    print (f"cluster {cluster_id} (similarity: {score:.4f}): {words}")

words in similar semantic space as 'hire
cluster 3 (similarity: 0.7408): ['flip', '##s', 'with', 'experience', 'ca', 'fix', 'of', 'every', 'report', 'fake', 'or', '##fi', 'this', 'was', "'", 'new', 'dei', 'a', 'everyone', '120', 'iq', 'beyond', 'they', 'for', 'his', 'black', 'cause', 'i', 'societies', 'are', 'that', '##e', 'even', "'", 'down', 'on', 'the', 'dei', 'mandates', 'and', 'seminars', 'to', 'your', 'you', 'dei', 'would', 'sox', 'and', 'they', 'and', 'back', 'post', 'report', '-', 'dei', 'government', 'ni', '##s', 'this', 'one', '/', 'other', 'dei', 'who', 'to', '9', '-', 'so', 'on', 'dei', 'trying', 'ideas', 'for', 'a', '##point', 'those', 'that', 'will', 'ca', 'dei', 'post', 'report', 'top', 'already', 'suspect', 'for', 'quality', 'issues', 'report', 'of', 'dei', 'that', 'with', 'before', 'they', 'drown', 'dei', '##6', 'pilot', 'post', 'report', 'for', 'every', '##g', 'and', 'dei', 'there', 'subject', 'matter', 'news', '##om', 'appoint', '##s', 'dei', 'she', 'the', '##ment', 