In [1]:
import pandas as pd
from pathlib import Path 
import re
from collections import Counter
import nltk 
import string 
nltk.download('punkt')
from statistics import median
from statistics import mean
from lingua import Language, LanguageDetectorBuilder
import spacy
nlp = spacy.load("en_core_web_sm")
import torch
from transformers import BertTokenizer, BertModel

[nltk_data] Downloading package punkt to /Users/emilyzou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("11_6_fulldataset.csv", index_col= 0)

In [3]:
## categorizing quoted by 
def process_quotes(s):
    if "Quoted By" in s: 
        return re.findall(r'>>(\d+)\n', s)
    else:
        modified_string = s  # no modification needed if "Quoted By" is not present
        return "No Quote"

df['quotedby'] = df['Identifier'].apply(process_quotes)

## removing it from the text 

def stripper (s): 
    if 'Quoted By' in s:
        cleaned_string = re.sub(r'Quoted By:|>>\d+\n', '', s)
        return cleaned_string.strip()
    else: 
        return s

df ['Text'] = df['Text'].apply(stripper)

In [4]:
## getting the reply-to out 

df['replyto'] = df['Text'].apply(lambda text: re.findall(r'>>(\d+)', text))

In [5]:
df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())

In [6]:
# strip website links from the text
# it means 'image of god' in latin 
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())


In [7]:
# strip 'imago dei' comments from the text
df = df[~df['Text'].str.contains('imago', case=False, na=False)]

In [8]:
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]

In [9]:
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]

In [10]:
# there's so much more latin than i thought 
# lingua-py (https://github.com/pemistahl/lingua-py)
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [11]:
def latin_exterminator(s):
    confidence_value = detector.compute_language_confidence(s, Language.LATIN)
    cv = float(f"{confidence_value:.2f}") 
    if cv >= 0.5:
        return None
    else: 
        return s

In [12]:
#use the latin exterminator
df['Text'] = df['Text'].apply(latin_exterminator)
df = df[df['Text'].notnull()]

In [14]:
# drop duplicates by anon-id (this only refers to the post, not the account)
df = df.drop_duplicates(subset = 'anonid', keep = 'last')
df

Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto
16,:RqQXr/xt Sat 01 Oct 2022 13:20:28 No.397859125,Tumblr girls were the nerdy outcasts who went ...,:RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,No Quote,[]
17,:+XF1CsQm Sat 01 Oct 2022 12:41:09 No.397853920,"what is the topic, then?\nYou're like the nigg...",:+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,No Quote,[397853706]
19,:Ytn2j+6s Sat 01 Oct 2022 10:01:18 No.397834023,Lots of anons posting ITT about “making it” in...,:Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,No Quote,[397814529]
31,:O8h7xH1H Sun 02 Oct 2022 16:31:37 No.39803282...,These are the three pillars of the US and west...,:O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,No Quote,[]
33,:JsLtzO4W Sun 02 Oct 2022 11:32:05 No.397991514,"Haha, indeed. The DEI mind virus has infected ...",:JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,No Quote,[397984711]
...,...,...,...,...,...,...,...
70803,:kecsvdvI Wed 30 Oct 2024 22:14:55 No.48646562...,THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN ...,:kecsvdvI,Wed 30 Oct 2024 22:14:55,486465624,No Quote,[]
70804,:EF5Mz2zI Wed 30 Oct 2024 22:02:24 No.48646490...,Here's my objective assessment of Stephen Mill...,:EF5Mz2zI,Wed 30 Oct 2024 22:02:24,486464907,No Quote,[]
70805,:cUMz4T2R Wed 30 Oct 2024 21:55:11 No.48646451...,There are a bunch of reasons. There are many f...,:cUMz4T2R,Wed 30 Oct 2024 21:55:11,486464519,No Quote,[]
70806,:0egXPX4Z Wed 30 Oct 2024 21:34:59 No.48646320...,">But what does this mean?\nDEI, troons, wars, ...",:0egXPX4Z,Wed 30 Oct 2024 21:34:59,486463202,No Quote,[]


In [16]:
# sentence detection based on punctuation
# convert string into list of strings, separated by sentences

def sentsplit (text):
    pattern = r'[^.!?]*[.!?]'
    sentences = [sentence.strip() for sentence in re.findall(pattern, text)]
    return sentences 

df['Text_Sent'] = df['Text'].apply(sentsplit)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Text_Sent'] = df['Text'].apply(sentsplit)


Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto,Text_Sent
16,:RqQXr/xt Sat 01 Oct 2022 13:20:28 No.397859125,Tumblr girls were the nerdy outcasts who went ...,:RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,No Quote,[],[Tumblr girls were the nerdy outcasts who went...
17,:+XF1CsQm Sat 01 Oct 2022 12:41:09 No.397853920,"what is the topic, then?\nYou're like the nigg...",:+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,No Quote,[397853706],"[what is the topic, then?]"
19,:Ytn2j+6s Sat 01 Oct 2022 10:01:18 No.397834023,Lots of anons posting ITT about “making it” in...,:Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,No Quote,[397814529],[Lots of anons posting ITT about “making it” i...
31,:O8h7xH1H Sun 02 Oct 2022 16:31:37 No.39803282...,These are the three pillars of the US and west...,:O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,No Quote,[],[These are the three pillars of the US and wes...
33,:JsLtzO4W Sun 02 Oct 2022 11:32:05 No.397991514,"Haha, indeed. The DEI mind virus has infected ...",:JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,No Quote,[397984711],"[Haha, indeed., The DEI mind virus has infecte..."
...,...,...,...,...,...,...,...,...
70803,:kecsvdvI Wed 30 Oct 2024 22:14:55 No.48646562...,THE CAN BARELY GASLIGHT THIS WITHIN THEIR OWN ...,:kecsvdvI,Wed 30 Oct 2024 22:14:55,486465624,No Quote,[],[]
70804,:EF5Mz2zI Wed 30 Oct 2024 22:02:24 No.48646490...,Here's my objective assessment of Stephen Mill...,:EF5Mz2zI,Wed 30 Oct 2024 22:02:24,486464907,No Quote,[],[Here's my objective assessment of Stephen Mil...
70805,:cUMz4T2R Wed 30 Oct 2024 21:55:11 No.48646451...,There are a bunch of reasons. There are many f...,:cUMz4T2R,Wed 30 Oct 2024 21:55:11,486464519,No Quote,[],"[There are a bunch of reasons., There are many..."
70806,:0egXPX4Z Wed 30 Oct 2024 21:34:59 No.48646320...,">But what does this mean?\nDEI, troons, wars, ...",:0egXPX4Z,Wed 30 Oct 2024 21:34:59,486463202,No Quote,[],"[>But what does this mean?, DEI, troons, wars,..."


In [18]:
# sentence detection validation with spacy... ugh, this is too slow
def detect_sentences_spacy(text):
    sentences = []
    for doc in nlp.pipe(text, disable=["ner", "parser"]):  # go dude go
        sentences.append([sent.text.strip() for sent in doc.sents])
    return sentences

df['Sent'] = df['Text'].apply(detect_sentences_spacy)

KeyboardInterrupt: 

In [None]:
# get the differences between manual and spacy sentence split 
def you_a_mismatch(row):
    mismatches = sum(1 for m, s in zip(row['Text_Sent'], row['Sent']) if m != s)
    mismatches += abs(len(row['Text_Sent']) - len(row['Text_Sent']))
    return mismatches 

In [None]:
# filter for len > 512 
# identify 'dei' position 
# trim non-dei sentences

In [86]:
#tokenizing 
df['tokens'] = df['Text'].apply(nltk.word_tokenize)


def cleanlower (m): 
    return [s.lower() for s in m if s not in string.punctuation]

df['tokenslower'] = df['tokens'].apply(cleanlower)

def fix_contractions(tokens):
    contraction_suffixes = ["'t", "'m", "'ve", "'ll", "'d", "'re", "'s", "n't"]
    fixed_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and tokens[i + 1] in contraction_suffixes:
            if tokens[i + 1] in ["'t", "n't"]:
                fixed_tokens.append(tokens[i])  
            else:
                fixed_tokens.append(tokens[i] + tokens[i + 1])
            i += 1
        else:
            fixed_tokens.append(tokens[i])
        i += 1

    return fixed_tokens

df['Tokens'] = df['tokenslower'].apply(fix_contractions) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['Text'].apply(nltk.word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenslower'] = df['tokens'].apply(cleanlower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tokens'] = df['tokenslower'].apply(fix_contractions)


In [90]:
def capture_tokens(df, word, before, after):
    mentions = [
        (word, row['date'], 
         row['Tokens'][max(i - before, 0):i],  
         row['Tokens'][i + 1:i + 1 + after],  
         len(row['Tokens']))  
        for _, row in df.iterrows()  
        for i, token in enumerate(row['Tokens'])  
        if token == word  # Match the specific word
    ]
    return mentions

In [92]:
unigram = capture_tokens(df, 'dei', 1, 1)

In [114]:
# get the dictionaries for before and after 
# this should be integrated with the above function later
def abdict (m): 
    unilist_list = [x[m] for x in unigram]
    uniflat = [x for l in unilist_list for x in l] #flatten the list poggers
    unidict = dict(Counter(uniflat))
    unidict1 = {k:v for k,v in unidict.items() if v >1} # change v depending on what we want to exclude 
    sortubdict = dict(sorted(unidict1.items(), key=lambda item: item[1], reverse = True))
    return sortubdict 

In [None]:
abdict(3)

In [None]:
#lets test the hypothesis - lets see if theres any trend with 'hire' and 'hires' ... so NOUN NOUN 
# is 'dei hire' a bigram or is 'hire' interchangeable 
# which words are (significantly) semantically similar to 'hire' 
# do the above via cosine similarity (i think this already exists) #this is literally BERT lmao
# if we find 'rules' for how to use a word, how predictable are they?

In [None]:
def oneword_datecheck(word):
    return [m for m in unigram if word in m[3]]

oneword_datecheck("hire")

In [126]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [128]:
target_sequence = ['dei', 'hire']
target_df = df[df['Tokens'].apply(lambda x: isinstance(x, list) and target_sequence in [x[i:i+2] for i in range(len(x)-1)])]


In [129]:
def get_all_word_embeddings(tokens):
    sentence = " ".join(tokens)
    inputs = tokenizer(sentence, return_tensors="pt", is_split_into_words=False)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # get bert embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # get embeddings for each word
    embeddings = outputs.last_hidden_state[0]
    bert_tokens = tokenizer.tokenize(sentence)

    # map tokens to embeddings
    word_embeddings = {}
    for i, token in enumerate(bert_tokens):
        word_embeddings[token] = embeddings[i]

    return word_embeddings

In [130]:
target_df

Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto,tokens,tokenslower,Tokens,CV
1231,:J2jVZ6MH Thu 22 Dec 2022 16:03:58 No.40932578...,>patty flipper malfunctions\n>DEI hire with no...,:J2jVZ6MH,Thu 22 Dec 2022 16:03:58,409325789,No Quote,[],"[>, patty, flipper, malfunctions, >, DEI, hire...","[patty, flipper, malfunctions, dei, hire, with...","[patty, flipper, malfunctions, dei, hire, with...",0.00
1617,:325H1wYP Sun 22 Jan 2023 08:49:08 No.41318015...,he's a mi6 puke using a cia provided proxy for...,:325H1wYP,Sun 22 Jan 2023 08:49:08,413180151,No Quote,[],"[he, 's, a, mi6, puke, using, a, cia, provided...","[he, 's, a, mi6, puke, using, a, cia, provided...","[he's, a, mi6, puke, using, a, cia, provided, ...",0.00
1715,:oJL5epR6 Wed 25 Jan 2023 17:13:59 No.41363654...,seems fake or he's DEI hire or pfizer has phis...,:oJL5epR6,Wed 25 Jan 2023 17:13:59,413636549,No Quote,[],"[seems, fake, or, he, 's, DEI, hire, or, pfize...","[seems, fake, or, he, 's, dei, hire, or, pfize...","[seems, fake, or, he's, dei, hire, or, pfizer,...",0.01
1774,:TtfG5wdH Thu 26 Jan 2023 09:15:07 No.41371597...,"Checked, this was Blizzard's new DEI hire a fe...",:TtfG5wdH,Thu 26 Jan 2023 09:15:07,413715974,No Quote,[],"[Checked, ,, this, was, Blizzard, 's, new, DEI...","[checked, this, was, blizzard, 's, new, dei, h...","[checked, this, was, blizzard's, new, dei, hir...",0.01
1808,:edSUBszY Fri 27 Jan 2023 07:17:39 No.41383307...,all the traders know everything is fake - ever...,:edSUBszY,Fri 27 Jan 2023 07:17:39,413833079,No Quote,[],"[all, the, traders, know, everything, is, fake...","[all, the, traders, know, everything, is, fake...","[all, the, traders, know, everything, is, fake...",0.00
...,...,...,...,...,...,...,...,...,...,...,...
70771,:e+yxuGgu Thu 31 Oct 2024 09:46:56 No.48650579...,Shill me why I should vote for your preferred ...,:e+yxuGgu,Thu 31 Oct 2024 09:46:56,486505795,No Quote,[],"[Shill, me, why, I, should, vote, for, your, p...","[shill, me, why, i, should, vote, for, your, p...","[shill, me, why, i, should, vote, for, your, p...",0.00
70781,:wxUiGawG Thu 31 Oct 2024 07:58:41 No.48649757...,VPN hohol detected\nYou've got a point. Nobody...,:wxUiGawG,Thu 31 Oct 2024 07:58:41,486497574,No Quote,[],"[VPN, hohol, detected, You, 've, got, a, point...","[vpn, hohol, detected, you, 've, got, a, point...","[vpn, hohol, detected, you've, got, a, point, ...",0.00
70784,:OzpwjE85 Thu 31 Oct 2024 07:21:47 No.48649442...,"She's a DEI hire, so no.\n\nPost\nReport",:OzpwjE85,Thu 31 Oct 2024 07:21:47,486494427,No Quote,[],"[She, 's, a, DEI, hire, ,, so, no, ., Post, Re...","[she, 's, a, dei, hire, so, no, post, report]","[she's, a, dei, hire, so, no, post, report]",0.29
70795,:VM6RuOIN Thu 31 Oct 2024 02:42:22 No.48647845...,Being a low IQ DEI hire with an active vocabul...,:VM6RuOIN,Thu 31 Oct 2024 02:42:22,486478451,No Quote,[],"[Being, a, low, IQ, DEI, hire, with, an, activ...","[being, a, low, iq, dei, hire, with, an, activ...","[being, a, low, iq, dei, hire, with, an, activ...",0.00


In [131]:
sentences_hire = [m for m in target_df['Tokens']]

In [133]:
all_embeddings = []

In [134]:
for tokens in sentences_hire:
    embeddings = get_all_word_embeddings(tokens)
    all_embeddings.append(embeddings)

Token indices sequence length is longer than the specified maximum sequence length for this model (939 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The expanded size of the tensor (939) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 939].  Tensor sizes: [1, 512]