In [1]:
import pandas as pd
from pathlib import Path 
import re
from collections import Counter
import nltk 
import string 
nltk.download('punkt')
from statistics import median
from statistics import mean
from lingua import Language, LanguageDetectorBuilder
import spacy
nlp = spacy.load("en_core_web_sm")
import torch
from transformers import BertTokenizer, BertModel

# need to downgrade numpy to before 2.0 
# on windows, need to enable long paths : https://www.microfocus.com/documentation/filr/filr-4/filr-desktop/t47bx2ogpfz7.html 
# also need to do through REGEDIT on windows
## if using windows 10, add gpedit.msc this way: https://www.reddit.com/r/AnnoyingTech/comments/ojru3t/adding_gpeditmsc_on_your_windows_home/

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emzou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## read in the data (if downloading from github, concat the two parts)
#df = pd.read_csv("11_6_fulldataset.csv", index_col= 0)
df1 = pd.read_csv("11_6_fulldatapart1.csv")
df2 = pd.read_csv("11_6_fulldatapart2.csv")
df= pd.concat([df1, df2], ignore_index = True)  

In [3]:
### cleaning, processing, tagging
## categorizing quoted by 
def process_quotes(s):
    if "Quoted By" in s: 
        return re.findall(r'>>(\d+)\n', s)
    else:
        modified_string = s  # no modification needed if "Quoted By" is not present
        return "No Quote"
df['quotedby'] = df['Identifier'].apply(process_quotes)
## removing it from the text 
def stripper (s): 
    if 'Quoted By' in s:
        cleaned_string = re.sub(r'Quoted By:|>>\d+\n', '', s)
        return cleaned_string.strip()
    else: 
        return s
df ['Text'] = df['Text'].apply(stripper)
## getting the reply-to out 
df['replyto'] = df['Text'].apply(lambda text: re.findall(r'>>(\d+)', text))
df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())
# strip website links from the text
# it means 'image of god' in latin 
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())
# strip 'imago dei' comments from the text
df = df[~df['Text'].str.contains('imago', case=False, na=False)]
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]

In [4]:
# LATIN EXTERMINATION!!! 
# lingua-py (https://github.com/pemistahl/lingua-py)
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

def latin_exterminator(s):
    confidence_value = detector.compute_language_confidence(s, Language.LATIN)
    cv = float(f"{confidence_value:.2f}") 
    if cv >= 0.5:
        return None
    else: 
        return s

#use the latin exterminator
df['Text'] = df['Text'].apply(latin_exterminator)
df = df[df['Text'].notnull()]

# drop duplicates by anon-id (this only refers to the post, not the account)
df = df.drop_duplicates(subset = 'anonid', keep = 'last')

In [5]:
## sentence detection => still work in progress 
### regex/ naive method: 
def sentsplit (text):
    pattern = r'[^.!?]*[.!?]'
    sentences = [sentence.strip() for sentence in re.findall(pattern, text)]
    return sentences 
df['Text_Sent'] = df['Text'].apply(sentsplit)
### spacy method: 
nlp = spacy.load("en_core_web_sm", disable = ["ner", "tagger"])
def detect_sentences_spacy_pipe(text): 
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]
df ['Sent'] = df['Text'].apply(detect_sentences_spacy_pipe)
### validate across both methods
def you_a_mismatch(row):
    mismatches = sum(1 for m, s in zip(row['Text_Sent'], row['Sent']) if m != s)
    mismatches += abs(len(row['Text_Sent']) - len(row['Text_Sent']))
    return mismatches 
df['Mistmatch'] = df.apply(lambda r: you_a_mismatch(r), axis = 1)



In [6]:
# refine mismatches later. let's see if we can see anything using this method 
fdf = df[df['Mistmatch'] == 0]
# this gives us 20k results (a third of the data needs to be parsed more carefully)

In [7]:
# tokenize 
def tokenize_list (listss): 
    list1 = [nltk.word_tokenize(m) for m in listss] # this gives us a list of tokenized lists
    def lower_case (m):
        return [s.lower() for s in m if s not in string.punctuation]
    list2 = [lower_case(m) for m in list1] 
    def fix_contractions(tokens):
        contraction_suffixes = ["'t", "'m", "'ve", "'ll", "'d", "'re", "'s", "n't"]
        fixed_tokens = []
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and tokens[i + 1] in contraction_suffixes:
                if tokens[i + 1] in ["'t", "n't"]:
                    fixed_tokens.append(tokens[i])  
                else:
                    fixed_tokens.append(tokens[i] + tokens[i + 1])
                i += 1
            else:
                fixed_tokens.append(tokens[i])
            i += 1
        return fixed_tokens
    list3 = [fix_contractions(m) for m in list2]
    return list3
fdf['Tokens'] = fdf['Sent'].apply(tokenize_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf['Tokens'] = fdf['Sent'].apply(tokenize_list)


In [8]:
#exploding the df to make each sentence its own entry 
# may need to see if there any trends in this ... 

fdf['Tokens_E'] = fdf['Tokens'].apply(lambda x: [m for m in x])
fdf_E = fdf.explode('Tokens_E').reset_index(drop = True)
da = fdf_E[fdf_E['Tokens_E'].apply(lambda x: "dei" in x)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf['Tokens_E'] = fdf['Tokens'].apply(lambda x: [m for m in x])


In [9]:
# lets see what words are immediately before and after dei... possibly identify common sentence patterns and bigrams
def capture_tokens(df, word, before, after):
    mentions = [
        (word, row['date'], 
         row['Tokens_E'][max(i - before, 0):i],  
         row['Tokens_E'][i + 1:i + 1 + after],  
         len(row['Tokens_E']))  
        for _, row in df.iterrows()  
        for i, token in enumerate(row['Tokens_E'])  
        if token == word  # Match the specific word
    ]
    return mentions

In [10]:
unigram = capture_tokens(da, 'dei', 1, 1)

In [12]:
unigram[:30]

[('dei', 'Sat 01 Oct 2022 13:20:28', ['and'], ['roles'], 20),
 ('dei', 'Sat 01 Oct 2022 12:41:09', ['violence'], ['good'], 31),
 ('dei', 'Sun 02 Oct 2022 11:32:05', ['the'], ['mind'], 11),
 ('dei', 'Sun 02 Oct 2022 10:40:47', ['for'], [], 32),
 ('dei', 'Sun 02 Oct 2022 08:20:57', ['opus'], [], 4),
 ('dei', 'Mon 03 Oct 2022 19:10:43', ['a'], ['asset'], 25),
 ('dei', 'Mon 03 Oct 2022 15:49:50', [], ['is'], 7),
 ('dei', 'Mon 03 Oct 2022 14:58:39', ['``'], ["''"], 22),
 ('dei', 'Mon 03 Oct 2022 12:20:27', ['providing'], ['training'], 26),
 ('dei', 'Mon 03 Oct 2022 12:08:39', ['se'], ['bem'], 44),
 ('dei', 'Mon 03 Oct 2022 08:06:01', ['entire'], ['industry'], 17),
 ('dei', 'Mon 03 Oct 2022 00:04:06', ['of'], ['boxcheckers'], 23),
 ('dei', 'Tue 04 Oct 2022 16:44:37', ['of'], ['at'], 25),
 ('dei',
  'Tue 04 Oct 2022 16:44:37',
  ['sameocrgoogleiqdbsaucenaotrace'],
  ['185kib'],
  16),
 ('dei', 'Tue 04 Oct 2022 14:55:46', ['female'], ['officer'], 22),
 ('dei', 'Tue 04 Oct 2022 13:23:34', ['fem

In [13]:
# get the dictionaries for before and after 
# this should be integrated with the above function later
def abdict (m): 
    unilist_list = [x[m] for x in unigram]
    uniflat = [x for l in unilist_list for x in l] #flatten the list poggers
    unidict = dict(Counter(uniflat))
    unidict1 = {k:v for k,v in unidict.items() if v >1} # change v depending on what we want to exclude 
    sortubdict = dict(sorted(unidict1.items(), key=lambda item: item[1], reverse = True))
    return sortubdict 

In [None]:
#after words
abdict(3)

{'and': 1404,
 'hires': 1310,
 'hire': 1140,
 'is': 1102,
 'post': 533,
 'shit': 502,
 'bullshit': 377,
 'policies': 297,
 'in': 265,
 'initiatives': 252,
 'hiring': 246,
 'niggers': 201,
 'has': 183,
 'to': 171,
 'or': 157,
 'nigger': 141,
 'view': 139,
 'was': 129,
 "''": 127,
 'esg': 122,
 'training': 121,
 'for': 113,
 'programs': 112,
 'diversity': 104,
 'will': 99,
 'quotas': 97,
 'stuff': 90,
 'requirements': 89,
 'candidate': 78,
 'are': 76,
 'nonsense': 75,
 'but': 74,
 'so': 72,
 'policy': 71,
 'money': 71,
 'at': 70,
 'the': 70,
 'retards': 70,
 'you': 69,
 'as': 68,
 'points': 66,
 'woke': 64,
 'on': 63,
 'jobs': 62,
 'people': 60,
 'they': 59,
 'departments': 58,
 'officer': 56,
 'agenda': 56,
 'garbage': 56,
 'affirmative': 53,
 'because': 52,
 'means': 52,
 'team': 51,
 'department': 50,
 'i': 50,
 'crap': 49,
 'job': 48,
 'hired': 48,
 'that': 47,
 'vp': 47,
 'strikes': 47,
 'does': 46,
 'faggots': 46,
 'which': 46,
 'now': 46,
 'did': 46,
 'military': 42,
 'boss': 42,


In [None]:
#before words 
abdict(2)

{'the': 2032,
 'a': 1199,
 'of': 1154,
 'and': 1119,
 'with': 591,
 'to': 541,
 'for': 443,
 'their': 365,
 'by': 344,
 'that': 238,
 'about': 237,
 'on': 225,
 'is': 212,
 'opus': 202,
 'some': 200,
 'this': 190,
 'more': 153,
 'like': 146,
 '``': 145,
 'in': 140,
 'have': 139,
 'esg': 137,
 'your': 136,
 'are': 130,
 'or': 127,
 'all': 123,
 'from': 118,
 'no': 108,
 'woke': 98,
 'these': 98,
 'against': 96,
 'pushing': 95,
 'because': 87,
 'as': 83,
 'not': 81,
 'fucking': 77,
 'nigger': 66,
 'our': 65,
 'but': 64,
 'push': 62,
 'those': 62,
 'incompetent': 62,
 "it's": 61,
 'what': 60,
 'new': 60,
 'retarded': 59,
 'you': 57,
 'other': 55,
 'be': 54,
 'it': 52,
 'through': 52,
 'before': 50,
 'how': 48,
 'muh': 48,
 'another': 47,
 'just': 47,
 'into': 46,
 'his': 45,
 'get': 45,
 'has': 44,
 'think': 43,
 'without': 43,
 'vox': 42,
 'when': 42,
 'called': 42,
 'why': 40,
 'its': 40,
 'inclusion': 39,
 'than': 38,
 'support': 38,
 'forced': 38,
 'her': 38,
 'do': 37,
 'download': 3

In [17]:
candidate_words = list(abdict(3).keys()) 
### the current thing i'm doing is testing contextual embeddings using BERT
### unfortunately it keeps breaking, but i'm still trying 
## meanwhile: we can do this more manually by using part of speech tagging, as well as using common sentence structures 
## ex: NOUN <is> [DEFINITE ARTICLE] <dei> 