In [2]:
import pandas as pd
from pathlib import Path 
import re
from collections import Counter
import nltk 
import string 
nltk.download('punkt')
from statistics import median
from statistics import mean
from lingua import Language, LanguageDetectorBuilder
import spacy
nlp = spacy.load("en_core_web_sm")
from transformers import pipeline
from spacy.pipeline import Sentencizer
from nltk.tokenize import sent_tokenize

# need to downgrade numpy to before 2.0 
# on windows, need to enable long paths : https://www.microfocus.com/documentation/filr/filr-4/filr-desktop/t47bx2ogpfz7.html 
# also need to do through REGEDIT on windows
## if using windows 10, add gpedit.msc this way: https://www.reddit.com/r/AnnoyingTech/comments/ojru3t/adding_gpeditmsc_on_your_windows_home/

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emzou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [40]:
## read in the data (if downloading from github, concat the two parts)
#df = pd.read_csv("11_6_fulldataset.csv", index_col= 0)
df1 = pd.read_csv("11_6_fulldatapart1.csv")
df2 = pd.read_csv("11_6_fulldatapart2.csv")
df= pd.concat([df1, df2], ignore_index = True)  

In [41]:
df['Text'] = df['Text'].str.lower()

In [42]:
### cleaning, processing, tagging
## categorizing quoted by 
def process_quotes(s):
    if "quoted by:" in s: 
        return re.findall(r'>>(\d+)\n', s)
    else:
        modified_string = s  # no modification needed if "Quoted By" is not present
        return "No Quote"
df['quotedby'] = df['Identifier'].apply(process_quotes)
## removing it from the text 
def stripper (s): 
    if 'Quoted By' in s:
        cleaned_string = re.sub(r'Quoted By:|>>\d+\n', '', s)
        return cleaned_string.strip()
    else: 
        return s
df ['Text'] = df['Text'].apply(stripper)
## getting the reply-to out 
df['replyto'] = df['Text'].apply(lambda text: re.findall(r'>>(\d+)', text))
df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())
# strip website links from the text
# it means 'image of god' in latin 
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())
# strip 'imago dei' comments from the text
df = df[~df['Text'].str.contains('imago', case=False, na=False)]
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]
# strip Post Reply
postpattern = r'Post\nReply'
df['Text'] = df['Text'].apply(lambda text: re.sub(postpattern, '', text).strip())

In [43]:
# trying to get rid of this pattern for the millionth time 
metapattern = r'.{5}(sameocrgoogleiqdbsaucenaotrace).*'
df['Text'] = df['Text'].apply(lambda text: re.sub(metapattern, '', text ).strip())

In [None]:
# trying to get rid of this pattern for the millionth time 
metapattern2 = r'.{5}(samegoogleiqdbsaucenaotrace).*'
df['Text'] = df['Text'].apply(lambda text: re.sub(metapattern, '', text ).strip())

In [44]:
# LATIN EXTERMINATION!!! 
# lingua-py (https://github.com/pemistahl/lingua-py)
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

def latin_exterminator(s):
    confidence_value = detector.compute_language_confidence(s, Language.LATIN)
    cv = float(f"{confidence_value:.2f}") 
    if cv >= 0.5:
        return None
    else: 
        return s

#use the latin exterminator
df['Text'] = df['Text'].apply(latin_exterminator)
df = df[df['Text'].notnull()]

# drop duplicates by anon-id (this only refers to the post, not the account)
df = df.drop_duplicates(subset = 'anonid', keep = 'last')

In [45]:
## PAUSE: we're doing sentence boundary testing now using pretty sophisticated methods, which take a while. let's use a smaller dataset first to see if it works. 
df 

Unnamed: 0,Identifier,Text,anonid,date,number,quotedby,replyto
16,:RqQXr/xt Sat 01 Oct 2022 13:20:28 No.397859125,tumblr girls were the nerdy outcasts who went ...,:RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,No Quote,[]
17,:+XF1CsQm Sat 01 Oct 2022 12:41:09 No.397853920,"what is the topic, then?\nyou're like the nigg...",:+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,No Quote,[397853706]
19,:Ytn2j+6s Sat 01 Oct 2022 10:01:18 No.397834023,lots of anons posting itt about “making it” in...,:Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,No Quote,[397814529]
31,:O8h7xH1H Sun 02 Oct 2022 16:31:37 No.39803282...,these are the three pillars of the us and west...,:O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,No Quote,[]
33,:JsLtzO4W Sun 02 Oct 2022 11:32:05 No.397991514,"haha, indeed. the dei mind virus has infected ...",:JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,No Quote,[397984711]
...,...,...,...,...,...,...,...
70803,:kecsvdvI Wed 30 Oct 2024 22:14:55 No.48646562...,quoted by:\nthe can barely gaslight this withi...,:kecsvdvI,Wed 30 Oct 2024 22:14:55,486465624,No Quote,[486464882]
70804,:EF5Mz2zI Wed 30 Oct 2024 22:02:24 No.48646490...,quoted by:\nhere's my objective assessment of ...,:EF5Mz2zI,Wed 30 Oct 2024 22:02:24,486464907,No Quote,[486464363]
70805,:cUMz4T2R Wed 30 Oct 2024 21:55:11 No.48646451...,quoted by:\nthere are a bunch of reasons. ther...,:cUMz4T2R,Wed 30 Oct 2024 21:55:11,486464519,No Quote,"[486462545, 486517619, 486517578]"
70806,:0egXPX4Z Wed 30 Oct 2024 21:34:59 No.48646320...,"quoted by:\n>but what does this mean?\ndei, tr...",:0egXPX4Z,Wed 30 Oct 2024 21:34:59,486463202,No Quote,[486462545]


In [17]:
# DJ KHALED WE DA BEST 
df = pd.read_csv("nov12_dataset_full.csv")

In [18]:
df

Unnamed: 0.1,Unnamed: 0,Identifier,Text,id,Date,Thread No,Quoted By,Reply To
0,18,Anonymous ID:RqQXr/xt Sat 01 Oct 2022 13:20:28...,Tumblr girls were the nerdy outcasts who went ...,RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,[],[]
1,19,Anonymous ID:+XF1CsQm Sat 01 Oct 2022 12:41:09...,"what is the topic, then?\nYou're like the nigg...",+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,[],['>>397853706']
2,21,Anonymous ID:Ytn2j+6s Sat 01 Oct 2022 10:01:18...,Lots of anons posting ITT about “making it” in...,Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,[],['>>397814529']
3,22,StreamRift ID:JRUEuylR Sat 01 Oct 2022 00:06:0...,Hear me out:\n>Whitehouse hires fucktards to t...,JRUEuylR,Sat 01 Oct 2022 00:06:02,397778799,[],[]
4,34,Anonymous ID:O8h7xH1H Sun 02 Oct 2022 16:31:37...,These are the three pillars of the US and west...,O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,[],[]
...,...,...,...,...,...,...,...,...
35814,73917,Anonymous Wed 30 Oct 2024 09:41:56 No.48640702...,Quoted By:\nSerious question: what's the plan ...,No ID,Anonymous Wed 30 Oct 2024 09:41:56,486407029,"['Quoted By:\n>>486400819', 'Quoted By:\n>>486...","['>>486400819', '>>486400723', '>>486378202', ..."
35815,73919,Anonymous Thu 31 Oct 2024 06:00:13 No.48651772...,We need something like DEI detected for Anime....,No ID,Anonymous Thu 31 Oct 2024 06:00:13,486517727,['Quoted By:\n>>486517721'],['>>486517721']
35816,73920,Anonymous Thu 31 Oct 2024 05:27:43 No.48651772...,Quoted By:\nwhat pisses me off is that this wh...,No ID,Anonymous Thu 31 Oct 2024 05:27:43,486517720,"['Quoted By:\n>>486479887', 'Quoted By:\n>>486...","['>>486479887', '>>486484912', '>>486485110', ..."
35817,73921,Anonymous Wed 30 Oct 2024 22:16:47 No.48651761...,>DEI and Woke are fucking dead and buried alre...,No ID,Anonymous Wed 30 Oct 2024 22:16:47,486517619,"['Quoted By:\n>>486517614', 'Quoted By:\n>>486...","['>>486517614', '>>486464882', '>>486464363', ..."


In [19]:
### spacy method: 
nlp = spacy.load("en_core_web_sm", disable = ["ner", "tagger"])
def detect_sentences_spacy_pipe(text): 
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# Function to get sentence lengths using spaCy method
def spacy_sentence_lengths(text):
    sentences = detect_sentences_spacy_pipe(text)
    return [len(sentence) for sentence in sentences]

df['Spacy_Sentences'] = df['Text'].apply(detect_sentences_spacy_pipe)
df['Spacy_Sentence_Lengths'] = df['Text'].apply(spacy_sentence_lengths)



In [20]:
# Sentence detection with NLTK's Punkt
def nltk_sentsplit(text):
    sentences = sent_tokenize(text)
    return [sentence.strip() for sentence in sentences]

df['NLTK_Sentences'] = df['Text'].apply(nltk_sentsplit)

# Get sentence lengths with NLTK's Punkt
def nltk_sentsplit_lengths(text):
    sentences = nltk_sentsplit(text)
    return [len(sentence) for sentence in sentences]

df['NLTK_Sentence_Lengths'] = df['Text'].apply(nltk_sentsplit_lengths)

In [21]:
def you_a_mismatch(row):
    # Initialize a list to store sentence length tuples across methods
    sentence_lengths = []
    
    # Use zip to pair sentences from Spacy_Sentences and NLTK_Sentences by length
    for sent1, sent2 in zip(row['Spacy_Sentences'], row['NLTK_Sentences']):
        sentence_lengths.append((len(sent1), len(sent2)))
    
    # Add any extra sentences from Spacy method if longer than NLTK
    sentence_lengths += [
        (len(sent1), 0) 
        for sent1 in row['Spacy_Sentences'][len(row['NLTK_Sentences']):]
    ]
    
    # Add extra sentences from NLTK method if longer than Spacy
    sentence_lengths += [
        (0, len(sent2)) 
        for sent2 in row['NLTK_Sentences'][len(row['Spacy_Sentences']):]
    ]
    
    return sentence_lengths

# Apply this function to the DataFrame to calculate mismatches
df['Sentence_Lengths_Mismatch'] = df.apply(you_a_mismatch, axis=1)

# Display the resulting mismatches column
print(df[['Text', 'Sentence_Lengths_Mismatch']])


                                                    Text  \
0      Tumblr girls were the nerdy outcasts who went ...   
1      what is the topic, then?\nYou're like the nigg...   
2      Lots of anons posting ITT about “making it” in...   
3      Hear me out:\n>Whitehouse hires fucktards to t...   
4      These are the three pillars of the US and west...   
...                                                  ...   
35814  Quoted By:\nSerious question: what's the plan ...   
35815  We need something like DEI detected for Anime....   
35816  Quoted By:\nwhat pisses me off is that this wh...   
35817  >DEI and Woke are fucking dead and buried alre...   
35818  Fuck off, we love our anime as it is.\n\nDEI a...   

                               Sentence_Lengths_Mismatch  
0                   [(112, 112), (218, 218), (103, 103)]  
1              [(24, 24), (128, 481), (63, 0), (290, 0)]  
2      [(90, 90), (85, 85), (87, 87), (15, 15), (58, ...  
3      [(12, 84), (73, 167), (166, 114), (1

In [22]:
def count_mismatches(row):
    # Initialize mismatch counter
    mismatch_count = 0
    
    # Compare sentences between Spacy and NLTK methods
    for sent1, sent2 in zip(row['Spacy_Sentences'], row['NLTK_Sentences']):
        if sent1 != sent2:
            mismatch_count += 1
    
    # Add mismatches for any extra sentences in either method
    mismatch_count += abs(len(row['Spacy_Sentences']) - len(row['NLTK_Sentences']))
    
    return mismatch_count

# Apply the function to create a column with the total number of mismatches
df['Total_Mismatches'] = df.apply(count_mismatches, axis=1)

# Display the resulting mismatches column
print(df[['Text', 'Total_Mismatches']])


                                                    Text  Total_Mismatches
0      Tumblr girls were the nerdy outcasts who went ...                 0
1      what is the topic, then?\nYou're like the nigg...                 3
2      Lots of anons posting ITT about “making it” in...                 5
3      Hear me out:\n>Whitehouse hires fucktards to t...                11
4      These are the three pillars of the US and west...                 0
...                                                  ...               ...
35814  Quoted By:\nSerious question: what's the plan ...               224
35815  We need something like DEI detected for Anime....                 0
35816  Quoted By:\nwhat pisses me off is that this wh...                77
35817  >DEI and Woke are fucking dead and buried alre...                22
35818  Fuck off, we love our anime as it is.\n\nDEI a...                17

[35819 rows x 2 columns]


In [9]:
df.to_csv("11_12brains.csv")

In [23]:
# testing performance

fdf = df[df['Total_Mismatches'] == 0]

In [24]:
fdf

Unnamed: 0.1,Unnamed: 0,Identifier,Text,id,Date,Thread No,Quoted By,Reply To,Spacy_Sentences,Spacy_Sentence_Lengths,NLTK_Sentences,NLTK_Sentence_Lengths,Sentence_Lengths_Mismatch,Total_Mismatches
0,18,Anonymous ID:RqQXr/xt Sat 01 Oct 2022 13:20:28...,Tumblr girls were the nerdy outcasts who went ...,RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,[],[],[Tumblr girls were the nerdy outcasts who went...,"[112, 218, 103]",[Tumblr girls were the nerdy outcasts who went...,"[112, 218, 103]","[(112, 112), (218, 218), (103, 103)]",0
4,34,Anonymous ID:O8h7xH1H Sun 02 Oct 2022 16:31:37...,These are the three pillars of the US and west...,O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,[],[],[These are the three pillars of the US and wes...,"[57, 55, 25, 70, 54]",[These are the three pillars of the US and wes...,"[57, 55, 25, 70, 54]","[(57, 57), (55, 55), (25, 25), (70, 70), (54, ...",0
5,36,Anonymous ID:JsLtzO4W Sun 02 Oct 2022 11:32:05...,"Haha, indeed. The DEI mind virus has infected ...",JsLtzO4W,Sun 02 Oct 2022 11:32:05,397991514,[],['>>397984711'],"[Haha, indeed., The DEI mind virus has infecte...","[13, 68, 89, 139]","[Haha, indeed., The DEI mind virus has infecte...","[13, 68, 89, 139]","[(13, 13), (68, 68), (89, 89), (139, 139)]",0
6,37,Anonymous ID:H47Qry4L Sun 02 Oct 2022 10:40:47...,"This is correct. Like anywhere else, the US ar...",H47Qry4L,Sun 02 Oct 2022 10:40:47,397984711,['Quoted By: >>397991514'],"['>>397991514', '>>397983953']","[This is correct., Like anywhere else, the US ...","[16, 89, 159, 113]","[This is correct., Like anywhere else, the US ...","[16, 89, 159, 113]","[(16, 16), (89, 89), (159, 159), (113, 113)]",0
7,38,Anonymous ID:9ILswUs7 Sun 02 Oct 2022 08:20:57...,is he Opus dei? Our president is as well and g...,9ILswUs7,Sun 02 Oct 2022 08:20:57,397967825,[],['>>397967177'],"[is he Opus dei?, Our president is as well and...","[15, 52, 65]","[is he Opus dei?, Our president is as well and...","[15, 52, 65]","[(15, 15), (52, 52), (65, 65)]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35804,73894,Anonymous Fri 25 Oct 2024 07:32:25 No.48589611...,"My company did this, it had something to do wi...",No ID,Anonymous Fri 25 Oct 2024 07:32:25,485896112,['Quoted By:\n>>692835878'],['>>692835878'],"[My company did this, it had something to do w...","[123, 96]","[My company did this, it had something to do w...","[123, 96]","[(123, 123), (96, 96)]",0
35806,73896,Anonymous Fri 25 Oct 2024 07:25:59 No.48589610...,>If the numbers don't dwindle that will show i...,No ID,Anonymous Fri 25 Oct 2024 07:25:59,485896104,['Quoted By:\n>>485896081'],['>>485896081'],[>If the numbers don't dwindle that will show ...,"[141, 92]",[>If the numbers don't dwindle that will show ...,"[141, 92]","[(141, 141), (92, 92)]",0
35808,73898,Anonymous Fri 25 Oct 2024 07:06:45 No.48589608...,Anti-woke bros what happened??? I thought us c...,No ID,Anonymous Fri 25 Oct 2024 07:06:45,485896083,['Quoted By: >>485896123'],"['>>485896123', '>>485896077']","[Anti-woke bros what happened???, I thought us...","[31, 79, 59, 113]","[Anti-woke bros what happened???, I thought us...","[31, 79, 59, 113]","[(31, 31), (79, 79), (59, 59), (113, 113)]",0
35809,73899,Anonymous Fri 25 Oct 2024 07:04:34 No.48589608...,Quoted By:\n>DoD propaganda machine going all ...,No ID,Anonymous Fri 25 Oct 2024 07:04:34,485896082,[],[],[Quoted By:\n>DoD propaganda machine going all...,[123],[Quoted By:\n>DoD propaganda machine going all...,[123],"[(123, 123)]",0


In [25]:
fdf.to_csv("nov12hello.csv")