In [1]:
import os
import pandas as pd
from pathlib import Path 
import re 
from lingua import Language, LanguageDetectorBuilder

In [2]:
directory = "data"
output_file = 'raw_merged_22_24.txt'
content_list = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            content_list.append(content)

merged_content = '\n\n'.join(content_list)

with open(output_file, 'w', encoding='utf-8') as output:
    output.write(merged_content)

In [69]:
def read_file_as_string(file_path):
    text = Path(file_path).read_text(encoding='utf-8')
    return text

filepath = 'raw_merged_22_24.txt'
text = read_file_as_string(filepath)

In [70]:
def split_text_by_identifier_and_content(text):
    sections = []
    
    pattern_with_id = r'(\S+\s*ID:[\S]+\s+\w{3}\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}.*?)\s+ViewReport(.*?)(?=\s+\S+\s*ID:|\Z)'
    match_with_id = re.findall(pattern_with_id, text, flags=re.DOTALL)
    
    if match_with_id:
        for section in match_with_id:
            sections.append((section[0].strip(), section[1].strip()))
    
    pattern_without_id = r'(\w+\s+\w{3}\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+No\.\d+.*?)\s+ViewReport(.*?)(?=\s+\w+\s+\w{3}\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+No\.|\Z)'
    match_without_id = re.findall(pattern_without_id, text, flags=re.DOTALL)
    
    if match_without_id:
        for section in match_without_id:
            sections.append((section[0].strip(), section[1].strip()))
    
    return sections

def extract_and_process_replies(text):
    sections = split_text_by_identifier_and_content(text)
    
    df = pd.DataFrame(sections, columns=['Identifier', 'Text'])
    
    df['id'] = df['Identifier'].apply(lambda text: re.findall(r'ID:([^\s]+)', text)[0] if re.findall(r'ID:([^\s]+)', text) else "No ID")
    
    df['Date'] = df['Identifier'].apply(lambda text: re.findall(r'ID:\S+\s+(\S+\s+\d{2}\s+\S+\s+\d{4}\s+\d{2}:\d{2}:\d{2})', text)[0] if re.findall(r'ID:\S+\s+(\S+\s+\d{2}\s+\S+\s+\d{4}\s+\d{2}:\d{2}:\d{2})', text) else re.findall(r'(\w+\s+\w{3}\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2})', text)[0] if re.findall(r'(\w+\s+\w{3}\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2})', text) else None)
    
    df['Thread No'] = df['Identifier'].apply(lambda text: re.findall(r'No\.(\d+)', text)[0] if re.findall(r'No\.(\d+)', text) else None)
    
    df['Quoted By'] = df['Text'].apply(lambda text: re.findall(r'quoted by:\s*>>\d+', text, flags=re.IGNORECASE))
    df['Reply To'] = df['Text'].apply(lambda text: re.findall(r'>>\d+', text))
    
    df['Text'] = df['Text'].apply(lambda text: re.sub(r'quoted by:\s*>>\d+\s*', '', text, flags=re.IGNORECASE).strip())
    df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())
    df['Text'] = df['Text'].apply(lambda text: re.sub(r'No\.\d+\s*', '', text).strip())
    
    return df




In [71]:
df = pd.DataFrame(extract_and_process_replies(text))

In [72]:
df

Unnamed: 0,Identifier,Text,id,Date,Thread No,Quoted By,Reply To
0,Anonymous ID:RqQXr/xt Sat 01 Oct 2022 13:20:28...,Tumblr girls were the nerdy outcasts who went ...,RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,[],[]
1,Anonymous ID:+XF1CsQm Sat 01 Oct 2022 12:41:09...,"what is the topic, then?\nYou're like the nigg...",+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,[],[>>397853706]
2,Anonymous ID:sdTLdxFE Sat 01 Oct 2022 12:28:40...,He tried to usurp Roman Paganism.\n\n>Since th...,sdTLdxFE,Sat 01 Oct 2022 12:28:40,397852409,[],[>>397851060]
3,Anonymous ID:Ytn2j+6s Sat 01 Oct 2022 10:01:18...,Lots of anons posting ITT about “making it” in...,Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,[],[>>397814529]
4,StreamRift ID:JRUEuylR Sat 01 Oct 2022 00:06:0...,Hear me out:\n>Whitehouse hires fucktards to t...,JRUEuylR,Sat 01 Oct 2022 00:06:02,397778799,[],[]
...,...,...,...,...,...,...,...
73918,Anonymous Tue 29 Oct 2024 23:03:00 No.48636551...,Quoted By:\nI already knew about those DEI cre...,No ID,Anonymous Tue 29 Oct 2024 23:03:00,486365514,"[Quoted By:\n>>486361706, Quoted By:\n>>486355...","[>>486361706, >>486355855, >>486355477, >>4866..."
73919,Anonymous Thu 31 Oct 2024 06:00:13 No.48651772...,We need something like DEI detected for Anime....,No ID,Anonymous Thu 31 Oct 2024 06:00:13,486517727,[Quoted By:\n>>486517721],[>>486517721]
73920,Anonymous Thu 31 Oct 2024 05:27:43 No.48651772...,Quoted By:\nwhat pisses me off is that this wh...,No ID,Anonymous Thu 31 Oct 2024 05:27:43,486517720,"[Quoted By:\n>>486479887, Quoted By:\n>>486484...","[>>486479887, >>486484912, >>486485110, >>4864..."
73921,Anonymous Wed 30 Oct 2024 22:16:47 No.48651761...,>DEI and Woke are fucking dead and buried alre...,No ID,Anonymous Wed 30 Oct 2024 22:16:47,486517619,"[Quoted By:\n>>486517614, Quoted By:\n>>486464...","[>>486517614, >>486464882, >>486464363, >>4864..."


In [73]:
# strip website links from the text
# it means 'image of god' in latin 
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())
# strip 'imago dei' comments from the text
df = df[~df['Text'].str.contains('imago', case=False, na=False)]
df = df[~df['Text'].str.contains('amplissimus', case=False, na=False)]
# strip Post Reply
postpattern = r'Post\nReply'
df['Text'] = df['Text'].apply(lambda text: re.sub(postpattern, '', text).strip())

In [74]:
# trying to get rid of this pattern for the millionth time 
metapattern = r'.{5}(sameocrgoogleiqdbsaucenaotrace).*'
df['Text'] = df['Text'].apply(lambda text: re.sub(metapattern, '', text ).strip())
# trying to get rid of this pattern for the millionth time 
metapattern2 = r'.{5}(samegoogleiqdbsaucenaotrace).*'
df['Text'] = df['Text'].apply(lambda text: re.sub(metapattern2, '', text ).strip())

In [75]:
# LATIN EXTERMINATION!!! 
# lingua-py (https://github.com/pemistahl/lingua-py)
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

def latin_exterminator(s):
    confidence_value = detector.compute_language_confidence(s, Language.LATIN)
    cv = float(f"{confidence_value:.2f}") 
    if cv >= 0.5:
        return None
    else: 
        return s

#use the latin exterminator
df['Text'] = df['Text'].apply(latin_exterminator)
df = df[df['Text'].notnull()]

# drop duplicates by anon-id (this only refers to the post, not the account)
df = df.drop_duplicates(subset = 'Identifier', keep = 'last')

In [76]:
df

Unnamed: 0,Identifier,Text,id,Date,Thread No,Quoted By,Reply To
18,Anonymous ID:RqQXr/xt Sat 01 Oct 2022 13:20:28...,Tumblr girls were the nerdy outcasts who went ...,RqQXr/xt,Sat 01 Oct 2022 13:20:28,397859125,[],[]
19,Anonymous ID:+XF1CsQm Sat 01 Oct 2022 12:41:09...,"what is the topic, then?\nYou're like the nigg...",+XF1CsQm,Sat 01 Oct 2022 12:41:09,397853920,[],[>>397853706]
21,Anonymous ID:Ytn2j+6s Sat 01 Oct 2022 10:01:18...,Lots of anons posting ITT about “making it” in...,Ytn2j+6s,Sat 01 Oct 2022 10:01:18,397834023,[],[>>397814529]
22,StreamRift ID:JRUEuylR Sat 01 Oct 2022 00:06:0...,Hear me out:\n>Whitehouse hires fucktards to t...,JRUEuylR,Sat 01 Oct 2022 00:06:02,397778799,[],[]
34,Anonymous ID:O8h7xH1H Sun 02 Oct 2022 16:31:37...,These are the three pillars of the US and west...,O8h7xH1H,Sun 02 Oct 2022 16:31:37,398032820,[],[]
...,...,...,...,...,...,...,...
73917,Anonymous Wed 30 Oct 2024 09:41:56 No.48640702...,Quoted By:\nSerious question: what's the plan ...,No ID,Anonymous Wed 30 Oct 2024 09:41:56,486407029,"[Quoted By:\n>>486400819, Quoted By:\n>>486400...","[>>486400819, >>486400723, >>486378202, >>4863..."
73919,Anonymous Thu 31 Oct 2024 06:00:13 No.48651772...,We need something like DEI detected for Anime....,No ID,Anonymous Thu 31 Oct 2024 06:00:13,486517727,[Quoted By:\n>>486517721],[>>486517721]
73920,Anonymous Thu 31 Oct 2024 05:27:43 No.48651772...,Quoted By:\nwhat pisses me off is that this wh...,No ID,Anonymous Thu 31 Oct 2024 05:27:43,486517720,"[Quoted By:\n>>486479887, Quoted By:\n>>486484...","[>>486479887, >>486484912, >>486485110, >>4864..."
73921,Anonymous Wed 30 Oct 2024 22:16:47 No.48651761...,>DEI and Woke are fucking dead and buried alre...,No ID,Anonymous Wed 30 Oct 2024 22:16:47,486517619,"[Quoted By:\n>>486517614, Quoted By:\n>>486464...","[>>486517614, >>486464882, >>486464363, >>4864..."


In [77]:
df.to_csv("nov12_dataset_full.csv")