In [1]:
import json
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import sys
import os

In [2]:
# Merge every CSV in `input_dir` into one frame, keep only the rows whose
# 'Date' parses, order them chronologically and write them to `output_file`.
# NOTE: these are raw strings, so each "\\" is two literal backslashes (the
# printed path in the cell output shows them doubled).
input_dir = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\all_csvs"
output_file = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\merged_sorted2.csv"

# Collect all CSV files in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to make the concatenation order reproducible.
csv_files = sorted(
    os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.csv')
)

# Read and concatenate all CSV files
all_data = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Ensure 'Date' column is parsed as datetime (unparseable values become NaT)
all_data['Date'] = pd.to_datetime(all_data['Date'], errors='coerce')

# Drop rows where 'Date' couldn't be parsed
all_data = all_data.dropna(subset=['Date'])

# Sort by 'Date' with a stable algorithm so rows sharing a timestamp keep the
# relative order they had after concatenation (the default sort is not stable).
all_data = all_data.sort_values(by='Date', kind='stable')

# Save the merged and sorted data to a single CSV file
all_data.to_csv(output_file, index=False, encoding="utf-8")

print(f"All CSV files merged and sorted by date. Saved to: {output_file}")


All CSV files merged and sorted by date. Saved to: C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\merged_sorted2.csv


In [56]:
# Split the merged CSV into N_SPLITS chunks of (nearly) equal row count and
# widen each chunk by OVERLAP on both sides, so neighbouring splits share the
# rows that fall inside the overlap window.
# NOTE(review): the merge cell writes "merged_sorted2.csv" while this cell
# reads "merged_sorted.csv" — confirm which file is intended.
input_file = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\merged_sorted.csv"
output_dir = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs"

N_SPLITS = 6                      # number of output files
OVERLAP = pd.Timedelta(weeks=3)   # date window added before and after each chunk

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load the CSV and ensure Date is parsed correctly
df = pd.read_csv(input_file)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['Date'])

# Sort the DataFrame by Date to ensure order
df = df.sort_values(by='Date').reset_index(drop=True)

total_rows = len(df)
# With fewer rows than splits the chunk size would be 0 and the index
# arithmetic below would wrap around (iloc[-1]) or fail on an empty frame.
if total_rows < N_SPLITS:
    raise ValueError(
        f"Need at least {N_SPLITS} dated rows to build {N_SPLITS} splits, found {total_rows}"
    )
chunk_size = total_rows // N_SPLITS

for i in range(N_SPLITS):
    # Row bounds of the main chunk; the last chunk absorbs the remainder.
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i < N_SPLITS - 1 else total_rows

    # Date bounds of the chunk, widened by the overlap window.
    start_date = df['Date'].iloc[start_idx] - OVERLAP
    end_date = df['Date'].iloc[end_idx - 1] + OVERLAP

    # Select by date (not by row index) so the overlap rows are included.
    overlap_chunk = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

    # Save each split
    output_file = os.path.join(output_dir, f"split_{i+1}.csv")
    overlap_chunk.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved split {i+1} to {output_file}")

Saved split 1 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_1.csv
Saved split 2 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_2.csv
Saved split 3 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_3.csv
Saved split 4 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_4.csv
Saved split 5 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_5.csv
Saved split 6 to C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\split_6.csv


In [5]:
# Load the merged CSV through a path relative to the current working directory.
# NOTE(review): `fdf` is not referenced again in the visible cells — confirm it is still needed.
fdf = pd.read_csv("merged_sorted.csv")

In [None]:
# Locations used by the HTML-to-CSV conversion below: the folder scanned for
# .txt files, the folder receiving one CSV per input file, and the path of the
# single merged CSV.
# NOTE: `input_dir` and `output_dir` rebind names that earlier cells also assign.
input_dir = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\all"
output_dir = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\processed_csvs"
merged_csv_path = r"C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\merged_processed.csv"

# Create the per-file output folder if it is missing.
os.makedirs(output_dir, exist_ok=True)

def return_json(input_string):
    """Parse thread HTML and return its posts as a JSON string.

    Every ``<div class="post_wrapper">`` found in ``input_string`` becomes one
    record with four sections: ``post_data`` (author, tripcode, poster hash,
    timestamp, post id and link), ``controls`` (hrefs of the post-control
    links), ``backlink_list`` (ids of quoting posts) and ``text_content``
    (post text plus hrefs of ``a.backlink`` anchors).

    A missing element or attribute yields an empty string for scalar fields
    and is skipped for list fields, instead of raising ``KeyError``.

    Args:
        input_string: HTML markup to parse.

    Returns:
        str: A JSON array (``indent=4``) with one object per post.
    """
    def _text(tag):
        # Stripped text of a tag, or "" when the tag was not found.
        return tag.text.strip() if tag is not None else ""

    def _attr(tag, name):
        # Attribute value, or "" when the tag or the attribute is missing.
        return tag.get(name, "") if tag is not None else ""

    soup = BeautifulSoup(input_string, 'html.parser')
    posts_data = []
    for post_wrapper in soup.find_all("div", class_="post_wrapper"):
        # Look each element up once instead of once for the test and once for the value.
        time_tag = post_wrapper.find("time")
        quote_link = post_wrapper.find("a", {"data-function": "quote"})
        text_div = post_wrapper.find("div", class_="text")

        posts_data.append({
            "post_data": {
                "author": _text(post_wrapper.find("span", class_="post_author")),
                "tripcode": _text(post_wrapper.find("span", class_="post_tripcode")),
                "poster_hash": _text(post_wrapper.find("span", class_="poster_hash")),
                "datetime": _attr(time_tag, "datetime"),
                "time_text": _text(time_tag),
                "post_id": _attr(quote_link, "data-post"),
                "post_link": _attr(quote_link, "href"),
            },
            "controls": {
                "controls_links": [
                    control.get("href", "#")
                    for control in post_wrapper.select(".post_controls a")
                ]
            },
            "backlink_list": {
                "quoted_by": [
                    backlink["data-post"]
                    for backlink in post_wrapper.select(".post_backlink")
                    if backlink.has_attr("data-post")
                ]
            },
            "text_content": {
                "text": text_div.get_text(separator="\n").strip() if text_div is not None else "",
                # The original `if link` filter was always true for a Tag;
                # the real risk is an anchor without an href.
                "greentext_links": [
                    link["href"]
                    for link in post_wrapper.find_all("a", class_="backlink")
                    if link.has_attr("href")
                ]
            }
        })
    return json.dumps(posts_data, indent=4)

# Convert every .txt file in `input_dir` into a flat CSV in `output_dir`
# (same base name), then write all of them concatenated to `merged_csv_path`.
dataframes = []

# os.listdir() order is arbitrary; sorting keeps the row order of the merged
# CSV reproducible between runs.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(input_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # return_json() hands back a JSON string, so decode it before flattening
    # the nested records into dotted column names.
    json_data = json.loads(return_json(text))
    df = pd.json_normalize(json_data)
    csv_filename = os.path.splitext(filename)[0] + ".csv"
    csv_output_path = os.path.join(output_dir, csv_filename)
    df.to_csv(csv_output_path, index=False)
    dataframes.append(df)

# pd.concat() raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not dataframes:
    raise FileNotFoundError(f"No .txt files found in {input_dir}")

merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv(merged_csv_path, index=False)


In [2]:
# Load the merged post table through a path relative to the working directory.
# NOTE(review): the conversion cell writes to the absolute `merged_csv_path`;
# this only finds the same file if the working directory is that folder — confirm.
df = pd.read_csv("merged_processed.csv")

In [None]:
# Convert every post_link value to its str() form (a missing value becomes 'nan').
# NOTE(review): `da` is only created in a later cell of this file, so on a
# fresh kernel this cell raises NameError unless that cell has run first.
da['post_link'] = da['post_link'].apply(str)

0        https://archive.4plebs.org/pol/thread/39773330...
1        https://archive.4plebs.org/pol/thread/39766019...
2        https://archive.4plebs.org/pol/thread/39765823...
3        https://archive.4plebs.org/pol/thread/39761295...
4        https://archive.4plebs.org/pol/thread/39761295...
                               ...                        
46797    https://archive.4plebs.org/pol/thread/47318974...
46798    https://archive.4plebs.org/pol/thread/47317472...
46799    https://archive.4plebs.org/pol/thread/47317472...
46800    https://archive.4plebs.org/pol/thread/47318612...
46801                                                  NaN
Name: post_link, Length: 46802, dtype: object

In [7]:
# Keep only the post fields used downstream and give them short names.
# The mapping's insertion order fixes the column order of `da`.
_column_names = {
    'post_data.poster_hash': 'poster_ID',
    'post_data.datetime': 'date',
    'post_data.post_id': 'post_id',
    'post_data.post_link': 'post_link',
    # 'backlink_list.quoted_by': 'quoted_by',   # deliberately left out
    'text_content.text': 'text',
}
da = df[list(_column_names)].rename(columns=_column_names)

def threadno_get(url):
    """Extract the thread number from a post URL.

    Args:
        url: String searched for the first ``/thread/<digits>`` segment.

    Returns:
        str: The digits following ``/thread/``, or a fixed placeholder
        message when the URL contains no such segment.
    """
    match = re.search(r'/thread/(\d+)', url)
    return match.group(1) if match else "No Thread Number Found... Uh Oh..."
# Work on string values throughout: the regex helpers below require str input.
da = da.astype(str)

# Thread number taken from each post's link.
da['Thread_No'] = da['post_link'].map(threadno_get)

# ">>123"-style references: collect the quoted ids first, then remove the
# markers (and the whitespace after them) from the text itself.
_quote_id = re.compile(r'>>(\d+)')
_quote_marker = re.compile(r'>>\d+\s*')
da['Reply_To'] = da['text'].map(lambda body: _quote_id.findall(body))
da['text'] = da['text'].map(lambda body: _quote_marker.sub('', body).strip())

In [6]:
def _read_unique_texts(directory, suffix=".txt"):
    """Return the distinct contents of every ``suffix`` file in ``directory``.

    Files are read in sorted name order and duplicates are dropped while
    keeping first-seen order, so the result is reproducible between runs.
    Each file is opened in a ``with`` block so its handle is closed promptly.
    """
    contents = []
    for name in sorted(os.listdir(directory)):
        if name.endswith(suffix):
            with open(os.path.join(directory, name), encoding="utf-8") as handle:
                contents.append(handle.read())
    # dict.fromkeys de-duplicates like set() but preserves insertion order.
    return list(dict.fromkeys(contents))


# Unique file contents from the "processed" folder (one string per distinct file body).
dei_mention_list = _read_unique_texts("processed")


In [24]:
# Break each file body into its newline-separated entries (one list per file).
lines = [blob.split("\n") for blob in dei_mention_list]

In [None]:
# Flatten the per-file lists into a single list of entries.
lanes = [entry for chunk in lines for entry in chunk]

In [83]:
# Inspection cell: display the flattened list (the output is long).
lanes

['8045713',
 '439441050',
 '452578523',
 '433832069',
 '452411868',
 '453296560',
 '442952112',
 '434097257',
 '445834607',
 '442907755',
 '450502877',
 '435323874',
 '437311810',
 '435787164',
 '448122796',
 '448004799',
 '448757489',
 '449047873',
 '454107826',
 '445258431',
 '452430461',
 '452422749',
 '449248170',
 '452508934',
 '450797339',
 '436185193',
 '453143773',
 '449425660',
 '453414810',
 '452383475',
 '439442456',
 '451339439',
 '439606909',
 '451403331',
 '453737632',
 '437954735',
 '448720908',
 '453181998',
 '442242287',
 '453331793',
 '454196102',
 '453916777',
 '436474143',
 '441710204',
 '442428965',
 '452654852',
 '448844509',
 '439879707',
 '453105423',
 '440151724',
 '448972384',
 '437013339',
 '441237506',
 '442298717',
 '448836141',
 '435805371',
 '448527960',
 '449361688',
 '446045155',
 '435482716',
 '442038982',
 '452768762',
 '436712799',
 '445108706',
 '453919068',
 '436649603',
 '442613874',
 '448483062',
 '448078381',
 '443342355',
 '449153510',
 '443992

In [29]:
# De-duplicate the entries. list(set(...)) yields a different order on every
# run (string hashing is randomised), which makes positional look-ups such as
# uniq[2] irreproducible; sorting gives a stable order.
uniq = sorted(set(lanes))

In [31]:
# Persist the de-duplicated entries, one per line (no trailing newline).
with open("dei_mention_threadnumbers.txt", "w", encoding="utf-8") as out_handle:
    out_handle.write("\n".join(str(item) for item in uniq))

In [8]:
from lingua import Language, LanguageDetectorBuilder

# Language detector setup.
# The detector is restricted to Latin, English and Italian, so the confidence
# values it reports are relative to these three candidates only.
languages = [Language.LATIN, Language.ENGLISH, Language.ITALIAN]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [9]:
def latin_italian_exterminator(s):
    """Return the detector's confidence that ``s`` is Latin and Italian.

    Args:
        s: Text passed to the module-level ``detector``.

    Returns:
        tuple: ``(latin, italian)`` confidence values, each rounded to two
        decimals.
    """
    def _rounded_confidence(language):
        raw = detector.compute_language_confidence(s, language)
        return float(format(raw, ".2f"))

    # Latin is queried first, then Italian, matching the tuple order.
    return (_rounded_confidence(Language.LATIN), _rounded_confidence(Language.ITALIAN))

def is_latin_or_italian(word, threshold=0.55):
    """Return True when ``word`` is NOT confidently Latin or Italian.

    NOTE: the name reads the opposite way. The function returns ``False`` as
    soon as either the Latin or the Italian confidence exceeds ``threshold``
    and ``True`` otherwise; callers rely on that polarity.

    Args:
        word: Text scored by ``latin_italian_exterminator``.
        threshold: Confidence above which a language counts as detected
            (default 0.55, the value previously hard-coded).

    Returns:
        bool: ``True`` if neither confidence exceeds ``threshold``.
    """
    # Score once; both values come back from a single call.
    latin_cv, italian_cv = latin_italian_exterminator(word)
    return not (latin_cv > threshold or italian_cv > threshold)

def check_dei_english_status(tokens):
    """Classify a token list by the neighbours of its 'dei' tokens.

    For each token equal to 'dei' (case-insensitive) the adjacent tokens are
    passed to ``is_latin_or_italian``, which returns True when a token is
    *not* confidently Latin/Italian. "English" is returned as soon as:

    * both neighbours are non-empty and both pass the helper check, or
    * only one neighbour is non-empty and it passes the helper check.

    When both neighbours are non-empty but the pair fails, that occurrence is
    skipped and the scan continues. Missing and empty-string neighbours are
    treated alike.

    Args:
        tokens: Indexable sequence of strings.

    Returns:
        str: "English" or "NotEnglish".
    """
    for pos, token in enumerate(tokens):
        if token.lower() != 'dei':
            continue

        prev_tok = tokens[pos - 1] if pos > 0 else None
        next_tok = tokens[pos + 1] if pos < len(tokens) - 1 else None

        if prev_tok and next_tok:
            # Both sides present: both must pass, no single-side fallback.
            if is_latin_or_italian(prev_tok) and is_latin_or_italian(next_tok):
                return "English"
            continue

        # At most one side is present from here on.
        if prev_tok and is_latin_or_italian(prev_tok):
            return "English"
        if next_tok and is_latin_or_italian(next_tok):
            return "English"

    return "NotEnglish"

In [27]:
# Inspection cell: type of the element at position 1 of the first 'text' value.
type(da['text'][0][1])

str

In [28]:
# Tokenise on single spaces only, so consecutive spaces yield empty-string tokens.
da['Tokens'] = da['text'].map(lambda body: body.split(" "))

In [38]:
# Label every row from the neighbours of its 'dei' tokens and keep the rows
# judged "English".
da['Lang_Check'] = da['Tokens'].apply(check_dei_english_status)
dat = da[da['Lang_Check'] == "English"]

# Drop a trailing ".0" from the stringified post ids (presumably left over
# from a float column cast to str — verify against the data). The original
# blind m[:-2] also cut two characters off ids that had no such suffix.
root_dei_mentions = [m[:-2] if m.endswith(".0") else m for m in dat['post_id']]

In [41]:
# Inspection cell: look at one entry of `uniq`.
uniq[2]

'412619880'

In [62]:
# Load the first split file produced by the splitting cell.
# NOTE(review): absolute, machine-specific path; the final JSON file name
# mentions "split6" while this reads split_1.csv — confirm which split is meant.
adf = pd.read_csv("C:\\Users\\emzou\\Desktop\\pol\\txt_splits\\split_csvs\\split_1.csv",encoding = 'utf8')

In [66]:
# Cast 'No' to str so it can be compared with the string ids held in `uniq`.
adf['No'] = adf['No'].astype(str)

In [67]:
# Confirm the cast: print the dtype of the 'No' column.
print(adf['No'].dtype)

object


In [None]:
# Stringify the reply references, then collect every 'No' value as a plain
# Python string for membership tests against `uniq`.
adf['Replied_To'] = adf['Replied_To'].map(str)
adlist = list(map(str, adf['No']))

In [73]:
# Inspection cell: type of the first entry of `adlist`.
type(adlist[0])

str

In [75]:
# Inspection cell: first character of the first entry of `adlist`.
adlist[0][0]

'3'

In [74]:
# Inspection cell: type of the first entry of `uniq` (presumably checked so
# both lists hold the same type before they are compared).
type(uniq[0])

str

In [79]:
# Inspection cell: first character of the second entry of `uniq`.
uniq[1][0]

'4'

In [80]:
# Entries of `uniq` that also occur in `adlist`, in the order of `uniq`.
# Membership is tested against a set: checking each id against the list
# itself rescans the whole list every time (quadratic overall).
_adlist_lookup = set(adlist)
tlist = [m for m in uniq if m in _adlist_lookup]

In [None]:
def build_downward_tree(post_id, df, depth=0, max_depth=None):
    """Recursively build the reply tree rooted at ``post_id``.

    The post is looked up with ``df['No'] == post_id``; the ids listed in its
    'Quoted_By' cell become child nodes. Ids in 'Quoted_By' must therefore
    have the same type as the values in ``df['No']`` for replies to be found.

    Args:
        post_id: Id of the post to expand.
        df: DataFrame with 'No' and 'Quoted_By' columns; 'The_Text', 'Date'
            and 'ID' are used when present.
        depth: Current recursion depth (0 for a root).
        max_depth: Stop expanding once ``depth`` reaches this value;
            ``None`` means no limit.

    Returns:
        dict: A full node with keys "Poster_Id", "ID", "Text", "Reply_Count",
        "Replies" and "Date", or a stub ``{"ID": ..., "Replies": []}`` when
        the depth limit is hit or the post is not in ``df``.
    """
    import ast  # local import keeps this block self-contained

    if max_depth is not None and depth >= max_depth:
        return {"ID": post_id, "Replies": []}  # Stop recursion at max depth

    # NOTE: this is a full column scan per node; fine for small frames.
    row = df[df['No'] == post_id]
    if row.empty:
        return {"ID": post_id, "Replies": []}  # No data found for this ID
    first = row.iloc[0]

    raw_text = first['The_Text'] if 'The_Text' in row.columns else None
    # NOTE(review): r"\\n" is the three characters backslash-backslash-n, not
    # a newline and not a single-backslash "\n" — confirm this matches the data.
    text = raw_text.replace(r"\\n", " ").strip() if isinstance(raw_text, str) else None

    quoted_by = first['Quoted_By']
    if not isinstance(quoted_by, list):
        if isinstance(quoted_by, str):
            # The cell comes from a CSV of scraped content: parse it as a
            # literal only. eval() would execute arbitrary expressions.
            try:
                parsed = ast.literal_eval(quoted_by)
            except (ValueError, SyntaxError):
                parsed = []  # malformed cell: treat as "no replies"
            quoted_by = list(parsed) if isinstance(parsed, (list, tuple, set)) else []
        else:
            quoted_by = []  # e.g. NaN for an empty cell

    # Build the tree for each quoting post
    replies = [
        build_downward_tree(reply_id, df, depth=depth + 1, max_depth=max_depth)
        for reply_id in quoted_by
    ]
    return {
        "Poster_Id": first['ID'] if 'ID' in row.columns else None,
        "ID": post_id,
        "Text": text,
        "Reply_Count": len(quoted_by),
        "Replies": replies,
        "Date": first['Date'] if 'Date' in row.columns else None
    }

def generate_trees(root_ids, df, max_depth=6):
    """Build one reply tree per root post id.

    Args:
        root_ids: Iterable of post ids to use as tree roots.
        df: DataFrame handed to ``build_downward_tree`` for every root.
        max_depth: Depth limit forwarded to ``build_downward_tree``.

    Returns:
        list: One tree dict per entry of ``root_ids``, in the same order.
    """
    trees = []
    for root_id in root_ids:
        trees.append(build_downward_tree(root_id, df, depth=0, max_depth=max_depth))
    return trees





In [37]:
# Build a reply tree (at most 6 levels deep) for every id in `tlist`, using
# the loaded split `adf` as the post table.
conversation_trees = generate_trees(tlist, adf, max_depth = 6)

In [38]:
# Number of trees built.
# NOTE(review): the recorded output here is 0 while the filtered list further
# down reports 5983, so the saved outputs come from different runs.
len(conversation_trees)

0

In [116]:
# Keep only the trees whose root node has a 'Text' entry containing the
# standalone word "DEI" (any letter case).
filtered_conversation_trees = list(filter(
    lambda node: 'Text' in node and re.search(r'\bDEI\b', str(node['Text']), re.IGNORECASE),
    conversation_trees,
))

In [117]:
# Number of trees that survived the "DEI" text filter.
len(filtered_conversation_trees)

5983

In [118]:
# Persist the filtered trees as pretty-printed JSON.
# The encoding is stated explicitly, like the other open() calls in this
# notebook; json.dump escapes non-ASCII characters by default, so the bytes
# written do not depend on it.
# NOTE(review): the file name says "split6" but the visible loading cell reads
# split_1.csv — confirm the name matches the data.
with open('conversation_tree_baby1_split6.json', 'w', encoding="utf-8") as f:
    json.dump(filtered_conversation_trees, f, indent=4)