In [None]:
import pandas as pd
from pathlib import Path 
import re
from collections import Counter
import nltk 
import string 
nltk.download('punkt')
from statistics import median
from statistics import mean
from lingua import Language, LanguageDetectorBuilder
import spacy
nlp = spacy.load("en_core_web_sm")
from transformers import pipeline
from spacy.pipeline import Sentencizer
from nltk.tokenize import sent_tokenize

# Setup notes:
# - NumPy needs to be downgraded to a version below 2.0.
# - On Windows, long paths need to be enabled: https://www.microfocus.com/documentation/filr/filr-4/filr-desktop/t47bx2ogpfz7.html
# - On Windows, this also needs to be done through REGEDIT.
# - On Windows 10, add gpedit.msc this way: https://www.reddit.com/r/AnnoyingTech/comments/ojru3t/adding_gpeditmsc_on_your_windows_home/

In [11]:
## Load the dataset. The GitHub copy ships as two CSV parts that are stacked here;
## with the single-file copy, read "11_6_fulldataset.csv" (index_col=0) instead.
df1, df2 = (
    pd.read_csv(part_name)
    for part_name in ("11_6_fulldatapart1.csv", "11_6_fulldatapart2.csv")
)
df = pd.concat([df1, df2], ignore_index=True)

In [12]:
# Lowercase every post up front: all pattern matching on Text after this
# point sees lowercase input.
df['Text'] = df['Text'].str.lower()

In [13]:
### cleaning, processing, tagging
## categorizing quoted by 
def process_quotes(s):
    """Extract the post numbers listed in a "Quoted By" identifier.

    Parameters
    ----------
    s : str
        Raw identifier text of one post.

    Returns
    -------
    list of str or str
        When ``s`` contains "Quoted By": the digits of every ``>>12345``
        reference that is directly followed by a newline (possibly an empty
        list). Otherwise the sentinel string "No Quote". Values that are not
        strings (for example NaN from an empty CSV cell) also give
        "No Quote" instead of raising TypeError.
    """
    if not isinstance(s, str) or "Quoted By" not in s:
        return "No Quote"
    return re.findall(r'>>(\d+)\n', s)
# one entry per post: the list of quoting post numbers, or the string "No Quote"
df['quotedby'] = df['Identifier'].apply(process_quotes)
## removing it from the text 
def stripper(s):
    """Remove the "Quoted By:" label and its ``>>12345`` lines from a post.

    The match ignores case: this function is applied to the Text column
    after that column has been lowercased, so a case-sensitive test for
    "Quoted By" could never succeed there.

    Parameters
    ----------
    s : str
        Text of one post.

    Returns
    -------
    str
        ``s`` without the label and without every ``>>digits`` reference
        that is followed by a newline, stripped of surrounding whitespace.
        Text that carries no "Quoted By" label, and any non-string value,
        is returned unchanged.
    """
    if not isinstance(s, str) or not re.search(r'Quoted By', s, flags=re.IGNORECASE):
        return s
    return re.sub(r'Quoted By:|>>\d+\n', '', s, flags=re.IGNORECASE).strip()
df['Text'] = df['Text'].apply(stripper)

## getting the reply-to out
# collect the post numbers being replied to, then remove the >>12345 markers
df['replyto'] = df['Text'].apply(lambda text: re.findall(r'>>(\d+)', text))
df['Text'] = df['Text'].apply(lambda text: re.sub(r'>>\d+\s*', '', text).strip())

# strip website links from the text
sitepattern = r'(?:https?://|www\.)\S+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?:/[^\s]*)?'
df['Text'] = df['Text'].apply(lambda text: re.sub(sitepattern, '', text).strip())

# drop every post that mentions 'imago' (as in 'imago dei', which means
# 'image of god' in latin) or 'amplissimus'
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a filtered slice (SettingWithCopyWarning)
latin_boilerplate = df['Text'].str.contains('imago|amplissimus', case=False, na=False)
df = df[~latin_boilerplate].copy()

# strip Post Reply
# Text was lowercased earlier in the notebook, so the match has to ignore
# case; the case-sensitive pattern could never match.
postpattern = r'Post\nReply'
df['Text'] = df['Text'].apply(
    lambda text: re.sub(postpattern, '', text, flags=re.IGNORECASE).strip()
)

In [14]:
# Remove the "sameocrgoogleiqdbsaucenaotrace" marker together with the five
# characters in front of it and whatever follows it on the same line
# (this pattern has been stubborn to get rid of).
metapattern = r'.{5}(sameocrgoogleiqdbsaucenaotrace).*'
meta_regex = re.compile(metapattern)
df['Text'] = df['Text'].apply(lambda post: meta_regex.sub('', post).strip())

In [15]:
# LATIN EXTERMINATION!!!
# lingua-py (https://github.com/pemistahl/lingua-py)
# The detector is built from exactly two candidate languages, Latin and
# English; it is used below to score each post for Latin.
languages = [Language.LATIN, Language.ENGLISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

def latin_exterminator(s):
    """Blank out a post that the detector scores as Latin.

    The Latin confidence reported by ``detector`` for ``s`` is rounded to
    two decimals. A rounded value of 0.5 or more returns None; anything
    else returns ``s`` unchanged.
    """
    latin_score = detector.compute_language_confidence(s, Language.LATIN)
    rounded_score = float(format(latin_score, ".2f"))
    return None if rounded_score >= 0.5 else s

# Run the Latin filter over every post; posts it flags come back as None.
df['Text'] = df['Text'].apply(latin_exterminator)

# Remove the rows whose Text is now missing, then keep only the last row per
# anonid (the anon-id refers to the post, not the account).
df = (
    df
    .dropna(subset=['Text'])
    .drop_duplicates(subset='anonid', keep='last')
)

In [16]:
## PAUSE: the sentence-boundary tests below use fairly sophisticated methods that
## take a while, so work on a small sample first to see if they work.
## Set SAMPLE_ROWS = None to run on the full dataset.
SAMPLE_ROWS = 2000
# .copy() detaches the sample from the full frame, so adding columns to it
# later does not write into a slice (SettingWithCopyWarning)
df = (df if SAMPLE_ROWS is None else df.head(SAMPLE_ROWS)).copy()

In [17]:
## sentence detection => still work in progress
# NOTE(review): the next cell writes the same two column names
# ('Spacy_Sentences', 'Spacy_Sentence_Lengths'), so on a top-to-bottom run the
# results of this cell are replaced — confirm which column names are intended.

# Rule-based splitter: a blank English pipeline is enough for the sentencizer.
# Newline is included in the sentence-ending characters so line breaks also
# end a sentence.
sentencizer_nlp = spacy.blank("en")
sentencizer_nlp.add_pipe("sentencizer", config={"punct_chars": [".", "!", "?", "\n"]})
# `nlp` stays bound to this pipeline, as before, for code that reads the shared name
nlp = sentencizer_nlp


def spacy_sentsplit(text):
    """Split ``text`` into sentences with the rule-based sentencizer.

    Uses the dedicated ``sentencizer_nlp`` pipeline instead of the shared
    ``nlp`` name, which is rebound to a different model later in the
    notebook; the result therefore no longer depends on which cell ran last.

    Returns a list of whitespace-stripped sentences; sentences that are
    empty after stripping are left out.
    """
    doc = sentencizer_nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]


def spacy_sentsplit_lengths(text):
    """Return the character length of every sentence ``spacy_sentsplit`` finds in ``text``."""
    return [len(sentence) for sentence in spacy_sentsplit(text)]


# Split once, then measure the stored sentences instead of running the
# pipeline a second time over every post.
df['Spacy_Sentences'] = df['Text'].apply(spacy_sentsplit)
df['Spacy_Sentence_Lengths'] = df['Spacy_Sentences'].apply(
    lambda sentences: [len(sentence) for sentence in sentences]
)

In [18]:
### spacy method:
# Sentence boundaries here come from the en_core_web_sm pipeline, loaded
# without its "ner" and "tagger" components.
parser_nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger"])
# `nlp` stays bound to this pipeline, as before, for code that reads the shared name
nlp = parser_nlp


def detect_sentences_spacy_pipe(text):
    """Split ``text`` into sentences with the en_core_web_sm pipeline.

    Uses the dedicated ``parser_nlp`` object rather than the shared ``nlp``
    name, so the result does not depend on which cell last rebound ``nlp``.

    Returns a list of whitespace-stripped sentence strings.
    """
    doc = parser_nlp(text)
    return [sent.text.strip() for sent in doc.sents]


# Function to get sentence lengths using spaCy method
def spacy_sentence_lengths(text):
    """Return the character length of every sentence ``detect_sentences_spacy_pipe`` finds in ``text``."""
    return [len(sentence) for sentence in detect_sentences_spacy_pipe(text)]


# Run the model once per post, then measure the stored sentences; applying
# spacy_sentence_lengths to Text would process every post a second time.
df['Spacy_Sentences'] = df['Text'].apply(detect_sentences_spacy_pipe)
df['Spacy_Sentence_Lengths'] = df['Spacy_Sentences'].apply(
    lambda sentences: [len(sentence) for sentence in sentences]
)



In [19]:
# Sentence detection with NLTK's Punkt
def nltk_sentsplit(text):
    """Split ``text`` with ``sent_tokenize`` and return the whitespace-stripped sentences."""
    return [sentence.strip() for sentence in sent_tokenize(text)]


# Get sentence lengths with NLTK's Punkt
def nltk_sentsplit_lengths(text):
    """Return the character length of every sentence ``nltk_sentsplit`` finds in ``text``."""
    return [len(sentence) for sentence in nltk_sentsplit(text)]


# Tokenize once, then measure the stored sentences instead of tokenizing
# every post a second time.
df['NLTK_Sentences'] = df['Text'].apply(nltk_sentsplit)
df['NLTK_Sentence_Lengths'] = df['NLTK_Sentences'].apply(
    lambda sentences: [len(sentence) for sentence in sentences]
)

In [None]:
# Initialize a sentence segmentation pipeline using a Hugging Face model
# NOTE(review): the task requested here is "sentiment-analysis" with a
# sentiment model, not a sentence segmenter. Such pipelines normally return
# dicts with 'label' and 'score' keys, so the 'sentence' lookup below is
# expected to raise KeyError — confirm, and swap in an actual segmentation
# model before relying on the Transformer_* columns.
sentence_segmenter = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

# Sentence detection using Hugging Face Transformers
def transformer_sentsplit(text):
    """Run ``sentence_segmenter`` on ``text`` and return the stripped 'sentence' field of each result."""
    segments = sentence_segmenter(text)
    # Extract sentences from transformer pipeline output
    return [segment['sentence'].strip() for segment in segments]

df['Transformer_Sentences'] = df['Text'].apply(transformer_sentsplit)

# Get sentence lengths with Hugging Face Transformers
def transformer_sentsplit_lengths(text):
    """Return the character length of each entry produced by ``transformer_sentsplit`` for ``text``."""
    # calls transformer_sentsplit again, so the model runs a second time per post
    sentences = transformer_sentsplit(text)
    return [len(sentence) for sentence in sentences]

df['Transformer_Sentence_Lengths'] = df['Text'].apply(transformer_sentsplit_lengths)


In [None]:
def you_a_mismatch(row):
    """Line up the sentences of the four splitters by position and record their lengths.

    Parameters
    ----------
    row : mapping
        Must provide 'Regex_Sentences', 'Spacy_Sentences', 'NLTK_Sentences'
        and 'Transformer_Sentences', each a list of sentence strings
        (a DataFrame row or a plain dict both work).

    Returns
    -------
    list of tuple of int
        One ``(regex, spacy, nltk, transformer)`` tuple of character lengths
        per sentence position, as many tuples as the longest of the four
        lists. A method that has no sentence at a position contributes 0.
    """
    from itertools import zip_longest  # local import keeps this cell self-contained

    per_method = (
        row['Regex_Sentences'],
        row['Spacy_Sentences'],
        row['NLTK_Sentences'],
        row['Transformer_Sentences'],
    )
    # zip_longest pads the shorter lists with "" (length 0). Plain zip() stops
    # at the shortest list, and slicing the leftovers against a single other
    # method silently dropped sentences whenever the four counts differed.
    return [
        tuple(len(sentence) for sentence in position)
        for position in zip_longest(*per_method, fillvalue="")
    ]

# Per-post list of sentence-length tuples across the four splitters
df['Sentence_Lengths_Mismatch'] = df.apply(you_a_mismatch, axis=1)

# Peek at the first rows only; as the last expression of the cell the frame
# gets the notebook's table rendering instead of a plain-text print
df[['Text', 'Sentence_Lengths_Mismatch']].head(10)


In [None]:
def count_mismatches(row):
    """Count how many sentence positions differ between the Regex and spaCy splits.

    Positions present in both lists count when the two sentences are not
    equal; every surplus sentence in the longer list counts as one more
    mismatch.
    """
    regex_sents = row['Regex_Sentences']
    spacy_sents = row['Spacy_Sentences']
    differing = [pair for pair in zip(regex_sents, spacy_sents) if pair[0] != pair[1]]
    surplus = abs(len(regex_sents) - len(spacy_sents))
    return len(differing) + surplus

# Apply the function to create a column with the total number of mismatches
# (count_mismatches compares only the Regex and spaCy sentence lists)
df['Total_Mismatches'] = df.apply(count_mismatches, axis=1)

In [None]:
# write the current state of df to disk for a manual look
df.to_csv("sentence_peek1_1112.csv")

In [None]:
# testing performance: keep only the posts with zero counted mismatches
no_mismatch = df['Total_Mismatches'].eq(0)
fdf = df.loc[no_mismatch]

In [None]:
# display the frame stored in fdf
fdf