In [2]:
import collections
from collections import Counter
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import time 
import random
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import scipy.stats as stats
from scipy.stats import entropy, spearmanr
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon, cosine
import regex as re
import spacy
import string
import os
import cProfile
import pstats
import io
import json
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Environment flag read by the Hugging Face tokenizers library; set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# spaCy English pipeline; get_pos() uses it for part-of-speech tagging.
nlp = spacy.load("en_core_web_lg")
# Prefer the MPS backend when PyTorch reports it as available, otherwise run on CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
# Integer device argument handed to the fill-mask pipeline below (0 when MPS is available, -1 otherwise).
device_id = 0 if torch.backends.mps.is_available() else -1
# First word list: the 'word' column of gecko_pelts.csv.
dm = pd.read_csv("gecko_pelts.csv", index_col = 0)
energy_drink = [x for x in dm['word']]
# Comment dump; rows with any missing value are dropped, then repeated comment texts are removed.
da = pd.read_csv("jan_mar_2023poldumps.csv", index_col = 0)
dg = da.dropna().drop_duplicates(subset="comment")
# Masked-language model used for every prediction in this notebook.
model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `model` is moved to `device` here, but the pipeline on the next line is built from
# `model_name`, so it appears to load its own copy of the weights -- confirm whether `model` is needed.
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
fill_mask_pipeline = pipeline("fill-mask", model=model_name, device=device_id)

In [4]:
# Second word list: the 'word' column of unrecognized_words_90cap.csv, as a plain Python list.
ninety = pd.read_csv("unrecognized_words_90cap.csv", index_col=0)
ninetylist = list(ninety["word"])

In [5]:
# Number of words loaded from gecko_pelts.csv.
len(energy_drink)

1077

In [6]:
# Number of words loaded from unrecognized_words_90cap.csv.
len(ninetylist)

1782

In [7]:
# Words that are in the 90-cap list but not in the gecko_pelts list, in 90-cap order.
# Membership is checked against a set so the scan is linear instead of
# len(ninetylist) * len(energy_drink) list comparisons; the result is unchanged.
_known_words = set(energy_drink)
adds = [x for x in ninetylist if x not in _known_words]

In [8]:
# Number of words that still need processing (present only in the 90-cap list).
len(adds)

727

In [9]:
def clean_prose(text):
    """Normalise one raw comment into plain prose.

    Steps, in order: drop a leading '>' marker, drop '>>12345' quote
    references, remove URLs, trim non-letter characters from the start and
    characters other than letters or '.', '!', '?' from the end, then collapse
    all whitespace (including newlines) to single spaces.

    URLs are removed *before* trimming and whitespace normalisation so that a
    removed link does not leave double spaces or dangling punctuation behind.

    Args:
        text: The raw comment. Anything that is not a ``str`` yields "".

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r"^>\s*", "", text)  # Remove leading '>'
    text = re.sub(r">>\d+\s*", "", text)  # Remove quote references
    text = re.sub(r"http[s]?://[^\s>]+|www\.[^\s>]+", "", text)  # Remove URLs
    text = re.sub(r"^[^a-zA-Z]+|[^a-zA-Z.!?]+$", "", text)  # Trim unwanted characters
    # `\s` already covers newlines, so one pass both replaces them and normalises spacing.
    text = re.sub(r"\s+", " ", text).strip()
    return text

def split_sentences(text):
    """Split text on runs of '.', '!' or '?' and return the non-empty pieces.

    Each piece is stripped of surrounding whitespace. Non-string input gives
    an empty list.
    """
    if not isinstance(text, str):
        return []

    sentences = []
    for fragment in re.split(r'[.!?]+', text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def filter_by_word(df, column, word):
    """Return the rows of ``df`` whose ``column`` contains ``word`` as a whole word.

    Matching is case-insensitive and uses ``\\b`` word boundaries around the
    escaped word.

    The pattern is passed to pandas as a *string* with ``case=False`` rather
    than as a pre-compiled object: this notebook imports the third-party
    ``regex`` module under the name ``re``, and pandas inspects patterns with
    the standard-library ``re``, which may reject a ``regex``-compiled pattern.

    Missing values never match. They are excluded explicitly because
    ``astype(str)`` would otherwise turn them into the literal text "nan".

    Args:
        df: Source DataFrame.
        column: Name of the column to search.
        word: Word to look for.

    Returns:
        The matching rows of ``df`` (original index preserved).

    Raises:
        ValueError: If ``word`` is not a string.
    """
    if not isinstance(word, str):
        raise ValueError(f"Invalid word input: expected string, got {type(word)}")

    pattern = rf'\b{re.escape(word)}\b'
    values = df[column]
    matches = values.astype(str).str.contains(pattern, case=False, regex=True, na=False)
    return df[values.notna() & matches]

def filter_strings_by_word(sentences, word):
    """Keep only the sentences that contain ``word`` as a whole word.

    The comparison ignores case and relies on ``\\b`` word boundaries.

    Raises:
        ValueError: If ``word`` is not a string.
    """
    if not isinstance(word, str):
        raise ValueError(f"Invalid word input: expected string, got {type(word)}")

    whole_word = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
    kept = []
    for sentence in sentences:
        if whole_word.search(sentence):
            kept.append(sentence)
    return kept

def is_valid_word(word):
    """Return True only when ``word`` consists entirely of ASCII letters.

    ``fullmatch`` is used because ``^...$`` with ``match`` also accepts a
    string that ends in a newline. Non-string input is reported as invalid
    instead of raising.
    """
    if not isinstance(word, str):
        return False
    return re.fullmatch(r"[a-zA-Z]+", word) is not None

def clean_dataframe(df, max_words=512, min_rows=100):
    """De-duplicate comments, drop over-long ones, and reject small frames.

    The input frame is left untouched; duplicates are removed on a new frame
    rather than in place.

    Args:
        df: DataFrame with a "comment" column.
        max_words: Maximum number of space-separated pieces a comment may have.
        min_rows: Minimum number of rows that must remain.

    Returns:
        The filtered frame with a fresh 0..n-1 index, or an empty
        ``pd.DataFrame()`` when fewer than ``min_rows`` rows remain.
    """
    deduped = df.drop_duplicates(subset="comment", keep="first")
    piece_counts = deduped["comment"].apply(lambda x: len(str(x).split(" ")))
    kept = deduped[piece_counts <= max_words]
    if len(kept) < min_rows:
        return pd.DataFrame()
    return kept.reset_index(drop=True)

def og_thedf(df, word):
    """Build a frame of cleaned comments that mention ``word``.

    Comments are cleaned with ``clean_prose`` and filtered to those containing
    ``word``. Each is split into sentences ("com_list_it1"), and the sentences
    containing the word are stored in "sent_with_word". After exploding
    "sent_with_word", rows are de-duplicated on "comment", so each comment
    keeps only its first matching sentence.

    The caller's frame is not modified: cleaning happens on a new frame and
    the new columns are added to an explicit copy of the filtered rows.

    Args:
        df: DataFrame with a "comment" column.
        word: Target word.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    cleaned = df.assign(comment=df["comment"].apply(clean_prose))
    matched = filter_by_word(cleaned, "comment", word).copy()
    matched["com_list_it1"] = matched["comment"].apply(split_sentences)
    matched["sent_with_word"] = matched["com_list_it1"].apply(
        lambda sentences: filter_strings_by_word(sentences, word)
    )
    exploded = matched.explode("sent_with_word")
    return exploded.drop_duplicates(subset=["comment"]).reset_index(drop=True)

def getopdf(df, word):
    """Run ``og_thedf`` and keep only the rows whose "op" column equals 1."""
    word_frame = og_thedf(df, word)
    is_op = word_frame["op"] == 1
    return word_frame[is_op]

def group_threads_by_timestamp(df, thread_col="thread_num", num_col="num", comment_col="comment", timestamp_col="timestamp"):
    """Group posts by thread and order each thread's posts by timestamp.

    Returns:
        A ``defaultdict(list)`` mapping each thread id to a list of
        ``{num_col: ..., comment_col: ...}`` records sorted by
        ``timestamp_col``.
    """
    record_cols = [num_col, comment_col]
    grouped = defaultdict(list)
    for thread_id, rows in df.groupby(thread_col):
        ordered = rows.sort_values(by=timestamp_col)
        grouped[thread_id] = ordered[record_cols].to_dict(orient='records')
    return grouped

def df_to_thread_dict(df):
    """Map each "thread_num" to its posts, ordered by "timestamp".

    Every post is a ``{"num": ..., "comment": ...}`` record. The result is a
    plain ``dict``.
    """
    threads = {}
    for thread_id, rows in df.groupby("thread_num"):
        ordered = rows.sort_values(by="timestamp")
        threads[thread_id] = ordered[["num", "comment"]].to_dict(orient='records')
    return threads

def precompute_thread_dict(df):
    """Build ``{thread_num: [post records]}`` with posts sorted by "timestamp".

    Each record holds the post's "num" and "comment".
    """
    return {
        thread_id: rows.sort_values(by="timestamp")[["num", "comment"]].to_dict(orient='records')
        for thread_id, rows in df.groupby("thread_num")
    }

def pretokenize_comments(thread_dict):
    """Attach a lower-cased, whitespace-split token set to every post.

    The posts are modified in place (a "tokenized_comment" key is added) and
    the same ``thread_dict`` object is returned.
    """
    for posts in thread_dict.values():
        for post in posts:
            lowered = post['comment'].lower()
            post['tokenized_comment'] = set(lowered.split())
    return thread_dict

# Turn the de-duplicated comment frame into {thread_num: [posts sorted by timestamp]},
# then add a lower-cased token set to every post.
karina = pretokenize_comments(precompute_thread_dict(dg))

In [10]:
def filter_and_extract_word_stats(thread_dict, target_word):
    """Find, per thread, the first post that mentions ``target_word``.

    Matching is case-insensitive on whole words. Posts are scanned in the
    order they appear in each thread's list, and scanning stops at the first
    hit.

    Args:
        thread_dict: Mapping of thread id to a list of post dicts, each with
            "comment" and "num" keys.
        target_word: Word to look for.

    Returns:
        A DataFrame with columns "thread_id", "first_appearance", "first_id"
        and "length" (number of posts in the thread), one row per thread that
        mentions the word. The columns are present even when no thread
        matches, so downstream column access does not fail on an empty result.

    Raises:
        ValueError: If ``target_word`` is not a string.
    """
    if not isinstance(target_word, str):
        raise ValueError(f"Invalid target_word input: expected string, got {type(target_word)}")

    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    columns = ['thread_id', 'first_appearance', 'first_id', 'length']
    data = []

    for thread_id, comments in thread_dict.items():
        # Only the first matching post is needed, so stop at the first hit.
        first_comment = next((c for c in comments if word_pattern.search(c['comment'])), None)
        if first_comment is None:
            continue
        data.append({
            'thread_id': thread_id,
            'first_appearance': first_comment['comment'],
            'first_id': first_comment['num'],
            'length': len(comments)  # Total number of comments in thread
        })

    return pd.DataFrame(data, columns=columns)

def mask_df(a_df, a_word, tokenizer=tokenizer):
    """Replace ``a_word`` with the tokenizer's mask token in "first_appearance".

    Rows whose text does not contain the mask token afterwards are dropped,
    and texts longer than the token limit are shortened to a window around
    the first mask.

    Changes from the earlier version:
    * Trimming now centres on the *mask token*. The text has already been
      masked when it is trimmed, so searching for the original word never
      matched and the cut could remove the mask.
    * The window is shrunk until the tokenized length is within the limit.
      The limit is in tokens, but the cut is made on whitespace-separated
      words.
    * The mask token is matched literally (``regex=False``), so tokens such
      as "[MASK]" are not read as a regular expression.
    * The caller's frame is no longer modified.

    Args:
        a_df: DataFrame with a "first_appearance" column.
        a_word: Word to mask (whole-word, case-insensitive).
        tokenizer: Tokenizer providing ``mask_token`` and ``tokenize``.

    Returns:
        A new DataFrame containing only rows with a masked "first_appearance".

    Raises:
        ValueError: If ``a_word`` is not a string.
    """
    if not isinstance(a_word, str):
        raise ValueError(f"Invalid a_word input: expected string, got {type(a_word)}")

    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(a_word)}\b', re.IGNORECASE)

    def masker(text):
        return word_pattern.sub(mask_token, text) if isinstance(text, str) else text

    def trim_long_text(text, limit=450):
        if not isinstance(text, str):
            return text
        if len(tokenizer.tokenize(text)) <= limit:
            return text

        words = text.split()
        mask_positions = [i for i, w in enumerate(words) if mask_token in w]
        if not mask_positions:
            return " ".join(words[:limit])

        target_index = mask_positions[0]
        half_window = limit // 2
        # Halve the word window until the tokenized length fits the limit.
        while half_window > 0:
            left = max(0, target_index - half_window)
            right = min(len(words), target_index + half_window)
            candidate = " ".join(words[left:right])
            if len(tokenizer.tokenize(candidate)) <= limit:
                return candidate
            half_window //= 2
        return words[target_index]

    # Apply Masking and Filtering
    masked = a_df.copy()
    masked["first_appearance"] = masked["first_appearance"].apply(masker)
    has_mask = masked["first_appearance"].str.contains(mask_token, regex=False, na=False)
    masked = masked[has_mask].copy()
    masked["first_appearance"] = masked["first_appearance"].apply(trim_long_text)

    return masked


def get_random_samples(df, trial_num, sample_size=20):
    """Draw ``trial_num`` reproducible random samples from ``df``.

    Trial ``i`` (1-based) uses ``random_state=i``. Each sample has
    ``sample_size`` rows, or every row of ``df`` when it has fewer than that.
    Previously a frame with fewer than 20 rows raised ``ValueError`` from
    ``DataFrame.sample``.

    Args:
        df: Frame to sample from.
        trial_num: Number of samples to draw.
        sample_size: Rows per sample (default 20, the previous fixed value).

    Returns:
        A list of ``trial_num`` DataFrames.
    """
    rows_per_sample = min(sample_size, len(df))
    return [
        df.sample(n=rows_per_sample, random_state=seed)
        for seed in range(1, trial_num + 1)
    ]

In [11]:
def accumulate_meaning_unrecognized(df, mask_column, target_word, fill_mask_pipeline=fill_mask_pipeline, tokenizer=tokenizer, top_k=10):
    """Run fill-mask on every sentence and return one distribution per sentence.

    Each sentence is lower-cased (existing mask tokens are left untouched) and
    any remaining whole-word occurrence of ``target_word`` is replaced by the
    mask token. Only sentences with exactly one mask are scored. For each, the
    pipeline's candidates are stripped, candidates that are empty or pure
    punctuation are discarded, and the remaining scores are renormalised to
    sum to 1.

    Changes from the earlier version:
    * ``top_k`` is now passed to the pipeline; it used to be ignored, so the
      pipeline's own default applied. NOTE(review): with the default of 10
      this changes the number of candidates per sentence -- pass the old
      value explicitly to reproduce earlier outputs.
    * The tokenized batch that was built and moved to the device but never
      used has been removed.
    * The length check leaves room for the special tokens the model adds.
    * Sentences without a mask are skipped instead of reaching the pipeline.
      Sentences with several masks are skipped up front; they used to be
      discarded after inference.
    * A single-sentence batch, for which the pipeline returns a flat list of
      candidates, is handled instead of being dropped.
    * Candidates are stripped *before* the punctuation test, so " ." is
      treated as punctuation.
    * Missing values are dropped and ``df`` is no longer modified.

    Args:
        df: DataFrame holding the sentences.
        mask_column: Name of the column with the (masked) sentences.
        target_word: Word whose remaining occurrences are masked.
        fill_mask_pipeline: Callable fill-mask pipeline.
        tokenizer: Tokenizer matching the pipeline's model.
        top_k: Number of candidates requested per mask.

    Returns:
        A list of ``{candidate: probability}`` dicts, one per scored sentence.
    """
    predictions_list = []
    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    punctuation_set = set(string.punctuation)

    sentences = df[mask_column].dropna().astype(str).str.strip().tolist()

    def prepare(sentence):
        # Lower-case and mask around existing mask tokens so they stay intact.
        pieces = sentence.split(mask_token)
        return mask_token.join(word_pattern.sub(mask_token, piece.lower()) for piece in pieces)

    processed_sentences = [prepare(sentence) for sentence in sentences if sentence]

    # The model's 512-position budget includes the special tokens added around each input.
    max_plain_tokens = 512 - tokenizer.num_special_tokens_to_add()

    filtered_sentences = []
    for sentence in processed_sentences:
        if sentence.count(mask_token) != 1:
            continue
        if len(tokenizer.tokenize(sentence)) > max_plain_tokens:
            print(f"Skipping sentence (exceeds 512 tokens): {sentence[:100]}...")
            continue
        filtered_sentences.append(sentence)

    if not filtered_sentences:
        return predictions_list  # Return empty if no valid sentences remain

    with torch.no_grad():
        predictions_batch = fill_mask_pipeline(filtered_sentences, top_k=top_k)

    # For a one-element input list the pipeline returns that sentence's
    # candidates directly; wrap them so the loop below sees a batch.
    if predictions_batch and isinstance(predictions_batch[0], dict):
        predictions_batch = [predictions_batch]

    for predictions in predictions_batch:
        if not isinstance(predictions, list) or not predictions:
            continue
        if not isinstance(predictions[0], dict) or 'token_str' not in predictions[0]:
            continue

        candidate_scores = {}
        for p in predictions:
            if 'token_str' not in p:
                continue
            token = p['token_str'].strip()
            if not token or all(char in punctuation_set for char in token):
                continue
            candidate_scores[token] = p['score']

        if not candidate_scores:
            continue

        # Same result as the former log / exp softmax: score / sum(scores).
        scores = np.array(list(candidate_scores.values()), dtype=float)
        total = scores.sum()
        if total <= 0:
            continue

        predictions_list.append(dict(zip(candidate_scores.keys(), scores / total)))

    return predictions_list

In [12]:
def get_pos(word):
    """Return the spaCy part-of-speech tag of the first token of ``word``.

    Falls back to "UNKNOWN" when the parsed doc is empty or the first token
    has no tag.
    """
    parsed = nlp(word)
    if parsed and parsed[0].pos_:
        return parsed[0].pos_
    return "UNKNOWN"

def get_first_guess(dict_list):
    """Return the word that is most often the top-scoring key across the dicts.

    Empty dicts are ignored. Returns None when no dict has any entry.
    """
    top_choices = [max(dist, key=dist.get) for dist in dict_list if dist]
    if not top_choices:
        return None
    return Counter(top_choices).most_common(1)[0][0]

def cosine_similarity(p, q):
    """Cosine similarity between two ``{key: value}`` dicts.

    Both dicts are expanded over the union of their keys (missing keys count
    as 0). Returns 0.0 when either vector is all zeros.

    Note: this definition replaces the ``cosine_similarity`` name imported
    from scikit-learn at the top of the notebook.
    """
    vocabulary = set(p.keys()).union(set(q.keys()))
    p_vec = np.array([p.get(key, 0) for key in vocabulary])
    q_vec = np.array([q.get(key, 0) for key in vocabulary])

    # An all-zero vector has no direction; avoid dividing by a zero norm.
    if not p_vec.any() or not q_vec.any():
        return 0.0
    return 1 - cosine(p_vec, q_vec)

def wasserstein_similarity(p, q):
    """Wasserstein distance between the value vectors of two dicts.

    Both dicts are expanded over the union of their keys (missing keys count
    as 0) and the two vectors are passed to SciPy's ``wasserstein_distance``
    as its two sets of sample values. Despite the name, the result is a
    distance: 0 means identical.

    NOTE(review): because the vectors are treated as samples, the result does
    not depend on which key carries which value -- confirm this is intended.
    """
    vocabulary = list(set(p.keys()).union(set(q.keys())))
    p_values = np.array([p.get(key, 0) for key in vocabulary])
    q_values = np.array([q.get(key, 0) for key in vocabulary])
    return wasserstein_distance(p_values, q_values)

def kl_divergence(p, q):
    """Kullback-Leibler divergence KL(p || q) between two ``{key: prob}`` dicts.

    Both dicts are first aligned on the union of their keys (missing keys
    count as 0), so position i refers to the same key in both vectors.
    Previously the raw ``values()`` were compared position by position, which
    mixed up keys and broke when the dicts had different sizes.

    A small constant (1e-10) is added to every entry to avoid zeros;
    ``stats.entropy`` then normalises each vector.
    """
    keys = list(set(p.keys()).union(set(q.keys())))
    p_vec = np.array([p.get(k, 0.0) for k in keys], dtype=float) + 1e-10
    q_vec = np.array([q.get(k, 0.0) for k in keys], dtype=float) + 1e-10
    return stats.entropy(p_vec, q_vec)

def js_divergence(p, q):
    """Jensen-Shannon divergence between two ``{key: prob}`` dicts.

    Both dicts are aligned on the union of their keys (missing keys count as
    0) before comparison. Previously the raw ``values()`` were compared
    position by position, which mixed up keys and broke when the dicts had
    different sizes.

    A small constant (1e-10) is added to every entry and both vectors are
    normalised to sum to 1 before the mixture ``m`` is formed.
    """
    keys = list(set(p.keys()).union(set(q.keys())))
    p_vec = np.array([p.get(k, 0.0) for k in keys], dtype=float) + 1e-10
    q_vec = np.array([q.get(k, 0.0) for k in keys], dtype=float) + 1e-10
    p_vec = p_vec / p_vec.sum()
    q_vec = q_vec / q_vec.sum()
    m = 0.5 * (p_vec + q_vec)
    return 0.5 * (stats.entropy(p_vec, m) + stats.entropy(q_vec, m))

def entropy(p):
    """Shannon entropy (natural log) of the values of a ``{key: prob}`` dict.

    A small constant (1e-10) is added to every value before ``stats.entropy``
    normalises and scores them.

    Note: this definition replaces the ``entropy`` name imported from
    ``scipy.stats`` at the top of the notebook.
    """
    weights = np.array(list(p.values()))
    return stats.entropy(weights + 1e-10)

def rank_stability(dict_list):
    """Mean total rank shift between consecutive distributions.

    Every dict is turned into a ranking (0 = highest value). For each
    consecutive pair, the absolute rank differences are summed over the keys
    of the later dict. A key missing from the earlier dict is given the rank
    ``len(later dict)``. The result is the mean of those sums.
    """
    rankings = []
    for dist in dict_list:
        ordered = sorted(dist.items(), key=lambda item: -item[1])
        rankings.append({token: position for position, (token, _) in enumerate(ordered)})

    shifts = []
    for previous, current in zip(rankings, rankings[1:]):
        missing_rank = len(current)
        shifts.append(
            sum(abs(rank - previous.get(token, missing_rank)) for token, rank in current.items())
        )
    return np.mean(shifts)

def jaccard_similarity(p, q, k=5):
    """Jaccard overlap between the ``k`` highest-valued keys of two dicts.

    Returns ``|top_p & top_q| / |top_p | top_q|``.
    """
    def top_keys(dist):
        ranked = sorted(dist, key=dist.get, reverse=True)
        return set(ranked[:k])

    best_p = top_keys(p)
    best_q = top_keys(q)
    shared = best_p.intersection(best_q)
    combined = best_p.union(best_q)
    return len(shared) / len(combined)

def most_common_pos(dict_list):
    """Return the most frequent part-of-speech tag among all predicted words.

    Every key of every dict counts once. Occurrences are tallied per word
    first, so ``get_pos`` runs once per distinct word instead of once per
    occurrence; the resulting tag counts are the same.

    Returns:
        The most common tag, or None when there are no words at all
        (previously an ``IndexError``).
    """
    word_counts = Counter()
    for d in dict_list:
        word_counts.update(d.keys())

    pos_counts = Counter()
    for word, count in word_counts.items():
        pos_counts[get_pos(word)] += count

    if not pos_counts:
        return None
    return pos_counts.most_common(1)[0][0]

def compute_metrics(dict_list):
    """Summarise a sequence of prediction distributions.

    Pairwise metrics compare each distribution with the one before it in
    ``dict_list`` and are averaged over all consecutive pairs.

    Changes from the earlier version:
    * 'Entropy' is now the mean over *all* distributions. It used to start at
      index 1 and silently leave out the first one.
    * The entropy of each distribution is computed once and reused.

    Args:
        dict_list: List of ``{word: probability}`` dicts.

    Returns:
        A dict with keys 'Cosine_Similarity', 'Wasserstein_Distance',
        'Entropy', 'Entropy_Diff', 'Rank_Stability', 'Temporal_Jaccard',
        'Most_Common_POS' and 'First_Guess', or an empty dict when fewer than
        two distributions are given.
    """
    if not dict_list or len(dict_list) < 2:
        return {}

    # (current, previous) for every consecutive pair.
    pairs = list(zip(dict_list[1:], dict_list[:-1]))
    entropies = [entropy(d) for d in dict_list]

    metrics = {
        'Cosine_Similarity': np.mean([cosine_similarity(cur, prev) for cur, prev in pairs]),
        'Wasserstein_Distance': np.mean([wasserstein_similarity(cur, prev) for cur, prev in pairs]),
        'Entropy': np.mean(entropies),
        'Entropy_Diff': np.mean([cur - prev for cur, prev in zip(entropies[1:], entropies[:-1])]),
        'Rank_Stability': rank_stability(dict_list),
        'Temporal_Jaccard': np.mean([jaccard_similarity(cur, prev) for cur, prev in pairs]),
        'Most_Common_POS': most_common_pos(dict_list),
        'First_Guess': get_first_guess(dict_list)
    }
    return metrics

In [13]:
def aggregate_metrics(trial_metrics_list):
    """Combine per-trial metric dicts into summary statistics.

    Empty trials are ignored. For each numerical metric present in at least
    one trial, the result holds 'Mean', 'STD' (sample standard deviation,
    ddof=1) and 'CI_95' (mean +/- 1.96 * STD / sqrt(n)). With a single value
    the sample standard deviation is undefined, so 'STD' and both CI bounds
    are NaN; this is now set explicitly instead of triggering a NumPy warning.

    The most frequent 'Most_Common_POS' and 'First_Guess' across trials are
    stored under 'Most_Common_POS' and 'First_Guess_Guess'.

    NOTE(review): 'First_Guess_Guess' looks like a typo, but it determines an
    output column name downstream, so it is kept as is.

    Args:
        trial_metrics_list: List of dicts as returned by ``compute_metrics``.

    Returns:
        The aggregated dict, or an empty dict when every trial is empty.
    """
    valid_trials = [trial for trial in trial_metrics_list if trial]

    if not valid_trials:  # If all trials were empty, return an empty dict
        print("Warning: No valid trials found for aggregation.")
        return {}

    aggregated = {}
    numerical_keys = ['Cosine_Similarity', 'Wasserstein_Distance', 'Entropy', 'Entropy_Diff', 'Rank_Stability', 'Temporal_Jaccard']

    for key in numerical_keys:
        values = [trial[key] for trial in valid_trials if key in trial]
        if not values:
            continue

        mean = np.mean(values)
        # A sample standard deviation needs at least two observations.
        std = np.std(values, ddof=1) if len(values) > 1 else np.float64("nan")
        half_width = 1.96 * (std / np.sqrt(len(values)))
        aggregated[key] = {
            'Mean': mean,
            'STD': std,
            'CI_95': (mean - half_width, mean + half_width)
        }

    pos_counts = Counter(trial['Most_Common_POS'] for trial in valid_trials if 'Most_Common_POS' in trial)
    aggregated['Most_Common_POS'] = pos_counts.most_common(1)[0][0] if pos_counts else None

    first_guess_counts = Counter(trial['First_Guess'] for trial in valid_trials if 'First_Guess' in trial)
    aggregated['First_Guess_Guess'] = first_guess_counts.most_common(1)[0][0] if first_guess_counts else None

    return aggregated

In [14]:
def process_word_trials(words, dataframe, n):
    """Compute aggregated first-appearance metrics for every word.

    For each word: find the first post per thread that mentions it, mask the
    word, draw ``n`` random samples, run the fill-mask model on each sample,
    compute per-sample metrics and aggregate them into one result row.

    Words with no matching thread, or with no maskable text, are now skipped
    up front. Previously such a word raised inside ``mask_df`` and aborted the
    whole batch.

    Args:
        words: Sequence of words to process.
        dataframe: Despite the name, this is the thread dictionary
            (``{thread_id: [post dicts]}``) that
            ``filter_and_extract_word_stats`` expects.
        n: Number of random-sample trials per word.

    Returns:
        A DataFrame with one row per processed word: 'Word' plus, per metric,
        'First_<metric>_Mean' / '_STD' / '_CI' for numerical metrics and
        'First_<metric>' for the others.
    """
    results = []
    total_words = len(words)
    start_time = time.time()

    for idx, word in enumerate(words, 1):
        word_start_time = time.time()
        print(f"Processing word {idx}/{total_words}: {word}")

        word_threads = filter_and_extract_word_stats(dataframe, word)
        if word_threads.empty:
            print(f"Skipping word '{word}' due to insufficient data.")
            continue

        masked_threads = mask_df(word_threads, word)
        if masked_threads.empty:
            print(f"Skipping word '{word}' due to insufficient data.")
            continue

        trial_frames = get_random_samples(masked_threads, n)

        first_appearance_dict_list = [
            accumulate_meaning_unrecognized(trial_df, 'first_appearance', word)
            for trial_df in trial_frames
        ]

        # Drop word if there's not enough data
        if not any(first_appearance_dict_list) or all(len(trial) < 2 for trial in first_appearance_dict_list):
            print(f"Skipping word '{word}' due to insufficient data.")
            continue  # skip

        first_metrics = [compute_metrics(trial) for trial in first_appearance_dict_list if trial]
        first_aggregated = aggregate_metrics(first_metrics)

        row = {'Word': word}
        for metric, values in first_aggregated.items():
            if isinstance(values, dict):
                row.update({f'First_{metric}_Mean': values['Mean'],
                            f'First_{metric}_STD': values['STD'],
                            f'First_{metric}_CI': values['CI_95']})
            else:
                row[f'First_{metric}'] = values

        results.append(row)
        elapsed_time = time.time() - word_start_time
        # Average time per word so far, projected over the words still to do.
        remaining_time = (time.time() - start_time) / idx * (total_words - idx)
        print(f"Completed {word} in {elapsed_time:.2f} seconds. Estimated time remaining: {remaining_time:.2f} seconds.")

    return pd.DataFrame(results)


In [15]:
# Only the last expression of a cell is displayed, so this length is not shown here.
len(energy_drink) #1077
# Split the words still to process into consecutive chunks of at most 100 words.
sublists = [adds[i:i + 100] for i in range(0, len(adds), 100)]

In [16]:
def process_and_save_sublists(the_sublists, dataframe, n_trials, folder_name="first_round"):
    """Process each sublist of words and write one CSV per sublist.

    Files are written to ``folder_name`` (created if missing) as
    ``first_roundv2_sublist_<idx>.csv``, numbered from 1.

    The progress message now reports the real number of sublists; it used to
    print a fixed "/10".

    Args:
        the_sublists: Iterable of word lists.
        dataframe: Thread dictionary passed through to ``process_word_trials``.
        n_trials: Number of random-sample trials per word.
        folder_name: Output directory.
    """
    os.makedirs(folder_name, exist_ok=True)  # Create the folder if it doesn't exist

    the_sublists = list(the_sublists)  # allow any iterable and make len() available
    total_sublists = len(the_sublists)

    for idx, sublist in enumerate(the_sublists, 1):
        print(f"Processing sublist {idx}/{total_sublists}...")
        df = process_word_trials(sublist, dataframe, n_trials)
        file_path = os.path.join(folder_name, f"first_roundv2_sublist_{idx}.csv")
        df.to_csv(file_path, index=False)
        print(f"Saved {file_path}")

    print("All sublists processed and saved.")

In [None]:
# Full run: 5 random-sample trials per word, one CSV per sublist in the default "first_round" folder.
process_and_save_sublists(sublists, karina, 5)

In [None]:
# Run log
# Started Feb 18 at 10:28 PM:
#process_and_save_sublists(sublists, karina, 5)
# Error thrown at 11:18 PM on sublist 2: NaN value in compute_metrics; changed it to return an empty dict instead of None.
# Error thrown at 11:31 PM on sublist 2.
# Error thrown at 11:41 PM on sublist 2.
#sublists2 = sublists[1:]
#process_and_save_sublists(sublists2, karina, 5)
# Rerun on Feb 19 at 10:11 AM to recapture 'first_guess'.
# Added the 'entropy' metric alongside 'entropy_diff'.
# Feb 19, 6 PM: testing how the results change when the cap is 90 threads instead of 176.