In [None]:
import collections
from collections import Counter
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import time 
from functools import lru_cache
import random
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import scipy.stats as stats
from scipy.stats import entropy, spearmanr
from scipy.stats import wasserstein_distance
from scipy.stats import entropy as scipy_entropy
from scipy.stats import entropy, kurtosis, skew
from scipy.spatial.distance import jensenshannon, cosine
import regex as re
import spacy
import string
import os
import cProfile
import pstats
import io
import json

In [None]:
# If you end up running this yourself, let me know your machine specs and I can help you figure it out. I'm actually not sure
# how much slower this would be on a CPU. I can also ask Jen about giving you GCP access (I haven't gotten around to figuring that out myself).
# Silence pandas' chained-assignment warning (mask_df assigns into a filtered frame).
pd.options.mode.chained_assignment = None
os.environ["TOKENIZERS_PARALLELISM"] = "true"
nlp = spacy.load("en_core_web_lg")

# Pick the best available accelerator: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Integer form of the device choice (0 = accelerator, -1 = CPU), kept for any
# code that still expects the pipeline-style device index.
device_id = -1 if device.type == "cpu" else 0

model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
# Hand the already-loaded model and tokenizer to the pipeline instead of the
# model name, so the weights are not loaded a second time, and pass the
# torch.device itself so the pipeline lands on the same device as `model`.
fill_mask_pipeline = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=device)

In [None]:
# Load the posts; 'timestamp' is the parsed form of 'converted_timestamp' and is
# what the thread-building helpers sort on.
du = pd.read_csv("pol_jan_mar_2023.csv")  # input the correct dataframe here
du['timestamp'] = du['converted_timestamp'].apply(pd.to_datetime)  # i forgot why i needed to do this might not need
print(len(du))

# also load in the correct files (one word per line)
with open("cut_recognized_words.txt", "r", encoding="utf-8") as file:
    recognized_words_list = [line.strip() for line in file]

print(len(recognized_words_list))

with open("cut_unrecognized_words.txt", "r", encoding="utf-8") as file:
    unrecognized_words_list = [line.strip() for line in file]

print(len(unrecognized_words_list))

# Split the word lists into fixed-size chunks so each chunk is saved to its own file.
rec_sublist = [recognized_words_list[i:i + 500] for i in range(0, len(recognized_words_list), 500)]  # just so we dont have a million outputs
unrec_sublist = [unrecognized_words_list[i:i + 50] for i in range(0, len(unrecognized_words_list), 50)]

# Keep only the spaCy stop words that spaCy itself tags with a function-word POS.
# (The earlier `sorted(nlp.Defaults.stop_words)` assignment was dead code: it was
# overwritten on the very next line.)
function_words_spacy = set()
for word in nlp.Defaults.stop_words:  # Use spaCy's stop words
    token = nlp(word)[0]  # Process the word
    if token.pos_ in {"PRON", "DET", "ADP", "CCONJ", "SCONJ", "AUX", "PART", "ADV", "INTJ"}:
        function_words_spacy.add(word)

function_words_spacy = sorted(function_words_spacy)

In [None]:
# make a thread dictionary
def df_to_thread_dict(df):
    """Group posts by thread and list each thread's posts in timestamp order.

    Args:
        df: DataFrame with 'thread_num', 'timestamp', 'num' and 'comment' columns.

    Returns:
        dict mapping each thread_num to a list of {'num', 'comment'} records
        sorted by 'timestamp'.
    """
    columns = ["num", "comment"]
    # A stable sort keeps posts that share a timestamp in their original row
    # order; the default algorithm gives no such guarantee, which would make
    # the "first" matching post of a thread ambiguous.
    return {
        thread: thread_df.sort_values(by="timestamp", kind="mergesort")[columns].to_dict(orient='records')
        for thread, thread_df in df.groupby("thread_num")
    }

def precompute_thread_dict(df):
    """Build a {thread_num: [post records]} mapping with posts in timestamp order.

    Args:
        df: DataFrame with 'thread_num', 'timestamp', 'num' and 'comment' columns.

    Returns:
        dict mapping each thread_num to a list of {'num', 'comment'} dicts
        sorted by 'timestamp'.
    """
    thread_dict = {}
    for thread, thread_df in df.groupby("thread_num"):
        # Stable sort: posts with identical timestamps keep their original row
        # order, so which post counts as "first" is deterministic.
        ordered = thread_df.sort_values(by="timestamp", kind="mergesort")
        thread_dict[thread] = ordered[["num", "comment"]].to_dict(orient='records')
    return thread_dict

def pretokenize_comments(thread_dict):
    """Attach a set of lower-cased, whitespace-split tokens to every post.

    The posts are modified in place: each post dict gains a
    'tokenized_comment' key.

    Args:
        thread_dict: dict mapping thread ids to lists of post dicts that have
            a 'comment' entry.

    Returns:
        The same `thread_dict` object, for chaining.
    """
    for posts in thread_dict.values():
        for post in posts:
            comment = post['comment']
            # A missing comment (e.g. NaN from an empty CSV cell) is not a
            # string; give it an empty token set instead of crashing on .lower().
            if isinstance(comment, str):
                post['tokenized_comment'] = set(comment.lower().split())  # Store pre-tokenized words
            else:
                post['tokenized_comment'] = set()
    return thread_dict

# Build the per-thread post lists from the loaded frame `du`, then attach a
# token set to every post. `karina` is the thread dictionary passed to the run cell.
karina = pretokenize_comments(precompute_thread_dict(du))

In [None]:
# Could also look into going beyond first appearances (I only chose first appearances for uniformity, but we might expect variation, so it could be interesting).
def filter_and_extract_word_stats(thread_dict, target_word):
    """Find, for each thread, the first post that contains `target_word`.

    Matching is case-insensitive and anchored on word boundaries. Posts are
    scanned in the order they appear in each thread's list.

    Args:
        thread_dict: dict mapping thread ids to lists of post dicts with
            'comment' and 'num' entries.
        target_word: Word to look for.

    Returns:
        DataFrame with one row per thread that contains the word, with columns
        'thread_id', 'first_appearance' (the matching comment text),
        'first_id' (that post's 'num') and 'length' (number of posts in the
        thread). The frame has no columns when no thread matches.

    Raises:
        ValueError: If `target_word` is not a string.
    """
    if not isinstance(target_word, str):
        raise ValueError(f"Invalid target_word input: expected string, got {type(target_word)}")
    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    data = []
    for thread_id, comments in thread_dict.items():
        # Stop at the first match instead of collecting every match, and skip
        # non-string comments (e.g. NaN), which the regex cannot search.
        first_comment = next(
            (
                c for c in comments
                if isinstance(c['comment'], str) and word_pattern.search(c['comment'])
            ),
            None,
        )
        if first_comment is not None:
            data.append({
                'thread_id': thread_id,
                'first_appearance': first_comment['comment'],
                'first_id': first_comment['num'],
                'length': len(comments)})
    return pd.DataFrame(data)

def mask_df(a_df, a_word, tokenizer = tokenizer):
    """Replace `a_word` in the 'first_appearance' column with the tokenizer's mask token.

    Rows whose text contains no mask token after substitution are dropped.
    Texts that tokenize to more than the limit are cut down to a window of
    words around the first mask token, so the masked position is kept.

    Args:
        a_df: DataFrame expected to have a 'first_appearance' text column.
        a_word: Word to mask (case-insensitive, matched on word boundaries).
        tokenizer: Object providing `mask_token` and `tokenize(text)`.

    Returns:
        A new DataFrame with masked (and possibly trimmed) text, or an empty
        DataFrame when the column is missing or no row contains the word.

    Raises:
        ValueError: If `a_word` is not a string.
    """
    if not isinstance(a_word, str):
        raise ValueError(f"Invalid a_word input: expected string, got {type(a_word)}")
    # Ensure 'first_appearance' exists before processing
    if "first_appearance" not in a_df.columns:
        print(f"Skipping word '{a_word}': 'first_appearance' column not found.")
        return pd.DataFrame()  # Return an empty DataFrame to indicate skipping
    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(a_word)}\b', re.IGNORECASE)

    def masker(text):
        # A callable replacement inserts the mask token literally, without
        # interpreting backslashes or group references in it.
        return word_pattern.sub(lambda _match: mask_token, text) if isinstance(text, str) else text

    def trim_long_text(text, limit=450):
        """Shorten `text` to at most `limit` tokens, centred on the first mask token."""
        if not isinstance(text, str):
            return text
        if len(tokenizer.tokenize(text)) <= limit:
            return text
        words = text.split()
        # The target word has already been replaced at this point, so the
        # window must be centred on the mask token, not on the original word.
        mask_positions = [i for i, w in enumerate(words) if mask_token in w]
        if not mask_positions:
            return " ".join(words[:limit])
        center = mask_positions[0]
        radius = limit // 2
        while True:
            window = " ".join(words[max(0, center - radius):center + radius + 1])
            # One word can be several tokens, so re-measure the window and keep
            # shrinking it until it fits (or only the masked word is left).
            if radius == 0 or len(tokenizer.tokenize(window)) <= limit:
                return window
            radius = radius * 3 // 4

    # Convert column to string before applying transformations
    a_df = a_df.copy()
    a_df["first_appearance"] = a_df["first_appearance"].astype(str)

    # Apply transformations
    a_df["first_appearance"] = a_df["first_appearance"].apply(masker)
    # regex=False: the mask token is a literal, and some tokenizers use tokens
    # with regex metacharacters in them (e.g. "[MASK]").
    has_mask = a_df["first_appearance"].str.contains(mask_token, na=False, regex=False)
    a_df = a_df[has_mask].copy()
    if a_df.empty:
        print(f"Skipping word '{a_word}': No instances found after masking.")
        return pd.DataFrame()  # Return empty DataFrame if nothing remains
    a_df["first_appearance"] = a_df["first_appearance"].apply(trim_long_text)
    return a_df

def get_random_samples(df, trial_num, sample_size=12):
    """Draw `trial_num` seeded random samples from `df` and stack them.

    Trial i (1-based) samples `sample_size` rows without replacement using
    random_state=i, and every sampled row is labelled with its trial number
    in a 'Trial' column.

    Args:
        df: DataFrame to sample from.
        trial_num: Number of trials to draw.
        sample_size: Rows per trial.

    Returns:
        One DataFrame holding all trials, re-indexed from 0.
    """
    trial_frames = [
        df.sample(n=sample_size, random_state=trial).assign(Trial=trial)
        for trial in range(1, trial_num + 1)
    ]
    return pd.concat(trial_frames, ignore_index=True)

In [None]:
def accumulate_meaning_unrecognized(
    df, mask_column, target_word, fill_mask_pipeline=fill_mask_pipeline,
    tokenizer=tokenizer, top_k=5, prob_threshold=0.8
):
    """Accumulate fill-mask predictions for the masked word, one distribution per trial.

    For each trial, every sentence is lower-cased, any remaining occurrence of
    `target_word` is replaced by the mask token, and the fill-mask pipeline is
    queried. Each sentence's kept candidates are normalised to sum to 1 and
    added to a running per-trial total; words seen earlier in the trial but
    absent from the current sentence receive a tiny floor probability. The
    per-trial totals are normalised to sum to 1 at the end.

    Args:
        df: DataFrame with the text column `mask_column` and a 'Trial' column.
            It is not modified.
        mask_column: Name of the column holding the masked sentences.
        target_word: Word being masked.
        fill_mask_pipeline: Callable returning a list of dicts with
            'token_str' and 'score' for a sentence.
        tokenizer: Object providing `mask_token`.
        top_k: Number of best candidates always kept per sentence.
        prob_threshold: Candidates ranked after the first `top_k` are also kept
            when their probability is at least this fraction of the best
            candidate's.

    Returns:
        List of {word: probability} dicts, one per trial, in the order the
        trial ids first appear in `df`.

    Raises:
        RuntimeError: Re-raised from the pipeline, except for tensor size
            mismatches, which only cause that sentence to be skipped.
    """
    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    punctuation_set = set(string.punctuation)
    missing_prob = 1e-8  # floor added for a previously seen word a sentence did not predict

    # Read the column without writing the string conversion back into the caller's frame.
    sentences = df[mask_column].astype(str).str.strip().tolist()
    trial_ids = df["Trial"].tolist()

    # Group sentences by trial in first-seen order. The metrics compare
    # consecutive trials, so the output order must not depend on set iteration.
    sentences_by_trial = {}
    for sentence, trial_id in zip(sentences, trial_ids):
        sentences_by_trial.setdefault(trial_id, []).append(sentence)

    def prepare(sentence):
        # Lower-case everything except mask tokens already in the text, so a
        # mask token containing upper-case letters survives, then mask any
        # remaining occurrence of the target word.
        lowered = mask_token.join(part.lower() for part in sentence.split(mask_token))
        return word_pattern.sub(lambda _match: mask_token, lowered)

    predictions_list = []
    for trial_sentences in sentences_by_trial.values():
        all_seen_words = set()
        trial_predictions = {}
        for raw_sentence in trial_sentences:
            sentence = prepare(raw_sentence)
            if mask_token not in sentence:
                print(f"Skipping sentence due to missing mask token: {sentence[:50]}...")
                continue
            try:
                # NOTE(review): the pipeline is called without a top_k argument,
                # so it returns its own default number of predictions; candidates
                # beyond that never reach the prob_threshold logic below.
                with torch.no_grad():
                    predictions = fill_mask_pipeline(sentence)
            except RuntimeError as e:
                if "expanded size of the tensor" in str(e):
                    print(f"Skipping sentence due to tensor size mismatch: {sentence[:50]}...")
                    continue
                raise
            # Only a non-empty flat list of prediction dicts is usable.
            # NOTE(review): presumably a sentence with several mask tokens comes
            # back as a list of lists and is skipped here - confirm that is intended.
            if (
                not isinstance(predictions, list)
                or not predictions
                or not isinstance(predictions[0], dict)
                or 'token_str' not in predictions[0]
            ):
                continue
            # !!!!!!! Extract log probabilities
            filtered_predictions = {}
            for p in predictions:
                if 'token_str' not in p:
                    continue
                # Strip before the punctuation test: a token with surrounding
                # whitespace (" ,") would otherwise slip past the filter.
                token = p['token_str'].strip()
                if not token or all(char in punctuation_set for char in token):
                    continue
                log_prob = np.log(p['score'])
                # Two raw tokens can strip to the same word; keep the higher score.
                if token not in filtered_predictions or log_prob > filtered_predictions[token]:
                    filtered_predictions[token] = log_prob
            if not filtered_predictions:
                continue
            # Sort candidates by probability
            sorted_candidates = sorted(filtered_predictions.items(), key=lambda x: x[1], reverse=True)
            max_log_prob = sorted_candidates[0][1]
            # Keep at least top-k, and include words within the threshold
            top_candidates = dict(sorted_candidates[:top_k])
            for word, log_prob in sorted_candidates[top_k:]:
                if np.exp(log_prob - max_log_prob) >= prob_threshold:
                    top_candidates[word] = log_prob
            # Normalize within chosen candidates
            log_probs = np.array(list(top_candidates.values()))
            exp_probs = np.exp(log_probs - max_log_prob)
            normalized_probs = dict(zip(top_candidates.keys(), exp_probs / exp_probs.sum()))
            all_seen_words.update(normalized_probs)
            # Accumulate seen words and allow them to gain weight over time
            for word in all_seen_words:
                trial_predictions[word] = trial_predictions.get(word, 0) + normalized_probs.get(word, missing_prob)
        total_prob = sum(trial_predictions.values())
        if total_prob > 0:
            trial_predictions = {k: v / total_prob for k, v in trial_predictions.items()}
        predictions_list.append(trial_predictions)
    return predictions_list

In [None]:
# metrics === you can also look into additional metrics we might add 
def get_pos(word):
    """Return the POS tag spaCy gives the first token of `word`, or "UNKNOWN".

    "UNKNOWN" is returned when the parsed document is empty or the first
    token's tag is empty.
    """
    parsed = nlp(word)
    if not parsed:
        return "UNKNOWN"
    tag = parsed[0].pos_
    return tag if tag else "UNKNOWN"

def mean_reciprocal_rank(dict_list, target_word=None):
    """Mean reciprocal rank of a reference word across trials.

    Args:
        dict_list: List of {word: probability} dicts, one per trial. Empty
            dicts are ignored.
        target_word: Word whose rank is measured in every trial. When None
            (the default), each trial is scored against its own top word,
            which always yields rank 1 and therefore a result of 1.0 whenever
            any trial is non-empty; pass a word to get an informative score.

    Returns:
        Mean of 1/rank over the non-empty trials (rank is 1-based, by
        descending probability). A trial that lacks the reference word is
        scored as if the word were ranked last. Returns 0.0 when there are no
        non-empty trials.
    """
    reciprocal_ranks = []
    for d in dict_list:
        if not d:
            continue
        ranked_words = sorted(d, key=d.get, reverse=True)
        reference = ranked_words[0] if target_word is None else target_word
        if reference in d:
            rank = ranked_words.index(reference) + 1
        else:
            rank = len(ranked_words)
        reciprocal_ranks.append(1 / rank)
    return np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0

def prediction_diversity(dict_list, k=5):
    """Fraction of distinct words among all trials' top-k predictions.

    Args:
        dict_list: List of {word: probability} dicts, one per trial.
        k: Number of top words taken from each trial.

    Returns:
        Number of distinct top-k words divided by k * number of trials, or 0.0
        when there are no trials or k is not positive (the ratio is undefined
        there and used to raise ZeroDivisionError).
    """
    if not dict_list or k <= 0:
        return 0.0
    unique_predictions = set()
    for d in dict_list:
        unique_predictions.update(sorted(d, key=d.get, reverse=True)[:k])
    return len(unique_predictions) / (k * len(dict_list))

def word_rank_volatility(dict_list):
    """Average absolute change in a word's rank between its successive appearances.

    For every word, collect its 0-based rank (by descending probability) in
    each trial that contains it, average the absolute differences between
    consecutive ranks, then average those per-word values. Words seen in only
    one trial are ignored.

    Returns:
        The mean per-word rank change, or 0.0 when no word appears more than once.
    """
    rank_history = {}
    for trial in dict_list:
        ordering = sorted(trial, key=trial.get, reverse=True)
        for position, word in enumerate(ordering):
            rank_history.setdefault(word, []).append(position)
    mean_jumps = []
    for history in rank_history.values():
        if len(history) > 1:
            mean_jumps.append(np.mean(np.abs(np.diff(history))))
    if not mean_jumps:
        return 0.0
    return np.mean(mean_jumps)

def rank_stability(dict_list):
    """Mean total rank displacement between consecutive trials.

    For each pair of consecutive trials, sum over the words of the later trial
    the absolute difference between the word's rank there and its rank in the
    earlier trial. A word missing from the earlier trial is treated as ranked
    just past the end of the later trial. Ranks are 0-based, by descending
    probability.

    Returns:
        The mean of those per-pair sums, or 0.0 when there are fewer than two
        trials (previously NaN plus a numpy warning).
    """
    rankings = [
        {word: position for position, word in enumerate(sorted(d, key=d.get, reverse=True))}
        for d in dict_list
    ]
    shifts = [
        sum(abs(position - previous.get(word, len(current))) for word, position in current.items())
        for previous, current in zip(rankings, rankings[1:])
    ]
    return np.mean(shifts) if shifts else 0.0

def temporal_kl_divergence(dict_list):
    """Mean KL divergence of each trial from the trial before it.

    Returns 0.0 when there are fewer than two trials.
    """
    step_divergences = []
    for previous, current in zip(dict_list, dict_list[1:]):
        step_divergences.append(kl_divergence(current, previous))
    if not step_divergences:
        return 0.0
    return np.mean(step_divergences)

def first_guess_consistency(dict_list):
    """Share of trials whose top word is the most frequent top word.

    Each trial contributes its highest-probability word (None for an empty
    trial); the result is the count of the most common of these divided by
    the number of trials.

    Returns:
        A value in (0, 1], or 0.0 when `dict_list` is empty.
    """
    if not dict_list:
        return 0.0
    top_guesses = [max(d, key=d.get) if d else None for d in dict_list]
    _, count = Counter(top_guesses).most_common(1)[0]
    return count / len(top_guesses)

def distribution_compression(dict_list):
    """Mean entropy of the trial distributions, or 0.0 when there are none.

    Calls `entropy` once per trial dict and averages the results.
    """
    per_trial = []
    for trial in dict_list:
        per_trial.append(entropy(trial))
    if not per_trial:
        return 0.0
    return np.mean(per_trial)

def get_first_guess(dict_list):
    """Return the word that is most often the top prediction across trials.

    Each non-empty trial votes for its highest-probability word; the word with
    the most votes wins. Returns None when no trial is non-empty.
    """
    votes = Counter(max(trial, key=trial.get) for trial in dict_list if trial)
    if not votes:
        return None
    winner, _ = votes.most_common(1)[0]
    return winner

def cosine_similarity(p, q):
    """Cosine similarity between two {word: value} dicts.

    Both dicts are expanded over the union of their keys, with 0 for missing
    words. Returns 0.0 when either vector is all zeros (or empty).
    """
    vocabulary = set(p) | set(q)
    p_vec = np.array([p.get(word, 0) for word in vocabulary])
    q_vec = np.array([q.get(word, 0) for word in vocabulary])

    # A zero vector has no direction, so the cosine is undefined; report 0.0.
    for vec in (p_vec, q_vec):
        if np.all(vec == 0):
            return 0.0
    distance = cosine(p_vec, q_vec)
    return 1 - distance

def wasserstein_similarity(p, q):
    """Wasserstein distance between the value vectors of two {word: value} dicts.

    Both dicts are expanded over the union of their keys (0 for missing words)
    and the two value arrays are passed to scipy's `wasserstein_distance`.
    Despite the name, the result is a distance: larger means more different.

    NOTE(review): the arrays are passed as the first two positional arguments
    with no weights, so scipy treats the probabilities themselves as samples.
    The result therefore compares the two collections of probability values
    and does not depend on which word carries which value - confirm this is
    the intended metric.
    """
    vocabulary = list(set(p) | set(q))
    p_values = np.array([p.get(word, 0) for word in vocabulary])
    q_values = np.array([q.get(word, 0) for word in vocabulary])
    distance = wasserstein_distance(p_values, q_values)
    return distance

def kl_divergence(p, q):
    """KL divergence of `p` from `q`, both given as {word: probability} dicts.

    The dicts are expanded over the union of their keys; a word missing from
    one side gets a small value (1e-10) instead of zero so the divergence
    stays finite. The computation is delegated to `stats.entropy(p_vec, q_vec)`.
    """
    floor = 1e-10  # stand-in probability for words absent from one distribution
    vocabulary = set(p) | set(q)
    p_vec = np.array([p.get(word, floor) for word in vocabulary])
    q_vec = np.array([q.get(word, floor) for word in vocabulary])
    divergence = stats.entropy(p_vec, q_vec)
    return divergence

def js_divergence(p, q):
    """Jensen-Shannon divergence (natural log) between two {word: probability} dicts.

    Both dicts are first aligned on the union of their keys, so the result no
    longer depends on dict ordering and the two inputs may have different
    vocabularies. Missing words count as 0, a small constant (1e-10) is added
    to every entry, and both vectors are normalised before the mixture is
    formed.

    Returns:
        0.5 * KL(p || m) + 0.5 * KL(q || m) with m the average of p and q,
        or 0.0 when both dicts are empty.
    """
    vocabulary = list(set(p) | set(q))
    if not vocabulary:
        return 0.0
    p_vec = np.array([p.get(word, 0.0) for word in vocabulary], dtype=float) + 1e-10
    q_vec = np.array([q.get(word, 0.0) for word in vocabulary], dtype=float) + 1e-10
    p_vec = p_vec / p_vec.sum()
    q_vec = q_vec / q_vec.sum()
    m = 0.5 * (p_vec + q_vec)
    return 0.5 * (stats.entropy(p_vec, m) + stats.entropy(q_vec, m))

def entropy(p):
    """Shannon entropy (natural log) of a {word: probability} dict.

    A small constant (1e-10) is added to every value before the values are
    handed to `stats.entropy`.
    """
    values = [value for value in p.values()]
    smoothed = np.array(values) + 1e-10
    return stats.entropy(smoothed)

def jaccard_similarity(p, q, k=5):
    """Jaccard overlap between the top-k words of two {word: probability} dicts.

    Returns the size of the intersection of the two top-k word sets divided
    by the size of their union.
    """
    def top_words(distribution):
        ranked = sorted(distribution, key=distribution.get, reverse=True)
        return set(ranked[:k])

    first, second = top_words(p), top_words(q)
    shared = first.intersection(second)
    combined = first.union(second)
    return len(shared) / len(combined)

def most_common_pos(dict_list):
    """Return the POS tag that occurs most often among the predicted words.

    Every word counts once per trial it appears in. Each distinct word is
    tagged only once and its tag is weighted by the number of trials
    containing it, which gives the same counts as tagging every occurrence
    while avoiding repeated parser calls.

    Returns:
        The most frequent tag as returned by `get_pos`, or "UNKNOWN" when the
        trials contain no words at all.
    """
    word_counts = Counter(word for d in dict_list for word in d)
    if not word_counts:
        return "UNKNOWN"
    pos_counts = Counter()
    for word, count in word_counts.items():
        pos_counts[get_pos(word)] += count
    return pos_counts.most_common(1)[0][0]

def content_score(word_distribution, function_words_set = function_words_spacy):
    """One minus the probability mass that falls on function words.

    Args:
        word_distribution: {word: probability} dict.
        function_words_set: Collection of function words to test membership against.

    Returns:
        1 - (sum of probabilities of words found in `function_words_set`);
        higher means more weight on content words.
    """
    function_prob = 0
    for word, prob in word_distribution.items():
        if word in function_words_set:
            function_prob += prob
    return 1 - function_prob  # Higher means more content words


def compute_metrics(dict_list, function_words_set = function_words_spacy):
    """Compute the summary metrics for a sequence of per-trial word distributions.

    Pairwise metrics compare each trial with the one immediately before it
    and are averaged over all consecutive pairs.

    Args:
        dict_list: List of {word: probability} dicts, one per trial.
        function_words_set: Function words used for 'Content_Score'. This
            argument is now honoured; previously the module-level list was
            always used regardless of what was passed.

    Returns:
        dict of metric name to value, or {} when fewer than two trials are given.
    """
    if not dict_list or len(dict_list) < 2:
        return {}

    # (current, previous) for every consecutive pair of trials.
    consecutive = list(zip(dict_list[1:], dict_list[:-1]))

    metrics = {
        'Cosine_Similarity': np.mean([cosine_similarity(cur, prev) for cur, prev in consecutive]),
        'Wasserstein_Distance': np.mean([wasserstein_similarity(cur, prev) for cur, prev in consecutive]),
        'Rank_Stability': rank_stability(dict_list),
        'Temporal_Jaccard': np.mean([jaccard_similarity(cur, prev) for cur, prev in consecutive]),
        'Most_Common_POS': most_common_pos(dict_list),
        'First_Guess': get_first_guess(dict_list),
        #'Mean_Reciprocal_Rank': mean_reciprocal_rank(dict_list),
        'Prediction_Diversity': prediction_diversity(dict_list),
        'Temporal_KL_Divergence': temporal_kl_divergence(dict_list),
        'First_Guess_Consistency': first_guess_consistency(dict_list),
        'Distribution_Compression': distribution_compression(dict_list),
        'Content_Score': np.mean([content_score(d, function_words_set) for d in dict_list]),  # NEW METRIC
    }
    return metrics

In [None]:
def process_word_trials(words, dataframe, n):
    """Run the sampling / fill-mask / metrics pipeline for each word.

    For every word: find its first appearance per thread, mask it, draw `n`
    random trials, accumulate the fill-mask predictions per trial, align all
    trials on a shared vocabulary (missing words get 1e-10) and compute the
    metrics. Progress and a time estimate are printed per word.

    Args:
        words: Iterable of words to process (its length is used for progress).
        dataframe: Thread dictionary passed to `filter_and_extract_word_stats`.
        n: Number of trials to sample per word.

    Returns:
        DataFrame with one row per processed word: 'Word' plus one
        'First_<metric>' column per metric. Words with insufficient
        predictions are skipped.
    """
    rows = []
    total_words = len(words)
    start_time = time.time()
    for idx, word in enumerate(words, 1):
        word_start_time = time.time()
        print(f"Processing word {idx}/{total_words}: {word}")

        appearances_df = filter_and_extract_word_stats(dataframe, word)
        masked_df = mask_df(appearances_df, word)
        trial_df = get_random_samples(masked_df, n)
        trial_dists = accumulate_meaning_unrecognized(trial_df, 'first_appearance', word)

        # Nothing usable if there are no trials or every trial has fewer than two candidates.
        has_usable_trial = any(len(trial) >= 2 for trial in trial_dists)
        if not trial_dists or not has_usable_trial:
            print(f"Skipping word '{word}' due to insufficient data.")
            continue

        # Give every trial the same vocabulary so the metrics compare like with like.
        vocabulary = set().union(*trial_dists)
        aligned_dicts = []
        for trial in trial_dists:
            aligned_dicts.append({candidate: trial.get(candidate, 1e-10) for candidate in vocabulary})

        row = {'Word': word}
        for metric_name, metric_value in compute_metrics(aligned_dicts).items():
            row[f'First_{metric_name}'] = metric_value
        rows.append(row)

        elapsed_time = time.time() - word_start_time
        remaining_time = (time.time() - start_time) / idx * (total_words - idx)
        print(f"Completed {word} in {elapsed_time:.2f} seconds. Estimated time remaining: {remaining_time:.2f} seconds.")
    return pd.DataFrame(rows)

In [None]:
def process_and_save_sublists(the_sublists, dataframe, n_trials, folder_name="known_round6", start_idx=1,
                              file_prefix="6pmknown_round6_sublist_"):
    """Process each chunk of words and write one CSV of metrics per chunk.

    Words are processed one at a time so that a ValueError raised for one word
    only skips that word. Chunks in which no word succeeds produce no file.

    Args:
        the_sublists: Iterable of word lists (chunks).
        dataframe: Thread dictionary passed on to `process_word_trials`.
        n_trials: Number of trials per word.
        folder_name: Output directory; created if it does not exist.
        start_idx: 1-based index of the first chunk to process; earlier chunks
            are skipped (useful for resuming an interrupted run).
        file_prefix: File-name prefix for the CSVs; the chunk index and ".csv"
            are appended. Defaults to the previously hard-coded prefix.
    """
    # exist_ok avoids the check-then-create race and the separate existence test.
    os.makedirs(folder_name, exist_ok=True)
    for idx, sublist in enumerate(the_sublists, 1):
        if idx < start_idx:
            continue  # Skip processing until reaching the desired starting index (sometimes I just paused it to do other stuff)
        print(f"Processing sublist {idx}")
        processed_words = []  # Store processed words to rebuild the dataframe
        for word in sublist:
            try:
                word_df = process_word_trials([word], dataframe, n_trials)  # Process one word at a time
            except ValueError as e:
                print(f"Skipping word '{word}' due to error: {e}")  # Skip only the word, not the sublist
                continue
            if isinstance(word_df, pd.DataFrame) and not word_df.empty:
                processed_words.append(word_df)  # Append successful results
            else:
                print(f"Skipping word '{word}' due to empty dataframe.")
        if not processed_words:
            print(f"Skipping sublist {idx} as all words failed.")
            continue  # Skip saving if no words were successfully processed
        df = pd.concat(processed_words, ignore_index=True)  # Merge valid results
        file_path = os.path.join(folder_name, f"{file_prefix}{idx}.csv")
        df.to_csv(file_path, index=False)

In [None]:
# Run the whole pipeline over every chunk of recognized words, using the thread
# dictionary `karina`, 10 sampled trials per word, starting from the first chunk.
# Raise start_idx to resume a run that was interrupted part-way through.
process_and_save_sublists(rec_sublist, karina, 10, start_idx=1)
# actually running everything :^D 