In [None]:
import pandas as pd 
import numpy as np
import json
from collections import Counter
import spacy
from itertools import chain 
from collections import OrderedDict
import re
import string
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from collections import defaultdict
from scipy.stats import entropy
from transformers import AutoTokenizer
from transformers import pipeline
# Fill-mask pipeline (bert-base-uncased) used to predict candidates for "[MASK]" tokens.
unmasker = pipeline("fill-mask", model="bert-base-uncased")
# Fetch the NLTK stopword corpus; must run before stopwords.words() is called below.
nltk.download('stopwords')
# spaCy English model; used later in this notebook for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")
# English stopword list, passed to analyze_top1_and_similarity to filter candidates.
stopwordlist = stopwords.words('english')

In [None]:
#processing functions 
def transform_dict(data):
    """Move a ``>>12345`` reply marker out of a post's text into its own key.

    If ``data`` is a dict whose ``"text"`` value is a string containing at
    least one ``>>digits`` marker, the digits of the FIRST marker are stored
    under ``"replyto"`` and every such marker is removed from ``"text"``
    (the result is stripped of surrounding whitespace).

    Args:
        data: A post record. Anything that is not a dict, or a dict whose
            ``"text"`` is missing or not a string, is returned untouched
            instead of raising, so mixed input lists can be mapped safely.

    Returns:
        The same object that was passed in (dicts are modified in place).
    """
    if not isinstance(data, dict):
        return data
    text = data.get("text")
    if not isinstance(text, str):
        return data
    match = re.search(r'>>(\d+)', text)
    if match:
        data["replyto"] = match.group(1)
        # Remove all reply markers, not only the one recorded in "replyto".
        data["text"] = re.sub(r'>>\d+', '', text).strip()
    return data

def convert_key_to_string(dict_list, key):
    """Stringify integer values stored under ``key``, in place.

    Every dict in ``dict_list`` that has ``key`` with an ``int`` value gets
    that value replaced by its ``str()`` form; all other entries are left
    alone. The same list object is returned.
    """
    for record in dict_list:
        if key not in record:
            continue
        current = record[key]
        if isinstance(current, int):
            record[key] = str(current)
    return dict_list

# functions to get the threads with the word we are interested in 
def get_no_if_text_in_string(lst, string):
    """Collect the ``'no'`` of every post whose text contains ``string`` as a whole word.

    The match is case-sensitive and uses ``\\b`` word boundaries around the
    escaped search string. Posts without a ``'text'`` key are ignored.
    """
    word_re = re.compile(rf'\b{re.escape(string)}\b')
    matching_ids = []
    for post in lst:
        if 'text' not in post:
            continue
        if word_re.search(post['text']):
            matching_ids.append(post['no'])
    return matching_ids

In [None]:
# Read in the datasets.
## The full archive with all post data.
with open('sets/cleanarchive.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# The archive is nested one level deep; flatten it into a single list of entries.
v = list(chain.from_iterable(data))
# Drop non-dict entries BEFORE transforming them, so a stray non-dict entry
# can never reach transform_dict.
a = [transform_dict(x) for x in v if isinstance(x, dict)]
# convert_key_to_string edits the dicts in place and returns the same list,
# so `b` and `a` refer to the same post records (with 'no' as a string).
b = convert_key_to_string(a, 'no')

## Lists of post IDs in thread order; each innermost list represents a thread.
with open('nested_list.txt', 'r', encoding='utf-8') as f:
    me_loaded = json.load(f)
# Flatten the outer level, leaving one list of IDs per thread.
me_flat = [item for sublist in me_loaded for item in sublist]
# Stringify every ID so they compare equal to the string 'no' values in `b`.
me_moop = [[str(x) for x in l] for l in me_flat]

In [None]:
def get_text_from_id(a_id_number):
    """Return a list with the ``'text'`` of every post in ``b`` whose ``'no'`` equals ``a_id_number``.

    The result is a list (empty when nothing matches), not a single string.
    """
    texts = []
    for post in b:
        if post['no'] == a_id_number:
            texts.append(post['text'])
    return texts

def get_full_and_mentions(a_word):
    """Build a per-thread table of conversations in which ``a_word`` recurs.

    A thread from ``me_moop`` is kept when it contains two posts that are
    adjacent in the list of matching post IDs returned by
    ``get_no_if_text_in_string(a, a_word)``.

    Args:
        a_word: The word to look for (whole-word, case-sensitive match).

    Returns:
        A DataFrame with one row per retained thread and the columns:
        ``'Full Threads'`` (list of post IDs), ``'Has Words'`` (IDs in the
        thread whose text matched), ``'Word_Pos'`` (index of each matching
        ID within the thread), ``'Convo_Length'``, ``'ID'`` (row index) and
        ``'sentences'`` (``get_text_from_id`` result for each matching ID).
        The frame is empty, with all columns present, when nothing matches.
    """
    mention_ids = [str(x) for x in get_no_if_text_in_string(a, a_word)]

    # Map each matching post ID to its position(s) in the match list. A thread
    # contains two adjacent matches exactly when the positions of its posts
    # include some p together with p + 1; this replaces repeated list scans.
    positions_by_id = {}
    for position, post_id in enumerate(mention_ids):
        positions_by_id.setdefault(post_id, []).append(position)

    def has_adjacent_mentions(thread):
        present = {p for post_id in thread for p in positions_by_id.get(post_id, ())}
        return any(p + 1 in present for p in present)

    def get_word_ids(the_thread):
        return [item for item in the_thread if item in positions_by_id]

    def filter_lists_by_last_id(threads):
        # Threads are visited longest first and later entries overwrite
        # earlier ones, so for each final post ID the SHORTEST thread wins.
        # NOTE(review): confirm that keeping the shortest is intended.
        by_last_id = {}
        for thread in sorted(threads, key=len, reverse=True):
            if thread:
                by_last_id[thread[-1]] = thread
        return list(by_last_id.values())

    conseclist = [
        sublist for sublist in me_moop
        if isinstance(sublist, list) and has_adjacent_mentions(sublist)
    ]
    filtered_conseclist = filter_lists_by_last_id(conseclist)
    threads_and_id = [get_word_ids(thread) for thread in filtered_conseclist]

    df = pd.DataFrame(
        list(zip(filtered_conseclist, threads_and_id)),
        columns=['Full Threads', 'Has Words'],
    )
    # Built without DataFrame.apply(axis=1): on an empty frame that call
    # returns a DataFrame, which cannot be assigned to a single column.
    df['Word_Pos'] = pd.Series(
        [
            [thread.index(m) for m in hits]
            for thread, hits in zip(filtered_conseclist, threads_and_id)
        ],
        index=df.index,
        dtype=object,
    )
    df['Convo_Length'] = df['Full Threads'].apply(len)
    df['ID'] = df.index
    df['sentences'] = df['Has Words'].apply(lambda x: [get_text_from_id(str(m)) for m in x])
    return df

def preprocess_sentences(sentences, target_word):
    """Replace occurrences of ``target_word`` with ``[MASK]`` in each sentence.

    Matching ignores case and ASCII punctuation. A sentence is included in
    the result only if its normalized text contains the normalized target as
    a whole word; within it, every whitespace-separated token that normalizes
    to the target is replaced by ``[MASK]`` (attached punctuation is dropped
    with it) and tokens are re-joined with single spaces.

    Returns:
        List of masked sentences, in input order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    def normalize(text):
        return text.translate(strip_punct).lower()

    target_re = re.compile(rf'\b{re.escape(normalize(target_word))}\b')
    masked_sentences = []
    for sentence in sentences:
        tokens = sentence.split()
        if not target_re.search(normalize(" ".join(tokens))):
            continue
        rebuilt = []
        for token in tokens:
            if target_re.fullmatch(normalize(token)):
                rebuilt.append("[MASK]")
            else:
                rebuilt.append(token)
        masked_sentences.append(" ".join(rebuilt))
    return masked_sentences

def analyze_top1_and_similarity(masked_sentences, stopwords, max_length=512, top_k=10):
    """Score how consistently the fill-mask model predicts the masked word.

    For each sentence containing ``[MASK]`` the module-level ``unmasker``
    pipeline is queried; predictions that are punctuation or stopwords are
    dropped and the best remaining candidate is recorded.

    Args:
        masked_sentences: Iterable of sentences; those without ``[MASK]`` are skipped.
        stopwords: Collection of lowercase words to exclude from candidates.
        max_length: Sentences whose encoded length exceeds this are skipped.
        top_k: Number of predictions requested from the pipeline per sentence.

    Returns:
        dict with keys:
            ``"Top-1 Candidates"``: list of ``"word: score"`` strings (or
                ``"No valid prediction"``), one per analysed sentence.
            ``"Average Similarity"``: mean over all sentence pairs of
                ``1 - mean(|score_i - score_j|)`` on their shared candidates
                (0 for a pair with none); 0 if there are no pairs.
            ``"Average POS Similarity"``: mean of the best prediction score
                sharing the top candidate's part of speech; 0 if none.
    """
    tokenizer = unmasker.tokenizer
    # Set membership instead of scanning the stopword list for every prediction.
    stopword_set = set(stopwords)
    top1_candidates = []
    sentence_candidates = []
    pos_similarity_scores = []
    similarity_scores = []

    for sentence in masked_sentences:
        if "[MASK]" not in sentence:
            continue
        tokenized = tokenizer.encode(sentence, add_special_tokens=True)
        if len(tokenized) > max_length:
            continue
        # Forward top_k so the parameter actually controls the candidate count.
        predictions = unmasker(sentence, top_k=top_k)
        # Only a flat list of prediction dicts is handled; any other result
        # shape (presumably what multi-mask inputs produce) is skipped.
        if not (isinstance(predictions, list) and all(isinstance(pred, dict) for pred in predictions)):
            continue
        valid_candidates = [
            pred for pred in predictions
            if pred['token_str'].strip() not in string.punctuation
            and pred['token_str'].lower() not in stopword_set
        ]
        filtered_candidates = {pred['token_str']: pred['score'] for pred in valid_candidates}
        sentence_candidates.append(filtered_candidates)

        if not filtered_candidates:
            top1_candidates.append("No valid prediction")
            pos_similarity_scores.append(0)
            continue

        top1_word = max(filtered_candidates, key=filtered_candidates.get)
        top1_candidates.append(f"{top1_word}: {filtered_candidates[top1_word]}")

        # Tag the sentence with the top candidate substituted for the mask.
        # NOTE(review): the first token equal to top1_word is used, which is
        # not necessarily the masked position if the word also occurs earlier.
        doc = nlp(sentence.replace("[MASK]", top1_word))
        masked_token = next((token for token in doc if token.text == top1_word), None)
        if masked_token is None:
            # No token matches the candidate text exactly; as before, no POS
            # score is recorded for this sentence.
            continue
        masked_token_pos = masked_token.pos_
        pos_scores = []
        for pred in predictions:
            pred_doc = nlp(pred['token_str'])
            # Guard: a blank token string yields an empty doc with no [0].
            if len(pred_doc) > 0 and pred_doc[0].pos_ == masked_token_pos:
                pos_scores.append(pred['score'])
        pos_similarity_scores.append(max(pos_scores) if pos_scores else 0)

    # Pairwise comparison of candidate score distributions across sentences.
    for i in range(len(sentence_candidates)):
        for j in range(i + 1, len(sentence_candidates)):
            dist_i = sentence_candidates[i]
            dist_j = sentence_candidates[j]
            common_words = set(dist_i.keys()).intersection(set(dist_j.keys()))
            if not common_words:
                similarity_scores.append(0)
                continue
            score_diff = np.mean([abs(dist_i[word] - dist_j[word]) for word in common_words])
            similarity_scores.append(1 - score_diff)

    avg_similarity = np.mean(similarity_scores) if similarity_scores else 0
    avg_pos_similarity = np.mean(pos_similarity_scores) if pos_similarity_scores else 0

    return {
        "Top-1 Candidates": top1_candidates,
        "Average Similarity": avg_similarity,
        "Average POS Similarity": avg_pos_similarity
    }

def word_to_df(the_word):
    """Run the full pipeline for one word and return the resulting table.

    Starts from ``get_full_and_mentions(the_word)`` and adds the columns
    ``'maskedsent'`` (masked sentences per matching post), ``'mask'`` (those
    flattened into one list), ``'Similarity'`` (result dict of
    ``analyze_top1_and_similarity``), ``'Sim_Score'`` and ``'POS_Sim_Score'``.
    """
    def mask_posts(post_texts):
        return [preprocess_sentences(texts, the_word) for texts in post_texts]

    def flatten(nested):
        return list(chain.from_iterable(nested))

    def score(masked):
        return analyze_top1_and_similarity(masked, stopwordlist)

    def pick(key):
        return lambda result: result[key]

    dataf = get_full_and_mentions(the_word)
    dataf['maskedsent'] = dataf['sentences'].apply(mask_posts)
    dataf['mask'] = dataf['maskedsent'].apply(flatten)
    dataf['Similarity'] = dataf['mask'].apply(score)
    dataf['Sim_Score'] = dataf['Similarity'].apply(pick('Average Similarity'))
    dataf['POS_Sim_Score'] = dataf['Similarity'].apply(pick('Average POS Similarity'))
    return dataf

In [None]:
# Build the per-thread analysis table for the target word; this runs the
# fill-mask model on every masked sentence, so it can take a while.
fulldf = word_to_df("gay")

In [None]:
# Display the per-thread table for the target word.
fulldf

In [None]:
# Compare conversation-length distributions for threads in which the word appeared.
def plot_side_by_side(data, min_length=5):
    """Plot thread-length histograms: all threads vs. threads longer than ``min_length``.

    Args:
        data: DataFrame with ``'Full Threads'`` (lists of post IDs) and
            ``'Convo_Length'`` columns.
        min_length: Threads with ``Convo_Length`` strictly greater than this
            go into the right-hand panel. Defaults to 5.
    """
    all_lengths = [len(x) for x in data['Full Threads']]
    long_threads = data[data['Convo_Length'] > min_length]
    long_lengths = [len(x) for x in long_threads['Full Threads']]

    fig, (ax_all, ax_long) = plt.subplots(1, 2, figsize=(14, 6))
    ax_all.hist(all_lengths, bins=30, edgecolor='black', alpha=0.7)
    ax_all.set_title('All Conversations')
    ax_all.set_xlabel('Length')
    ax_all.set_ylabel('Frequency')

    ax_long.hist(long_lengths, bins=30, edgecolor='black', alpha=0.7)
    ax_long.set_title(f'Conversations (Convo_Length > {min_length})')
    ax_long.set_xlabel('Length')
    ax_long.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

plot_side_by_side(fulldf)

In [None]:
# Show where each occurrence of the word falls within its conversation thread.
def visualize_word_positions(dataframe):
    """Draw one horizontal track per thread with markers at the word's positions.

    Threads are ordered by ``'Convo_Length'`` (shortest at the bottom). Each
    track is an orange line from 0 to the thread length; mentions from
    ``'Word_Pos'`` are light-blue dots, joined by a line when there are
    several.
    """
    ordered = dataframe.sort_values(by='Convo_Length', ascending=True).reset_index(drop=True)
    fig, ax = plt.subplots(figsize=(12, 6))
    tracks = zip(ordered['Word_Pos'], ordered['Convo_Length'])
    for row_no, (positions, length) in enumerate(tracks):
        y_values = [row_no] * len(positions)
        if len(positions) > 1:
            ax.plot(positions, y_values, color='lightblue', alpha=0.6, linewidth=1)
        ax.plot([0, length], [row_no, row_no], color='orange', alpha=0.3, linewidth=0.5)
        ax.scatter(positions, y_values, color='lightblue', s=8)
    ax.set_xlim(0, ordered['Convo_Length'].max() + 1)
    ax.set_ylim(-1, len(ordered))
    ax.set_xlabel('Conversation Index')
    ax.set_ylabel('Conversation ID (Ordered by Convo Length)')
    ax.set_title('Word Positions Across Conversations')
    plt.tight_layout()
    plt.show()

visualize_word_positions(fulldf)

In [None]:
# only look at conversations where ... the word... appeared consecutively (in direct conversation)
def is_consecutive(numbers):
    """Return True when each element is exactly one more than the one before it.

    Sequences with fewer than two elements are trivially consecutive.
    """
    for earlier, later in zip(numbers, numbers[1:]):
        if later - earlier != 1:
            return False
    return True
# Keep only threads whose mentions sit at adjacent positions. Take an explicit
# copy so that columns added to `da` later do not write into a slice of
# `fulldf` (which triggers pandas' SettingWithCopyWarning).
da = fulldf[fulldf['Word_Pos'].apply(is_consecutive)].copy()
visualize_word_positions(da)

In [None]:
# Display the subset of threads in which the word's mentions are at adjacent positions.
da

In [None]:
# Distribution of similarity scores for threads with consecutive appearances.
def plot_overall_sim_score(data):
    """Plot a 10-bin histogram of the ``'Sim_Score'`` column of ``data``."""
    scores = list(data['Sim_Score'])
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(scores, bins=10, edgecolor='black', alpha=0.7)
    ax.set_title("distribution of word confidence")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

plot_overall_sim_score(da)

In [None]:
# Relationship between similarity score and how much conversation followed the interaction.
# Convo_After = thread length minus the (0-based) position of the last mention.
# NOTE(review): this count includes the last mentioning post itself — confirm intended.
# Built with assign() and a comprehension rather than da[...] = da.apply(axis=1):
# that avoids writing into a slice of fulldf and also works when `da` is empty.
da = da.assign(
    Convo_After=[
        length - positions[-1]
        for length, positions in zip(da['Convo_Length'], da['Word_Pos'])
    ]
)
plt.figure(figsize=(8, 6))
plt.scatter(da['Sim_Score'], da['Convo_After'], alpha=0.7)
plt.title('Scatter Plot of Sim_Score vs. Convo_After')
plt.xlabel('Sim_Score')
plt.ylabel('Convo_After')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Distribution of POS similarity scores for threads with consecutive appearances.
def plot_overall_possim_score(data):
    """Plot a 10-bin histogram of the ``'POS_Sim_Score'`` column of ``data``."""
    scores = list(data['POS_Sim_Score'])
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(scores, bins=10, edgecolor='black', alpha=0.7)
    ax.set_title("distribution of word confidence by POS")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

plot_overall_possim_score(da)

In [None]:
# Relationship between POS similarity score and how much conversation followed the interaction.
# Convo_After is recomputed here so this cell does not depend on the Sim_Score scatter cell.
# Built with assign() and a comprehension rather than da[...] = da.apply(axis=1):
# that avoids writing into a slice of fulldf and also works when `da` is empty.
da = da.assign(
    Convo_After=[
        length - positions[-1]
        for length, positions in zip(da['Convo_Length'], da['Word_Pos'])
    ]
)
plt.figure(figsize=(8, 6))
plt.scatter(da['POS_Sim_Score'], da['Convo_After'], alpha=0.7)
plt.title('Scatter Plot of POS_Sim_Score vs. Convo_After')
plt.xlabel('POS_Sim_Score')
plt.ylabel('Convo_After')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()