#### LIBRARIES

In [1]:
!pip install contractions unidecode nltk scikit-learn scipy plotly

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pac

In [2]:
### ALL NECESSARY LIBRARIES ###
import pandas as pd
import numpy as np
import re
import pickle
import os

import contractions
from unidecode import unidecode

import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.stats import entropy


import plotly.express as px
import plotly.graph_objects as go


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#### FUNCTIONS

In [3]:
def clean_response(answer):

    answer = unidecode(answer)                      # remove non-ascii characters
    answer = answer.lower()                         # convert to lowercase
    answer = contractions.fix(answer)               # remove contractions
    # answer = re.sub(r'[^a-zA-Z\s]', '', answer)     # remove special characters
    # answer = re.sub(r'\s+', ' ', answer)            # remove extra whitespaces
    answer = re.sub(r'/', ', ', answer)
    # # remove colons and semicolons
    # answer = re.sub(r'[:;]', '', answer)
    # # remove punctuations including parentheses
    # answer = re.sub(r'[^\w\s]', '', answer)

    # entries = list(answer.split(','))
    # entries = [entry.strip() for entry in entries]      # remove leading and trailing whitespaces

    return answer

In [4]:
def clean_response_text(answer):

    answer = unidecode(answer)                      # remove non-ascii characters
    answer = answer.lower()                         # convert to lowercase
    answer = contractions.fix(answer)               # remove contractions
    # answer = re.sub(r'[^a-zA-Z\s]', '', answer)     # remove special characters
    # answer = re.sub(r'\s+', ' ', answer)            # remove extra whitespaces
    answer = re.sub(r'/', ' or ', answer)
    # # remove colons and semicolons
    answer = re.sub(r'[:;]', '', answer)
    # remove punctuations
    answer = re.sub(r'[^\w\s]', ' ', answer)
    # remove parentheses
    answer = re.sub(r'\(.*?\)', '', answer)

    return answer

# Map POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    # Map POS tag to first character used by WordNetLemmatizer
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if not found

# PREPROCESSING FUNCTION TO GET TOKENS FROM PHRASES (SAME AS THAT IN THE FEATURE CLUSTERING NOTEBOOK...)
# NEED TO DO POS-TAGGING FOR PROPER LEMMATIZATION
# THIS FUNCTION IS ALSO NEEDED TO COMPUTE GENSIM COHERENCE DOWNSTREAM
def lemmatize_text(text, add_stopwords=None):

    tokens = word_tokenize(text.lower())    # Convert to lowercase, tokenize
    pos_tags = nltk.pos_tag(tokens)         # Get POS tags

    remove_words = ['', 'etc', 'feel', 'felt', 'feeling', 'seem', 'sense', 'sens', 'uh', 'um']
    lemmatizer = WordNetLemmatizer()    # Lemmatize the tokens
    stemmer = PorterStemmer()     ### maybe try stemming instead of lemmatization

    # # STEMMING?
    # tokens = [stemmer.stem(contractions.fix(unidecode(token)), get_wordnet_pos(tag))
    #           for token, tag in pos_tags
    #           if not (token == 'like' and tag == 'IN')]         # Remove like as a preposition (e.g. "feel like")

    # LEMMATIZE?
    tokens = [lemmatizer.lemmatize(contractions.fix(unidecode(token)), get_wordnet_pos(tag))
              for token, tag in pos_tags
              if not (token == 'like' and tag == 'IN')]         # Remove like as a preposition (e.g. "feel like")

    tokens = [token for token in tokens if token not in remove_words]

    preprocessed_text = ' '.join(tokens)     # Reconstruct the text from the preprocessed tokens

    return preprocessed_text

def remove_stop_words(text, add_stopwords=None):
    stop_words = stopwords.words('english')
    stop_words.extend(['', 'etc', 'feel', 'felt', 'feeling', 'seem', 'sense', 'sens'])
    if add_stopwords is not None:
        stop_words.extend(add_stopwords)

    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [5]:
def clean_and_preprocess(df, column):
    df[f'{column}Cleaned'] = df[column].map(lambda row: clean_response_text(row))
    df[f'{column}Preprocessed'] = df[f'{column}Cleaned'].map(lambda row: [remove_stop_words(lemmatize_text(row))])
    df[f'{column}PreprocessedText'] = df[f'{column}Preprocessed'].map(lambda row: ' '.join(row))
    return df

def tokenize_text(df, column):
    tokenizer = RegexpTokenizer(r'\b[\w-]+\b')
    all_tokens = [tokenizer.tokenize(phrase.lower()) for phrase in df[column].values]
    all_tokens = list(set([token for sublist in all_tokens for token in sublist]))
    return all_tokens

def create_bow_matrix(df, column, tokens):
    count_vect = CountVectorizer(vocabulary=tokens, ngram_range=(1, 3), token_pattern=r"(?u)\b\w[\w-]*\w\b")
    bow_matrix = count_vect.fit_transform(df[column])
    return bow_matrix, count_vect

def calculate_word_frequencies(bow_matrix, count_vect):
    word_frequencies = np.transpose(bow_matrix.sum(axis=0))
    frequency_df = pd.DataFrame(columns=['feature', 'frequency'])
    frequency_df['feature'] = count_vect.get_feature_names_out()
    frequency_df['frequency'] = word_frequencies
    return frequency_df.sort_values(by='frequency', ascending=False)

def process_dataframe(df, column):
    df = clean_and_preprocess(df, column)
    tokens = tokenize_text(df, f'{column}PreprocessedText')
    bow_matrix, count_vect = create_bow_matrix(df, f'{column}PreprocessedText', tokens)
    frequency_df = calculate_word_frequencies(bow_matrix, count_vect)
    return frequency_df, tokens, df

def merge_and_plot_frequencies(original_df, synthetic_df, original_column, synthetic_column, title='Original vs. Synthetic', labels=['Original', 'Synthetic']):
    original_df, original_tokens, original_df = process_dataframe(original_df, original_column)
    synthetic_df, synthetic_tokens, synthetic_df = process_dataframe(synthetic_df, synthetic_column)

    all_tokens = list(set(original_tokens).union(set(synthetic_tokens)))
    all_count_vect = CountVectorizer(vocabulary=all_tokens, ngram_range=(1, 3), token_pattern=r"(?u)\b\w[\w-]*\w\b")

    original_bow_matrix_all = all_count_vect.fit_transform(original_df[f'{original_column}PreprocessedText'])
    synthetic_bow_matrix_all = all_count_vect.fit_transform(synthetic_df[f'{synthetic_column}PreprocessedText'])

    original_freq_df_all = calculate_word_frequencies(original_bow_matrix_all, all_count_vect)
    synthetic_freq_df_all = calculate_word_frequencies(synthetic_bow_matrix_all, all_count_vect)

    synthetic_freq_df_all = synthetic_freq_df_all.set_index('feature').reindex(original_freq_df_all['feature']).reset_index()

    combined_df = pd.merge(original_freq_df_all, synthetic_freq_df_all, on='feature', how='outer', suffixes=('_original', '_synthetic'))
    combined_df = combined_df.set_index('feature').reindex(original_freq_df_all['feature']).reset_index()

    # Create the figure
    fig = go.Figure()

    # Add the original data as a bar trace
    fig.add_trace(go.Bar(
        x=combined_df[:100]['feature'],
        y=combined_df[:100]['frequency_original'],
        opacity=0.5,
        name=labels[0]
    ))

    # Add the synthetic data as a bar trace
    fig.add_trace(go.Bar(
        x=combined_df[:100]['feature'],
        y=combined_df[:100]['frequency_synthetic'],
        opacity=0.5,
        name=labels[1]
    ))

    # Update the layout to overlay the bars, set axis titles, position the legend, set font size, and remove x-axis tick labels
    fig.update_layout(
        barmode='overlay',
        yaxis_title='Frequency',
        xaxis=dict(
            title='Word',
            showticklabels=False,
            title_font=dict(size=18),
            title_standoff=20,
            side='bottom'
        ),
        legend=dict(
            x=0.9,
            y=0.9,
            xanchor='right',
            yanchor='top',
            font=dict(size=18)
        ),
        font=dict(size=18)
    )

    # Remove the title by setting it to an empty string
    fig.update_layout(title_text='')

    return fig, original_freq_df_all, synthetic_freq_df_all

def calculate_kl_divergence(original_freq_df_all, synthetic_freq_df_all):
    min_threshold = 1e-10

    hist1 = np.array(original_freq_df_all['frequency'].values)
    hist2 = np.array(synthetic_freq_df_all['frequency'].values)

    prob_dist1 = hist1 / np.sum(hist1)
    prob_dist2 = hist2 / np.sum(hist2)

    prob_dist1 = np.clip(prob_dist1, min_threshold, None)
    prob_dist2 = np.clip(prob_dist2, min_threshold, None)

    prob_dist1 /= np.sum(prob_dist1)
    prob_dist2 /= np.sum(prob_dist2)

    kl_div = entropy(prob_dist1, prob_dist2)
    # print(f'KL Divergence: {kl_div}')

    return kl_div

#### IMPORT DATA

In [6]:
synthetic_df = pd.read_csv('Data/syntheticSynopsisAndSentiment.csv')
# concatenate Sentiment and Synopsis into one text column
synthetic_df['SentimentSynopsis'] = synthetic_df['Sentiment'] + ' ' + synthetic_df['Synopsis']
synthetic_df['SentimentSynopsis'].head()

0    Positive emotions are evident regarding apprec...
1    Positive sentiments arise regarding love for M...
2    Positive emotions emerge infrequently related ...
3    Patient shows mixed emotions: generally conten...
4    Patient demonstrates mild depression: generall...
Name: SentimentSynopsis, dtype: object

In [7]:
original_df = pd.read_csv('Data/synopsisAndSentiment.csv')
original_df['SentimentSynopsis'] = original_df['Sentiment'] + ' ' + original_df['Synopsis']
original_df['SentimentSynopsis'].head()

0    Positive emotions are expressed regarding love...
1    Patient exhibits mixed emotions: generally pos...
2    The patient's emotions range widely: nostalgia...
3    The patient's responses exhibit mixed emotions...
4    The patient's responses reveal mixed sentiment...
Name: SentimentSynopsis, dtype: object

In [8]:
llama_df = pd.read_csv('Data/DAIC_transcript_test_toks_all.csv')
# llama_df.head()
llama_df['SentimentSynopsis'] = llama_df['Sentiment'] + ' ' + llama_df['Synopsis']

#### PREPROCESSING

#### KL divergence of word freq distributions

In [9]:
fig, llama_freq_df_all, original_freq_df_all = merge_and_plot_frequencies(original_df,
                                                                            llama_df,
                                                                            'SentimentSynopsis',
                                                                            'SentimentSynopsis',
                                                                            title='Original vs. Llama Synthetic',
                                                                            labels=['Original', 'Llama Synthetic'])
fig.show()

kldiv = calculate_kl_divergence(original_freq_df_all, llama_freq_df_all)
print(f'KL Divergence between original and llama synthetic synopsis + sentiment: {kldiv}')

KL Divergence between original and llama synthetic synopsis + sentiment: 3.0652901952750127


In [10]:
fig, original_freq_df_all, synthetic_freq_df_all = merge_and_plot_frequencies(original_df,
                                                                            synthetic_df,
                                                                            'SentimentSynopsis',
                                                                            'SentimentSynopsis',
                                                                            title='Original vs. Synthetic',
                                                                            labels=['Original', 'GPT-4o Synthetic'])
fig.show()

kldiv = calculate_kl_divergence(original_freq_df_all, synthetic_freq_df_all)
print(f'KL Divergence between original and synthetic synopsis + sentiment: {kldiv}')

KL Divergence between original and synthetic synopsis + sentiment: 1.260516279730855


In [11]:
# within groups split half 50 times within synthetic data

iterations = 50

### SPLIT SYNTHETIC DATA INTO TWO HALVES AND CALCULATE KL DIVERGENCE
synthetic_kl_divergences = []
for i in range(iterations):
    synthetic_df_half1 = synthetic_df.sample(frac=0.5)
    synthetic_df_half2 = synthetic_df.drop(synthetic_df_half1.index)

    syn_fig, synthetic_half1, synthetic_half2 = merge_and_plot_frequencies(synthetic_df_half1,
                                                                    synthetic_df_half2,
                                                                    'SentimentSynopsis',
                                                                    'SentimentSynopsis',
                                                                    title='Synthetic vs. Synthetic (split half)',
                                                                    labels=['Synthetic', 'Synthetic (2nd half)'])
    kl_div = calculate_kl_divergence(synthetic_half1, synthetic_half2)
    synthetic_kl_divergences.append(kl_div)

syn_fig.show()
synthetic_kl_divergences = np.array(synthetic_kl_divergences)
print(f'Mean KL Divergence for split-half synthetic synopsis + sentiment: {np.mean(synthetic_kl_divergences)}')

### SPLIT ORIGINAL DATA INTO TWO HALVES AND CALCULATE KL DIVERGENCE
original_kl_divergences = []
for i in range(iterations):
    original_df_half1 = original_df.sample(frac=0.5)
    original_df_half2 = original_df.drop(original_df_half1.index)

    og_fig, original_half1, original_half2 = merge_and_plot_frequencies(original_df_half1,
                                                                original_df_half2,
                                                                'SentimentSynopsis',
                                                                'SentimentSynopsis',
                                                                title='Original vs. Original (split half)',
                                                                labels=['Original', 'Original (2nd half)'])
    kl_div = calculate_kl_divergence(original_half1, original_half2)
    original_kl_divergences.append(kl_div)

og_fig.show()
original_kl_divergences = np.array(original_kl_divergences)
print(f'Mean KL Divergence for split-half original synopsis + sentiment: {np.mean(original_kl_divergences)}')


Mean KL Divergence for split-half synthetic synopsis + sentiment: 0.7671219530066098


Mean KL Divergence for split-half original synopsis + sentiment: 1.6292090995746547
