In [9]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import tiktoken
import re
import time
from tqdm import tqdm
import ast
import json
import io
import numpy as np
from numpy.linalg import norm

In [None]:
### Prompt

In [None]:
def split_into_n_lists(full_list, n):
    """Split a sequence into ``n`` consecutive slices of near-equal length.

    The first ``len(full_list) % n`` slices receive one extra element, so
    slice lengths differ by at most one. Slices are taken in order and
    together cover the whole input.

    Args:
        full_list: Sliceable sequence to partition.
        n: Number of slices to produce.

    Returns:
        A list containing ``n`` slices of ``full_list``.
    """
    # Base slice length, and how many leading slices get one extra item
    quotient, extra = divmod(len(full_list), n)

    pieces = []
    cursor = 0
    for idx in range(n):
        size = quotient + 1 if idx < extra else quotient
        pieces.append(full_list[cursor:cursor + size])
        cursor += size

    return pieces

In [None]:
def convert_dict_to_dataframe(data_dict):
    """Convert a ``{(word1, word2): [score, ...]}`` mapping into a wide DataFrame.

    Each key becomes one row; the scores are spread over columns named
    ``similarity_score_1`` .. ``similarity_score_k``, where ``k`` is the
    length of the longest score list. Shorter lists are padded with ``None``.

    Args:
        data_dict: Mapping from a ``(word1, word2)`` tuple to a sequence of scores.

    Returns:
        A ``pandas.DataFrame`` with columns ``word1``, ``word2`` and one
        column per score. An empty mapping yields an empty frame that has
        only the ``word1`` and ``word2`` columns.
    """
    # Size the score columns from the longest list rather than the first
    # entry, so an empty dict or a later, longer list no longer raises.
    n_scores = max((len(scores) for scores in data_dict.values()), default=0)
    columns = ['word1', 'word2'] + [f'similarity_score_{i+1}' for i in range(n_scores)]

    # Populate data list for DataFrame creation
    data_list = []
    for (word1, word2), scores in data_dict.items():
        scores = list(scores)  # accept tuples and other sequences as well
        padding = [None] * (n_scores - len(scores))
        data_list.append([word1, word2] + scores + padding)

    # Create DataFrame
    return pd.DataFrame(data_list, columns=columns)

In [11]:
def format_prompt(pairs, prompt):
    """Build the text sent to the model: the prompt followed by the word pairs.

    Args:
        pairs: Iterable of indexable items; elements 0 and 1 of each item
            are rendered as ``('first', 'second')``.
        prompt: Instruction text placed before the ``---`` separator.

    Returns:
        A string of the form ``<prompt> --- [<repr of the joined pairs>]``.
    """
    rendered = []
    for pair in pairs:
        rendered.append(f"('{pair[0]}', '{pair[1]}')")
    joined_pairs = ', '.join(rendered)

    # The joined string is wrapped in a one-element list, so the output shows
    # that list's repr: square brackets around a single quoted string.
    return "{} --- {}".format(prompt, [joined_pairs])

In [12]:
def print_prompts(chunks, prompt):
    """Print the formatted prompt for every chunk, one ``print`` call per chunk.

    Args:
        chunks: Iterable of word-pair collections; each is passed to ``format_prompt``.
        prompt: Instruction text passed to ``format_prompt`` for each chunk.
    """
    for pair_group in chunks:
        rendered = format_prompt(pair_group, prompt)
        print(rendered)

In [None]:
def print_prompts_single(chunks, sample_size, prompt):
    """Print the single-pair prompt for each chunk, repeated ``sample_size`` times.

    Only the first word pair of each chunk is used.

    Args:
        chunks: Iterable of chunks; element 0 of each chunk is the word pair.
        sample_size: Number of times each formatted prompt is printed.
        prompt: Template string with ``{word1}`` and ``{word2}`` placeholders.
    """
    for chunk in chunks:
        for _sample in range(sample_size):
            # Only the leading pair of the chunk is rendered
            first_word, second_word = chunk[0][0], chunk[0][1]
            print(prompt.format(word1=first_word, word2=second_word))

In [None]:
def count_tokens_with_tiktoken(chunks, prompt):
    """Count the tokens of the formatted prompt for every chunk.

    Uses the module-level ``encoding`` object, which is not defined in this
    cell (presumably a tiktoken encoding created elsewhere -- confirm).

    Args:
        chunks: Iterable of word-pair collections.
        prompt: Instruction text passed to ``format_prompt``.

    Returns:
        A list with one token count per chunk, in input order.
    """
    return [
        len(encoding.encode(format_prompt(chunk, prompt)))
        for chunk in chunks
    ]

In [None]:
def count_tokens_with_tiktoken_single(chunks, prompt):
    """Count the tokens of the single-pair prompt built from each chunk.

    Only the first word pair of each chunk is used. Tokenization relies on
    the module-level ``encoding`` object, which is not defined in this cell
    (presumably a tiktoken encoding created elsewhere -- confirm).

    Args:
        chunks: Iterable of chunks; element 0 of each chunk is the word pair.
        prompt: Template string with ``{word1}`` and ``{word2}`` placeholders.

    Returns:
        A list with one token count per chunk, in input order.
    """
    counts = []
    for chunk in chunks:
        first_word, second_word = chunk[0][0], chunk[0][1]
        text = prompt.format(word1=first_word, word2=second_word)
        counts.append(len(encoding.encode(text)))
    return counts

In [None]:
def get_responses(chunks, prompt, model, sample_size, delay):
    """Query the chat model ``sample_size`` times per chunk and collect the replies.

    Relies on the module-level ``client`` (not defined in this cell;
    presumably an ``OpenAI`` client -- confirm) and on ``format_prompt``.

    Args:
        chunks: Iterable of word-pair collections; each becomes one prompt.
        prompt: Instruction text combined with each chunk by ``format_prompt``.
        model: Model name passed to the chat completions endpoint.
        sample_size: Number of completions requested per chunk.
        delay: Seconds to sleep after every API call.

    Returns:
        A list of response content values, ordered chunk by chunk and then
        sample by sample.
    """
    # List to store responses
    responses = []

    # Start timing total processing
    start_time_total = time.time()

    # The context manager closes the progress bar even when an API call
    # raises, so a failed run does not leave a dangling bar in the notebook.
    total_iterations = len(chunks) * sample_size
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        for chunk in chunks:
            # The prompt depends only on the chunk, so build it once per chunk
            formatted_prompt = format_prompt(chunk, prompt)
            messages = [{"role": "user", "content": formatted_prompt}]

            for _ in range(sample_size):
                # Call API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store response content
                responses.append(completion.choices[0].message.content)

                # Update progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
def get_responses_conversational(chunks, prompt, user_content, assistant_content, model, sample_size, delay):
    """Query the chat model with a system prompt plus one example exchange.

    Each request contains four messages: the system ``prompt``, an example
    user turn, an example assistant turn, and the chunk rendered with
    ``str()`` and wrapped in double quotes as the final user turn. Relies
    on the module-level ``client`` (not defined in this cell; presumably an
    ``OpenAI`` client -- confirm).

    Args:
        chunks: Iterable of chunks; each is sent as the final user message.
        prompt: Content of the system message.
        user_content: Content of the example user message.
        assistant_content: Content of the example assistant message.
        model: Model name passed to the chat completions endpoint.
        sample_size: Number of completions requested per chunk.
        delay: Seconds to sleep after every API call.

    Returns:
        A list of response content values, ordered chunk by chunk and then
        sample by sample.
    """
    # List to store responses
    responses = []

    # Start timing total processing
    start_time_total = time.time()

    # The context manager closes the progress bar even when an API call
    # raises, so a failed run does not leave a dangling bar in the notebook.
    total_iterations = len(chunks) * sample_size
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        for chunk in chunks:
            # The conversation depends only on the chunk, so build it once
            messages = [
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
                {"role": "user", "content": f'"{chunk}"'},
                ]

            for _ in range(sample_size):
                # Call API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store response content
                responses.append(completion.choices[0].message.content)

                # Update progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
def get_responses_single(prompt, chunks, model, sample_size, delay):
    """Query the chat model about the first word pair of each chunk.

    Relies on the module-level ``client`` (not defined in this cell;
    presumably an ``OpenAI`` client -- confirm).

    Args:
        prompt: Template string with ``{word1}`` and ``{word2}`` placeholders.
        chunks: Iterable of chunks; element 0 of each chunk is the word pair.
        model: Model name passed to the chat completions endpoint.
        sample_size: Number of completions requested per chunk.
        delay: Seconds to sleep after every API call.

    Returns:
        A list of response content values, ordered chunk by chunk and then
        sample by sample.
    """
    # List to store responses
    responses = []

    # Start timing total processing
    start_time_total = time.time()

    # The context manager closes the progress bar even when an API call
    # raises, so a failed run does not leave a dangling bar in the notebook.
    total_iterations = len(chunks) * sample_size
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        for chunk in chunks:
            if sample_size <= 0:
                # Nothing to request; also avoids touching chunk[0] needlessly
                continue

            # Extract word pair from chunk (only the first pair is used)
            word_pair = chunk[0]

            # The message depends only on the chunk, so build it once per chunk
            formatted_message = prompt.format(word1=word_pair[0], word2=word_pair[1])
            messages = [{"role": "user", "content": formatted_message}]

            for _ in range(sample_size):
                # Call API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store response content
                responses.append(completion.choices[0].message.content)

                # Update progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
# def get_responses_instruct(chunks, prompt, model, sample_size, delay):
#     # List to store responses
#     responses = [] 

#     # Start timing total processing
#     start_time_total = time.time()

#     # Set up progress bar
#     total_iterations = len(chunks) * sample_size
#     pbar = tqdm(total=total_iterations, desc="Processing", unit="chunk")

#     # Collect responses for each chunk and sample
#     for chunk in chunks:
#         for _ in range(sample_size):
#             # Format the prompts
#             formatted_prompt = format_prompt(chunk, prompt)

#             # Call API
#             completion = client.completions.create(
#                 model=model,
#                 prompt=formatted_prompt,
#                 n=1,
#                 max_tokens=3000)

#             # Store response content
#             responses.append(completion.choices[0].text)

#             # Update progress bar after each sample
#             pbar.update(1)

#             # Delay after each API call
#             time.sleep(delay)

#     # Close progress bar
#     pbar.close()

#     # Total processing time
#     end_time_total = time.time()
#     print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

#     return responses

In [None]:
# gpt-3.5-turbo-0125

def process_responses(responses):
    """Parse ``('word1', 'word2', score)`` triples out of raw model responses.

    Newlines are stripped from each response before matching. Spaces inside
    the captured words are removed, and the score is cleaned (surrounding
    quotes dropped, commas turned into dots, remaining non-numeric
    characters removed) before being converted to ``float``.

    Args:
        responses: Iterable of response strings.

    Returns:
        A dict mapping ``(word1, word2)`` to the list of float scores found
        for that pair, in order of appearance across all responses.

    Raises:
        ValueError: If a cleaned score cannot be converted by ``float``.
    """
    # Regex for one (word, word, score) triple; tolerant of missing quotes/commas
    triple_re = re.compile(
        r"\(\s*'([^']+)'?\s*(?:,\s*)?'([^']+\s?[^']*?)'?\s*,\s*(['\"]?[\d,\.]+['\"]?)\s*\)"
    )
    # pattern = r"\(\s*'([^']+)'?\s*,\s*'([^']+\s?[^']*?)'?\s*,\s*(['\"]?[\d,\.]+['\"]?)\s*\)"

    quote_chars = ("'", '"')
    data_dict = {}

    for response in responses:
        # Flatten the response so triples split across lines still match
        flattened = response.replace("\n", "").replace("), ", "),")

        for raw_word1, raw_word2, raw_score in triple_re.findall(flattened):
            # Words that were split by whitespace are glued back together
            key = (raw_word1.replace(" ", ""), raw_word2.replace(" ", ""))

            # Drop the quotes when the score was captured as a quoted string
            score_text = raw_score
            if score_text.startswith(quote_chars) and score_text.endswith(quote_chars):
                score_text = score_text[1:-1]

            # Decimal comma -> decimal point, then strip anything non-numeric
            score_text = re.sub(r'[^\d.]', '', score_text.replace(",", "."))

            data_dict.setdefault(key, []).append(float(score_text))

    return data_dict

In [None]:
def process_responses_categorical(responses):
    """Parse ``('word1', 'word2', 'label')`` triples out of raw model responses.

    Newlines are stripped from each response before matching. Spaces inside
    the captured words are removed; the label is stored as captured.

    Args:
        responses: Iterable of response strings.

    Returns:
        A dict mapping ``(word1, word2)`` to the list of label strings found
        for that pair, in order of appearance across all responses.
    """
    # Regex for one triple; the quotes around each element are optional
    # pattern = r"\(\s*'([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'\s*\)"
    triple_re = re.compile(
        r"\(\s*\'?([^']+?)\'?\s*,\s*\'?([^']+?)\'?\s*,\s*\'?([^']+?)\'?\s*\)"
    )

    data_dict = {}

    for response in responses:
        # Flatten the response so triples split across lines still match
        flattened = response.replace("\n", "").replace("), ", "),")

        for raw_word1, raw_word2, label in triple_re.findall(flattened):
            # Words that were split by whitespace are glued back together
            key = (raw_word1.replace(" ", ""), raw_word2.replace(" ", ""))

            # Start a list on first sight of the pair, then record the label
            data_dict.setdefault(key, []).append(label)

    return data_dict

In [None]:
# def process_responses_instruct(responses):
#     data_dict = {}

#     # Define regex pattern
#     pattern = r"\(\s*'?\"?(\w+)'?\"?\s*,\s*'?\"?(\w+)'?\"?\s*,\s*'?\"?([\d\.]+)'?\"?\s*\)" # gpt-3.5-turbo-instruct

#     # Normalize response to handle inconsistent formatting
#     for response in responses:
#         normalized_response = response.replace("\n", "").replace("), ", "),")
#         matches = re.findall(pattern, normalized_response)

#         # Concatenate words separated by whitespace
#         for word1, word2, score in matches:
#             word1 = word1.replace(" ", "")
#             word2 = word2.replace(" ", "")

#             # Remove quotes if the score is captured as a quoted string
#             if score.startswith(("'", '"')) and score.endswith(("'", '"')):
#                 score = score[1:-1]

#             # Replace comma with dot in the score string to handle decimal numbers
#             score = score.replace(",", ".")
            
#             # Ensure no trailing non-numeric characters
#             score = re.sub(r'[^\d.]', '', score)
#             key = (word1, word2)

#             # Convert score to float and add to dictionary
#             if key not in data_dict:
#                 data_dict[key] = []
#             data_dict[key].append(float(score))

#     return data_dict

In [5]:
def print_duplicate_word_pairs(cleaned_nl_simlex, data_dict):
    """Print the word pairs missing from, and duplicated in, the extracted data.

    Both inputs are reduced to a single ``Combined_Columns`` key of the form
    ``word1_word2``. Two frames are printed: the reference pairs that do not
    appear among the extracted pairs, then the extracted rows whose combined
    key occurs more than once.

    Args:
        cleaned_nl_simlex: DataFrame with ``word1`` and ``word2`` columns;
            it is copied, not modified.
        data_dict: Mapping of ``(word1, word2)`` to scores, converted with
            ``create_dataframe``.
    """
    ### Reference pairs
    # Work on a copy and strip spaces so the keys match the extracted words
    reference = cleaned_nl_simlex.copy()
    for column in ('word1', 'word2'):
        reference[column] = reference[column].replace(" ", "", regex=True)

    # Keep only the combined key column
    reference['Combined_Columns'] = reference['word1'] + '_' + reference['word2']
    reference = reference[['Combined_Columns']]

    ### Extracted pairs
    extracted = create_dataframe(data_dict)
    extracted['Combined_Columns'] = extracted['word1'] + '_' + extracted['word2']
    extracted = extracted[['Combined_Columns']]

    ### Report
    # Reference keys that never showed up in the extracted data
    is_missing = ~reference['Combined_Columns'].isin(extracted['Combined_Columns'])
    print(reference[is_missing])

    # Extracted keys that occur more than once (every occurrence is shown)
    is_duplicate = extracted.duplicated(subset='Combined_Columns', keep=False)
    print(extracted[is_duplicate])

In [None]:
def create_dataframe(data_dict):
    """Convert a ``{(word1, word2): [score, ...]}`` mapping into a wide DataFrame.

    Each key becomes one row; scores are spread over columns named
    ``similarity_score_1`` .. ``similarity_score_k``, where ``k`` is the
    largest number of scores held by any pair. Pairs with fewer scores are
    padded with ``None``.

    Args:
        data_dict: Mapping from a ``(word1, word2)`` tuple to a sequence of scores.

    Returns:
        A ``pandas.DataFrame`` with columns ``word1``, ``word2`` and one
        column per score. An empty mapping yields an empty frame that has
        only the ``word1`` and ``word2`` columns.
    """
    # Determine the max number of scores for any word pair; ``default=0``
    # keeps an empty dict (e.g. nothing could be parsed) from raising.
    max_scores = max((len(scores) for scores in data_dict.values()), default=0)

    # Create DataFrame columns based on the max number of scores collected
    columns = ['word1', 'word2'] + [f'similarity_score_{i+1}' for i in range(max_scores)]
    data_list = []

    # Populate data list for DataFrame creation
    for (word1, word2), scores in data_dict.items():
        scores = list(scores)  # accept tuples and other sequences as well

        # Ensure each row has the same number of elements by filling missing scores with None
        full_scores = scores + [None] * (max_scores - len(scores))
        data_list.append([word1, word2] + full_scores)

    # Create DataFrame
    return pd.DataFrame(data_list, columns=columns)

In [None]:
### Embeddings

In [None]:
def get_embedding_as_numpy_array(word, model):
    """Request an embedding for ``word`` and return it as a NumPy array.

    Relies on the module-level ``client`` (not defined in this cell;
    presumably an ``OpenAI`` client -- confirm).

    Args:
        word: Input passed to the embeddings endpoint.
        model: Embedding model name.

    Returns:
        ``numpy.ndarray`` built from the first embedding in the response.
    """
    embedding_response = client.embeddings.create(input=word, model=model)
    first_vector = embedding_response.data[0].embedding
    return np.array(first_vector)

In [None]:
def calculate_and_scale_cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Despite the function name, no rescaling is applied: the value returned
    is the raw cosine, i.e. the dot product divided by the product of the
    Euclidean norms.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like).

    Returns:
        The cosine similarity of ``a`` and ``b``.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)  # Euclidean norms
    return np.dot(a, b) / norm_product

In [None]:
def get_responses_embeddings(word_pairs, model, delay):
    """Compute the embedding cosine similarity for every word pair.

    For each pair, both words are embedded with
    ``get_embedding_as_numpy_array`` and compared with
    ``calculate_and_scale_cosine_similarity``.

    Args:
        word_pairs: Sequence of ``(word1, word2)`` pairs.
        model: Embedding model name forwarded to the embedding helper.
        delay: Seconds to sleep after each pair has been processed.

    Returns:
        A list of ``(word1, word2, cosine_similarity)`` tuples in input order.
    """
    # List to store results
    response = []

    # Start timing total processing
    start_time_total = time.time()

    # The context manager closes the progress bar even when an API call
    # raises, so a failed run does not leave a dangling bar in the notebook.
    total_iterations = len(word_pairs)
    with tqdm(total=total_iterations, desc="Calculating Cosine Similarities", unit="pair") as pbar:
        for word1, word2 in word_pairs:

            # Get word pair embeddings
            vec_embedding1 = get_embedding_as_numpy_array(word1, model)
            vec_embedding2 = get_embedding_as_numpy_array(word2, model)

            # Calculate cosine similarity score between embeddings
            cosine_similarity = calculate_and_scale_cosine_similarity(vec_embedding1, vec_embedding2)

            # Append results to response list
            response.append((word1, word2, cosine_similarity))

            # Update progress bar
            pbar.update(1)

            # Delay after each pair's API calls
            time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return response

In [None]:
def process_responses_embeddings(response):
    """Turn ``(word1, word2, similarity)`` tuples into a DataFrame.

    Similarities are rounded to two decimals.

    Args:
        response: Iterable of ``(word1, word2, similarity)`` tuples.

    Returns:
        A ``pandas.DataFrame`` with columns ``word1``, ``word2`` and
        ``similarity_score``; empty (with those columns) for empty input.
    """
    columns = ['word1', 'word2', 'similarity_score']

    # Collect all rows first and build the frame once: ``DataFrame.append``
    # was removed in pandas 2.0, and appending row by row copies the whole
    # frame on every iteration.
    rows = [
        (word1, word2, round(similarity, 2))  # round 2 decimals
        for word1, word2, similarity in response
    ]

    return pd.DataFrame(rows, columns=columns)