In [1]:
import json
import numpy as np
from scipy.special import softmax
from scipy.special import digamma

In [2]:
def topk(arr, k):
    """
    Return the k largest values of a 1-D array together with their indices.

    The k results are NOT sorted among themselves (np.argpartition only
    guarantees that they are the k largest).

    Parameters:
    arr (array-like): 1-D sequence of comparable numbers.
    k (int): Number of elements to select, 0 <= k <= len(arr).

    Returns:
    tuple: (values, indices), both numpy arrays of length k.

    Raises:
    ValueError: If k is negative or larger than the number of elements.
    """
    # Accept plain lists as well; fancy indexing below needs an ndarray.
    arr = np.asarray(arr)
    if k < 0 or k > arr.size:
        raise ValueError(f"k must be between 0 and {arr.size}, got {k}.")
    if k == 0:
        # Without this guard `[-0:]` would select the whole array instead of nothing.
        return arr[:0], np.empty(0, dtype=np.intp)
    indices = np.argpartition(arr, -k)[-k:]
    values = arr[indices]
    return values, indices

In [3]:
def process_emotion_data(file_path):
    """
    Loads emotion records from a JSON Lines file (one JSON object per line).

    Each record must provide the keys 'ID', 'Tweet', 'Emotion_Vector' and
    'Emotion_Logits'; the last two are mappings from emotion name to value.
    Blank lines are skipped.

    Parameters:
    file_path (str): Path to the JSON Lines file.

    Returns:
    list: One dict per record with the keys 'ID', 'Tweet', 'Emotion_Vector',
        'Emotion_Logits' (numpy arrays ordered by sorted emotion name) and
        'Emotion_Prob' (softmax of the logits).
    """
    processed_data = []

    # JSON text is UTF-8; being explicit keeps tweet decoding independent of
    # the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            # Each mapping is ordered by its own sorted keys, so the two arrays
            # line up index-by-index only when both mappings share the same keys.
            emotion_vector = np.array([data['Emotion_Vector'][emotion] for emotion in sorted(data['Emotion_Vector'])])
            emotion_logits = np.array([data['Emotion_Logits'][emotion] for emotion in sorted(data['Emotion_Logits'])])
            emotion_prob = softmax(emotion_logits)

            # Keep the original ID and Tweet alongside the array forms
            processed_entry = {
                'ID': data['ID'],
                'Tweet': data['Tweet'],
                'Emotion_Vector': emotion_vector,
                'Emotion_Logits': emotion_logits,
                'Emotion_Prob': emotion_prob
            }
            processed_data.append(processed_entry)

    return processed_data


In [4]:
def get_u(mode="prob", k=None):
    """
    Builds a scoring function ``u(logits)`` for the requested mode.

    Parameters:
    mode (str): One of "prob", "entropy" or "LogTokU".
    k (int, optional): Number of largest logits used by the "LogTokU" mode.
        Required (>= 1) for that mode; ignored by "prob" and "entropy".

    Returns:
    callable: Function taking a 1-D array of logits and returning a score.
        "prob" and "entropy" return a scalar; "LogTokU" returns an array of
        shape (1,).

    Raises:
    ValueError: If mode is not supported, or if mode is "LogTokU" and k is
        not a positive integer.
    """
    if mode == "prob":
        def u(logits):
            logits = softmax(logits)
            top_k = 1
            if len(logits) < top_k:
                raise ValueError("Logits array length is less than top_k.")
            top_values, _ = topk(logits, top_k)
            # With top_k == 1 this is 1 / (p_max + 1); the clip at 0 is a
            # no-op here because softmax outputs are non-negative.
            mean_scores = top_k / (np.sum(np.maximum(0, top_values)) + top_k)
            return mean_scores
        return u

    elif mode == "entropy":
        def u(logits):
            probs = softmax(logits)
            # The small constant keeps log() finite when a probability underflows to 0.
            entropy = -np.sum(probs * np.log(probs + 1e-10))
            return entropy
        return u

    elif mode == "LogTokU":
        # Validate k up front: with k=None the comparison inside u() would fail
        # with an unrelated-looking TypeError, and k=0 would make the slice
        # `[-0:]` select every logit.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"mode 'LogTokU' requires a positive integer k, got {k!r}.")

        def cal_au(alpha):
            # Add a leading axis so the reductions below run over axis=1;
            # the result therefore has shape (1,).
            alpha = np.array([alpha])
            alpha_0 = alpha.sum(axis=1, keepdims=True)
            psi_alpha_k_plus_1 = digamma(alpha + 1)
            psi_alpha_0_plus_1 = digamma(alpha_0 + 1)
            result = - (alpha / alpha_0) * (psi_alpha_k_plus_1 - psi_alpha_0_plus_1)
            result = result.sum(axis=1)
            return result

        def u(logits):
            top_k = k
            if len(logits) < top_k:
                raise ValueError("Logits array length is less than top_k.")
            # k largest raw logits (unsorted among themselves).
            top_values = np.partition(logits, -top_k)[-top_k:]
            # NOTE(review): cal_au receives the raw top-k logits while
            # mean_scores clips them at 0; non-positive logits could make
            # alpha_0 zero or hit a digamma pole -- confirm logits are
            # expected to be positive here.
            au = cal_au(top_values)
            mean_scores = top_k / (np.sum(np.maximum(0, top_values)) + top_k)
            return mean_scores / au
        return u

    else:
        raise ValueError(f"Unsupported mode: {mode}")


In [5]:
def CalSecond(label_list, pred_sec, accumu):
    """
    Update a running total based on the labels of the two given indices.

    The total only moves when the label at pred_sec[0] is truthy: it goes up
    by one if the label at pred_sec[1] is truthy as well, and down by one if
    it is not. When the first label is falsy the total is returned unchanged.

    Parameters:
    label_list (sequence): Labels indexable by the entries of pred_sec.
    pred_sec (sequence): At least two indices into label_list.
    accumu (int): Running total before this update.

    Returns:
    int: The updated running total.
    """
    first_hit = label_list[pred_sec[0]]
    second_hit = label_list[pred_sec[1]]

    if not first_hit:
        # A falsy first label never changes the total, whatever the second is.
        return accumu
    return accumu + 1 if second_hit else accumu - 1

In [6]:
def Monopoly(label_list, pred_list):
    """
    Score a set of predicted indices against the labels.

    Every index whose label is truthy earns one point and every other index
    costs one point; the final score is never negative.

    Parameters:
    label_list (sequence): Labels indexable by the entries of pred_list.
    pred_list (iterable): Indices into label_list.

    Returns:
    int: Net score, floored at 0.
    """
    net_points = sum(1 if label_list[idx] else -1 for idx in pred_list)
    return max(0, net_points)

In [7]:
def sort_processed_data_by_total_u(processed_data, u):
    '''Return the entries as a new list ordered by ascending u(entry['Emotion_Logits']).

    u is evaluated once per entry; entries with equal u values keep their
    original relative order. The input is left untouched.
    '''
    return sorted(processed_data, key=lambda entry: u(entry['Emotion_Logits']))

In [8]:
def dynamic_select(logits, vector, sorted_indices, t_opti, u):
    """
    Score one sample, choosing how many top predictions to use from its u value.

    Parameters:
    logits: Logits of the sample, passed to u.
    vector: Labels of the sample, indexable by the entries of sorted_indices.
    sorted_indices: Predicted indices ordered from most to least preferred.
    t_opti: Threshold on u.
    u (callable): Scoring function applied to logits.

    Returns:
    int: Monopoly score of the first prediction when u(logits) > t_opti,
        otherwise of the first two predictions.
    """
    n_picks = 1 if u(logits) > t_opti else 2
    return Monopoly(vector, sorted_indices[:n_picks])

In [9]:
def calculate_t_opti(sorted_processed_data, u):
    """
    Pick the threshold on u from the running CalSecond total.

    Walks the entries in the given order, feeding each entry's two
    highest-logit indices to CalSecond, and records the running total after
    every entry. The threshold is u evaluated on the entry where that running
    total first reaches its maximum.

    Parameters:
    sorted_processed_data (list): Entries with 'Emotion_Logits' and
        'Emotion_Vector'; the name suggests they are pre-sorted by u.
    u (callable): Scoring function applied to an entry's logits.

    Returns:
    The value of u for the selected entry.

    Raises:
    ValueError: If sorted_processed_data is empty (raised by max()).
    """
    running_total = 0
    running_totals = []
    for entry in sorted_processed_data:
        # argsort is ascending, so reverse it and keep the two largest logits.
        top_two = np.argsort(entry['Emotion_Logits'])[::-1][:2]
        running_total = CalSecond(entry['Emotion_Vector'], top_two, running_total)
        running_totals.append(running_total)

    # list.index returns the earliest position holding the peak value.
    peak_position = running_totals.index(max(running_totals))
    return u(sorted_processed_data[peak_position]['Emotion_Logits'])

In [11]:
# Choose the model, look up where its outputs are stored, then load them.
model_name = "llama2-7b"
path_dict = {"llama2-7b": './outputs/llama2_7b.jsonl'}
file_path = path_dict[model_name]
processed_data = process_emotion_data(file_path)

In [12]:
# Define the dictionary for mode and k values
mode_k_dict = {
    "prob": None,
    "entropy": None,
    "LogTokU": 2
}

# Score rates are reported relative to the number of entries.
baseline_score = len(processed_data)
if baseline_score == 0:
    # Fail early with a clear message instead of a ZeroDivisionError below.
    raise ValueError("processed_data is empty; cannot compute score rates.")

# Rank each entry's classes by logit (descending) once. The ranking does not
# depend on the uncertainty mode, so every strategy below reuses it.
ranked_indices = [np.argsort(entry['Emotion_Logits'])[::-1] for entry in processed_data]

# Greedy decoding scores only the top-1 index; top-2 always scores the two best.
first_max_true_count = 0
second_max_true_count = 0
for entry, sorted_indices in zip(processed_data, ranked_indices):
    first_max_true_count += Monopoly(entry['Emotion_Vector'], sorted_indices[:1])
    second_max_true_count += Monopoly(entry['Emotion_Vector'], sorted_indices[:2])

greedy_score = first_max_true_count
greedy_score_rate = (greedy_score / baseline_score) * 100

top2_score = second_max_true_count
top2_score_rate = (top2_score / baseline_score) * 100

# The two fixed strategies lead the report, followed by one row per mode.
result = [
    ['Greedy decoding', greedy_score, greedy_score_rate],
    ['Top-2 Sampling', top2_score, top2_score_rate],
]

# Dynamic strategies: per entry, u decides between the top-1 and top-2 picks.
# NOTE: t_opti is chosen on processed_data and then scored on the same
# processed_data, so these rows are not held-out estimates.
for mode, k in mode_k_dict.items():
    dynamic_baseline_score = 0
    u_function = get_u(mode, k)
    sorted_processed_data = sort_processed_data_by_total_u(processed_data, u_function)
    t_opti = calculate_t_opti(sorted_processed_data, u_function)

    for entry, sorted_indices in zip(processed_data, ranked_indices):
        dynamic_baseline_score += dynamic_select(entry['Emotion_Logits'], entry['Emotion_Vector'], sorted_indices, t_opti, u_function)

    dynamic_score_rate = (dynamic_baseline_score / baseline_score) * 100
    result.append([mode, dynamic_baseline_score, dynamic_score_rate])

# Print the results in the required format
print(f"{'Model Name':<15} {'Score':<10} {'Score Rate (%)':<15}")
print("-" * 40)
# Use a dedicated loop variable: reusing `model_name` here would overwrite the
# notebook-level `model_name` that selects which model's outputs are loaded.
for method_name, score, score_rate in result:
    print(f"{method_name:<15} {score:<10} {score_rate:.4f}%")


Model Name      Score      Score Rate (%) 
----------------------------------------
Greedy decoding 2525       77.4778%
Top-2 Sampling  2520       77.3243%
prob            2558       78.4903%
entropy         2585       79.3188%
LogTokU         2831       86.8671%
