<a href="https://colab.research.google.com/github/deckerkrogh/semeval-2024-10/blob/main/semeval_data_analysis_heuristic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt

In [77]:
#drive.mount('/content/drive')

# Publicly hosted SemEval data:
train_1_url = 'https://raw.githubusercontent.com/deckerkrogh/nlp243_data/main/datasets/task1_train.json'
train_2_url = 'https://raw.githubusercontent.com/deckerkrogh/nlp243_data/main/datasets/task2_train.json'
train_3_url = 'https://raw.githubusercontent.com/deckerkrogh/nlp243_data/main/datasets/task3_train.json'
train_2_data = pd.read_json(train_2_url)
train_3_data = pd.read_json(train_3_url)

In [78]:
#train_3_data.head()
train_2_data.head()

Unnamed: 0,episode,speakers,utterances,triggers,emotions
0,kavi.wmv,"[rosesh, sahil, indu, sahil, maya, monisha, sa...","[i hate him. i hate him mom., dad aap kyu eisa...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[anger, neutral, joy, neutral, anger, anger, n..."
1,finale.avi,"[maya, indu, maya, indu, maya, indu, maya, ind...",[indravardhan you are the limit hadd karte ho ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[disgust, anger, anger, neutral, anger, neutra..."
2,bachelor.wmv,"[indu, sahil, indu, sahil, rosesh, indu, sahil]","[ha rosesh, mila cockroach? hahahaha, are rose...","[0, 0, 0, 0, 1, 0, 0]","[neutral, neutral, anger, neutral, neutral, jo..."
3,lease1.wmv,"[monisha, sahil, monisha, sahil, monisha, sahi...","[theek se wajan karana, monisha, abhi ayi. yeh...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[neutral, neutral, anger, neutral, neutral, ne..."
4,sahillovestory.wmv,"[sahil, rosesh, sahil, rosesh, sahil, rosesh]",[mom aap aa rahi haina! so come fast i am wait...,"[0, 0, 0, 0, 1.0, 0]","[neutral, neutral, contempt, neutral, neutral,..."


In [103]:
def create_x_y(df):
    # Creates a list of feature dataframes and list of corresponding labels
    # Each element in both lists represents a conversation ("episode")

    X = []
    y = []

    for i, episode in df.iterrows():
        # Note: the emotion flip always occurs at the last utterance

        # Don't include triggers with NaN (7 of the elements in the training data)
        if None in episode["triggers"]:
            continue
        triggers = [int(float(t)) for t in episode['triggers']]
        y.append(np.array(triggers))

        episode_df = pd.DataFrame({
            'utterance': episode['utterances'],
            'emotions': episode['emotions'],
            #'triggers': episode['triggers']
            #'emotion_flip': last_flip_i
        })

        X.append(episode_df)

    return X, y

train_X, train_y = create_x_y(train_2_data)

ValueError: ignored

In [None]:
print(train_3_data["emotions"])

In [98]:
# NOTE: 'contempt' only used in MaSaC dataset
emotions = ['neutral', 'joy', 'fear', 'disgust', 'surprise', 'sadness', 'anger', 'contempt']

emotion_distrib = {e: 0 for e in emotions}
prev_trigger_distrib = {e: 0 for e in emotions}
self_trigger_distrib = {e: 0 for e in emotions}

num_self_trigger = 0
num_prev_trigger = 0
num_both_trigger = 0
for X, y in zip(train_X, train_y):
    target_emotion = X['emotions'].tolist()[-1]
    emotion_distrib[target_emotion] += 1
    print(type(y[-2]))
    if y[-1] == 1:
        num_self_trigger += 1
        self_trigger_distrib[target_emotion] += 1
    if len(y) <= 1:
        continue
    if y[-2] == 1:
        print('here')
        num_prev_trigger += 1
        prev_trigger_distrib[target_emotion] += 1
    if y[-1] == 1.0 and y[-2] == 1.0:
        num_both_trigger += 1
    break

prev_trigger_distrib = {e: f / emotion_distrib[e] for e, f in prev_trigger_distrib.items()}
self_trigger_distrib = {e: f / emotion_distrib[e] for e, f in self_trigger_distrib.items()}
print(f"Percent self trigger: {num_self_trigger/len(train_y)}")
print(f"Percent prev trigger: {num_prev_trigger/len(train_y)}")
print(f"Percent both self and prev trigger: {num_both_trigger/len(train_y)}")

fig, ax = plt.subplots()
ax.bar(*zip(*prev_trigger_distrib.items()))
ax.bar(*zip(*self_trigger_distrib.items()))
ax.legend(['Previous Trigger','Self Trigger'],loc='upper left')
ax.set_ylabel('Frequency')
ax.set_title('Trigger distribution over emotions in MaSaC data')
print(f'Previous trigger distribution across emotions: {prev_trigger_distrib}')
print(f'Self trigger distribution across emotions: {self_trigger_distrib}')
plt.show()


<class 'numpy.str_'>


ZeroDivisionError: ignored

In [23]:

def self_trigger_heur(episode):
    a = np.zeros(len(episode))
    a[-1] = 1.0
    return a

def prev_trigger_heur(episode):
    a = np.zeros(len(episode))
    a[-2] = 1.0
    return a

def eval(y_pred, y):
    scores = []
    for a,b in zip(y_pred,y):
        scores += mean_squared_error(a,b)
    scores = [f1_score(a, b) for a, b in zip(y_pred, y)]
    score = sum(scores) / len(scores)
    return score

y_pred = [prev_trigger_heur(episode) for episode in train_X]
print(f'score: {eval(y_pred, train_y)}')

[0.0, 0.0, 0.0, 0.6666666666666666, 1.0, 1.0, 1.0, 1.0, 0.5, 0.0]
score: 0.5765821904719534
