In [None]:
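# Locate the pun word using word embeddings and cosine similarity.
# NOTE(review): the original note said the candidate with the highest similarity
# is chosen, but find_pun_location keeps the pair with the LOWEST similarity --
# confirm which is intended.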

import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import math
from collections import Counter
# Fetch the NLTK resources this cell relies on: the Punkt tokenizer models
# (nltk.word_tokenize is called in find_pun_location) and the stop-word lists.
# The downloads must run before stopwords.words() is read below.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
# English stop words as a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))


# IDF
def calculate_idf(corpus, word):
    """Return the smoothed inverse document frequency of ``word`` in ``corpus``.

    Computes ``log(N / (df + 1))`` where ``N`` is the number of documents and
    ``df`` is the number of documents for which ``word in doc`` is true. The
    ``+ 1`` keeps the denominator non-zero for unseen words; as a consequence
    the result is negative when the word occurs in every document.

    Args:
        corpus: Sized collection of documents; each document must support the
            ``in`` operator (presumably token lists -- for plain strings
            ``in`` is a substring test).
        word: Term to score.

    Returns:
        The IDF as a float, or ``0.0`` for an empty corpus, where the
        logarithm would otherwise be undefined.
    """
    total_docs = len(corpus)
    if total_docs == 0:
        # math.log(0) raises ValueError; an empty corpus carries no information.
        return 0.0

    # Number of documents that contain the word.
    doc_count = sum(1 for doc in corpus if word in doc)
    return math.log(total_docs / (doc_count + 1))

# NPMI
def calculate_NPMI(word1, word2, words, span_size=20):
    """Estimate the normalized pointwise mutual information of two tokens.

    NPMI is ``pmi(x, y) / -log p(x, y)`` with
    ``pmi(x, y) = log(p(x, y) / (p(x) * p(y)))``.

    Probabilities are estimated from ``words`` alone:

    * ``p(x)`` and ``p(y)`` are relative frequencies of the tokens.
    * ``p(x, y)`` is the number of occurrences of ``word2`` within
      ``span_size`` positions of the FIRST occurrence of ``word1``, divided by
      ``len(words)``. Because only one window is inspected, the value is a
      rough estimate and is not guaranteed to stay inside ``[-1, 1]``.

    Args:
        word1: Anchor token; the co-occurrence window is centred on its first
            occurrence.
        word2: Token counted inside the window.
        words: Token sequence supporting ``index`` and ``count`` (list/tuple).
        span_size: Half-width of the co-occurrence window, in tokens.

    Returns:
        The NPMI estimate as a float. ``-1.0`` when the tokens never co-occur
        in the window (the limit of NPMI as ``p(x, y) -> 0``) and ``1.0`` when
        ``p(x, y) == 1`` (the limit for complete co-occurrence, where the
        formula itself would be ``0 / 0``).

    Raises:
        ValueError: If ``word1`` or ``word2`` does not occur in ``words``.
    """
    total = len(words)

    # list.index raises ValueError when word1 is absent (or words is empty).
    first_x = words.index(word1)
    if word2 not in words:
        raise ValueError(f"{word2!r} is not in list")

    p_x = words.count(word1) / total
    p_y = words.count(word2) / total

    # Count word2 inside the window around the first occurrence of word1.
    window_start = max(0, first_x - span_size)
    window_stop = min(total, first_x + span_size + 1)
    hits = sum(1 for i in range(window_start, window_stop) if words[i] == word2)
    p_xy = hits / total

    if p_xy == 0:
        # log(0) is undefined; NPMI tends to -1 when the pair never co-occurs.
        return -1.0
    if p_xy == 1:
        # -log(1) == 0 would divide by zero; complete co-occurrence maps to 1.
        return 1.0

    pmi = math.log(p_xy / (p_x * p_y))
    return pmi / -math.log(p_xy)

def find_pun_location(s):
    """Guess which token of sentence ``s`` is the pun word.

    The sentence is lower-cased, tokenized with ``nltk.word_tokenize`` and
    stripped of English stop words. Each remaining token is given a vector,
    every pair of token positions is compared by cosine similarity, and the
    later token of the least-similar pair is returned.

    NOTE: the vectors are placeholders drawn from ``np.random.rand``, so the
    answer changes from call to call unless NumPy's global random state is
    seeded. Swap in real pretrained embeddings for meaningful results.

    Args:
        s: Sentence text.

    Returns:
        The selected token (lower-cased), or ``None`` when no token survives
        stop-word removal.
    """
    tokens = nltk.word_tokenize(s.lower())
    t = [word for word in tokens if word not in stop_words]

    # Placeholder embeddings (dimension 300). One vector is drawn per token
    # occurrence; a repeated token keeps the last vector drawn for it.
    embeddings = {}
    for word in t:
        embeddings[word] = np.random.rand(300)

    # k: index of the later token in the least-similar pair seen so far;
    # b: that pair's cosine similarity.
    k, b = -1, float('inf')

    for i in range(len(t)):
        ei = embeddings[t[i]]

        for j in range(i + 1, len(t)):
            ej = embeddings[t[j]]
            d = cosine_similarity([ei], [ej])[0][0]

            # Strict '<' keeps the first pair encountered on ties.
            if d < b:
                b, k = d, j

    if k > -1:
        return t[k]

    # Fallback when no pair was selected above (in practice: fewer than two
    # tokens remain). Rank every token by its summed NPMI with all the others.
    # `t` is already free of stop words, so no further filtering is needed.
    best_pun_word = None
    # Start from -inf: a sum of NPMI values may legitimately fall below -1.
    best_score = float('-inf')

    for i in range(len(t)):
        score = 0
        for j in range(len(t)):
            if i != j:
                score += calculate_NPMI(t[i], t[j], t)
        if score > best_score:
            best_score = score
            best_pun_word = t[i]

    return best_pun_word


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:


# Mount Google Drive in the Colab runtime at /content/drive so the dataset
# path used further down can be opened like a local file.
from google.colab import drive
drive.mount('/content/drive')

import json

# Path to the JSON dataset on the mounted Drive; point this at your own copy.
json_file_path = '/content/drive/MyDrive/Colab Notebooks/semeval-task3-homo.json'

sentences = []
puns = []
predicted = []

# JSON text is UTF-8 by specification; do not depend on the platform default encoding.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Collect the input sentence and the reference pun word of every record.
# .get() yields None instead of raising when a key is missing.
for item in data:
    sentence = item.get('sentence')
    pun = item.get('src')
    sentences.append(sentence)
    puns.append(pun)

# Predict one pun word per sentence.
for sentence in sentences:
    pun_word = find_pun_location(sentence)
    predicted.append(pun_word)

total_puns = len(puns)
total_predicted = len(predicted)

if total_puns != total_predicted:
    print("Error: Different length")
elif total_puns == 0:
    # Avoid ZeroDivisionError when the dataset is empty.
    print("Accuracy: n/a (no examples loaded)")
else:
    # Exact-match accuracy between reference and predicted pun words.
    correct_count = sum(1 for gold, guess in zip(puns, predicted) if gold == guess)
    accuracy = (correct_count / total_puns) * 100
    print(f"Accuracy: {accuracy:.2f}%")


Mounted at /content/drive
Accuracy: 20.42%
