In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
import numpy as np

In [None]:
import json
import requests

# Download the sarcasm-headlines dataset and load it into a DataFrame.
# The file is JSON Lines: one JSON object per non-blank line.
DATASET_URL = 'https://github.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/raw/master/Sarcasm_Headlines_Dataset.json'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(DATASET_URL, timeout=60)
# Fail loudly on an HTTP error instead of trying to JSON-parse an error page.
response.raise_for_status()
file = response.content.decode("utf-8")

# strip() also skips whitespace-only lines (e.g. a stray "\r"), which would
# otherwise reach json.loads and raise.
dataset = pd.DataFrame(
    [json.loads(line) for line in file.split("\n") if line.strip()]
)

In [None]:
# Peek at 5 random rows of the dataset.
dataset.sample(5)

In [None]:
# Total number of rows (headlines) in the dataset.
len(dataset)

# Ejemplos

In [None]:
# Ten random headlines labelled as sarcastic (is_sarcastic == 1).
dataset.loc[dataset['is_sarcastic'] == 1, 'headline'].sample(10).tolist()

In [None]:
# Ten random headlines labelled as not sarcastic (is_sarcastic == 0).
dataset.loc[dataset['is_sarcastic'] == 0, 'headline'].sample(10).tolist()

In [None]:
# Shuffle with a fixed seed so the train/test split is reproducible across
# kernel restarts. The shuffled copy gets its own name (instead of overwriting
# `dataset`) so that re-running this cell always yields the same split.
SPLIT_SEED = 42
dataset_shuffled = dataset.sample(frac=1, random_state=SPLIT_SEED)

# Hold out roughly 10% for testing: the first 9 * (len // 10) rows are the
# training set, everything after that is the test set.
n_train = 9 * (len(dataset_shuffled) // 10)
train = dataset_shuffled.iloc[:n_train]
test = dataset_shuffled.iloc[n_train:]

# Cargo embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim import downloader

# downloader.load() downloads the vectors if needed and returns them already
# loaded as KeyedVectors, so its return value is used directly. The previous
# code discarded it and then tried to read 'GoogleNews-vectors-negative300.bin'
# from the working directory, a file the downloader does not create there.
model = downloader.load('word2vec-google-news-300')

Nos quedamos solo con las palabras útiles en un set

In [None]:
import re

# A token is either a run of word characters (optionally followed by an
# apostrophe and more word characters, e.g. "don't") or a single character
# that is neither a word character nor whitespace.
TOKENIZER_REGEX = r"\w+(?:'\w+)?|[^\w\s]"

# Vocabulary: every distinct lower-cased token found in the training headlines.
palabras_utiles = {
  token
  for headline in train['headline'].tolist()
  for token in re.findall(TOKENIZER_REGEX, headline.lower())
}

In [None]:
# Number of distinct tokens in the training vocabulary.
len(palabras_utiles)

Extraemos los embeddings

In [None]:
# Collect the pretrained vectors of the words that appear in the training
# vocabulary. key_to_index maps each kept word to its position in embs.
key_to_index = {}
embs = []

pretrained_words = model.index_to_key
for position in range(len(model)):
  word = pretrained_words[position]
  # To save RAM, skip every word that is not part of the useful vocabulary.
  if word not in palabras_utiles:
    continue
  key_to_index[word] = len(embs)
  embs.append(model[word])

In [None]:
# Turn the list of per-word vectors into a single NumPy array.
embs = np.asarray(embs)

In [None]:
# Shape of the kept-embeddings array.
embs.shape

In [None]:
# Save RAM: drop the references to the full pretrained model and to the
# vocabulary set, neither of which is needed from here on.
model = palabras_utiles = None

Creamos un vector para la palabra desconocida usando el promedio de todos los embeddings

In [None]:
# Unknown-word vector: the mean of all kept embeddings, taken along axis 0.
unknown_emb = embs.mean(axis=0)

In [None]:
# Shape of the unknown-word vector.
unknown_emb.shape

Creamos la matriz de embeddings: el índice 0 contiene un vector de 0s que indica que no hay palabra (padding), el índice 1 contiene el vector de palabra desconocida y los índices restantes contienen las palabras del embedding.

In [None]:
# Build the final embedding matrix:
#   row 0  -> all-zero vector, meaning "no word" (padding),
#   row 1  -> the unknown-word vector,
#   row 2+ -> the pretrained vectors kept above.
# The width and dtype are taken from embs itself rather than a hard-coded 300
# and NumPy's default float64, so the matrix is not upcast to a wider dtype.
# NOTE: this cell rebinds embs to the extended matrix; run it once per session.
emb_dim = embs.shape[1]
embs = np.concatenate(
    [
        np.zeros((1, emb_dim), dtype=embs.dtype),
        unknown_emb.reshape((1, emb_dim)).astype(embs.dtype, copy=False),
        embs,
    ],
    axis=0,
)
embs.shape

Tokenizamos todos los textos y transformamos cada palabra en índice de la matriz

In [None]:
# Encode every training headline as a list of embedding-matrix row indices.
# Known words are shifted by 2 because rows 0 and 1 hold the zero vector and
# the unknown-word vector; any other token maps to row 1 (-1 + 2).
X_train = [
  [key_to_index.get(token, -1) + 2
   for token in re.findall(TOKENIZER_REGEX, headline.lower())]
  for headline in train['headline'].tolist()
]

In [None]:
# Encode every test headline as a list of embedding-matrix row indices.
# Known words are shifted by 2 because rows 0 and 1 hold the zero vector and
# the unknown-word vector; any other token maps to row 1 (-1 + 2).
X_test = [
  [key_to_index.get(token, -1) + 2
   for token in re.findall(TOKENIZER_REGEX, headline.lower())]
  for headline in test['headline'].tolist()
]

In [None]:
# Show the first training headline followed by its encoded index sequence.
print(train['headline'].iloc[0], X_train[0], sep="\n")

Paddeamos los textos para que tengan todos el mismo largo

In [None]:
# 80th percentile of the training-sequence lengths (in tokens).
# NOTE(review): presumably what motivates the padding length used below; confirm.
np.quantile([len(x) for x in X_train], 0.8)

In [None]:
from tensorflow.keras.utils import pad_sequences

In [None]:
# Bring every sequence to exactly 15 tokens: shorter ones are padded at the
# end with index 0, longer ones are truncated at the end.
PAD_OPTIONS = dict(maxlen=15, padding='post', truncating='post', value=0)
X_train = pad_sequences(X_train, **PAD_OPTIONS)
X_test = pad_sequences(X_test, **PAD_OPTIONS)

X_train.shape, X_test.shape

In [None]:
# Labels as column vectors of shape (n_samples, 1).
y_train = np.reshape(train['is_sarcastic'].values, (-1, 1))
y_test = np.reshape(test['is_sarcastic'].values, (-1, 1))

y_train.shape, y_test.shape

In [None]:
from tensorflow.keras.layers import Input, Dense, GRU, Concatenate, Embedding, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Binary sarcasm classifier: frozen embeddings -> masking -> two stacked GRUs -> sigmoid.
# The input is a sequence of 15 integer indices into the embedding matrix.
inp = Input((15,), dtype='int32')
# Initialise the embedding layer from embs and keep it frozen (not trainable)
emb_layer = Embedding(input_dim=embs.shape[0], output_dim=embs.shape[1], weights=[embs], trainable=False)(inp)
# Mask timesteps whose embedding vector is all 0s, so the RNN stops reading at the padding
masking = Masking(0.0)(emb_layer)
# The first GRU returns all of its internal states, as a new sequence
gru1 = GRU(64, return_sequences=True)(masking)
# The second GRU only returns its last internal state
gru2 = GRU(16)(gru1)
# Single sigmoid unit: one output value in (0, 1) per headline
out = Dense(1, activation='sigmoid')(gru2)
model = Model(inputs=inp, outputs=out)

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Binary cross-entropy loss with Adam (learning rate 0.001), tracking accuracy.
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
# Stop once val_loss has not improved for 2 epochs and restore the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# Train for at most 10 epochs; a tenth of the training data is set aside for validation.
model.fit(X_train, y_train, epochs=10, validation_split=0.1, callbacks=[early_stop])

In [None]:
# Threshold the predicted scores at 0.5 to obtain hard 0/1 labels.
preds = (model.predict(X_test) > 0.5).astype('int')

In [None]:
# Accuracy of the thresholded predictions on the held-out test set.
accuracy_score(y_test, preds)

# Verdaderos positivos

In [None]:
# Ten random test headlines predicted sarcastic that really are sarcastic.
true_positive_rows = np.flatnonzero((preds == 1) & (y_test == 1))
test['headline'].iloc[true_positive_rows].sample(10).tolist()

# Verdaderos negativos

In [None]:
# Ten random test headlines predicted non-sarcastic that really are non-sarcastic.
true_negative_rows = np.flatnonzero((preds == 0) & (y_test == 0))
test['headline'].iloc[true_negative_rows].sample(10).tolist()

# Falsos positivos

In [None]:
# Ten random test headlines predicted sarcastic that are actually non-sarcastic.
false_positive_rows = np.flatnonzero((preds == 1) & (y_test == 0))
test['headline'].iloc[false_positive_rows].sample(10).tolist()

# Falsos negativos

In [None]:
# Ten random test headlines predicted non-sarcastic that are actually sarcastic.
false_negative_rows = np.flatnonzero((preds == 0) & (y_test == 1))
test['headline'].iloc[false_negative_rows].sample(10).tolist()

# Ejemplo a mano

Tomamos algunos titulares de "Reductress", un medio que publica sarcásticamente como si fuera una revista "para mujeres"

In [None]:
# Hand-picked headlines to score with the trained model
# (described in the notebook text as taken from Reductress, a satirical outlet).
EJEMPLOS = [
    "How I Live in the Moment by Making Sure No One Has Texted Me Every Five Minutes",
    "Woman Insists She Only Drinks and Drives Socially",
    "Future Trans Person Really Enjoying Halloween",
    "Woman Gets Awesome Revenge on High School Bullies by Becoming Associate Account Manager",
    "Adult Woman Starting to Realize Why Mom Was Like That",
    "Woman Who Deleted One Social Media Platform Just Redistributing That Time Between All Others"
    ]

In [None]:
# Encode the example headlines as embedding-matrix row indices (known words
# shifted by 2 for the zero and unknown-word rows, anything else -> row 1),
# then pad/truncate each sequence at the end to 15 tokens.
X_manual = pad_sequences(
  [
    [key_to_index.get(token, -1) + 2
     for token in re.findall(TOKENIZER_REGEX, headline.lower())]
    for headline in EJEMPLOS
  ],
  maxlen=15, padding='post', truncating='post', value=0,
)

In [None]:
# Print each example headline next to the score the model assigns to it.
manual_scores = model.predict(X_manual)
for headline, row in zip(EJEMPLOS, manual_scores):
  print(f"\"{headline}\" -> {row[0]}")

Tomamos titulares de la CNN de hoy

In [None]:
# Second batch of hand-picked headlines (described in the notebook text as
# CNN headlines). Note that this rebinds EJEMPLOS, replacing the earlier list.
EJEMPLOS = [
    "Trump testifies in New York civil fraud trial",
    "The nonstop sparring match between the conservative justice and liberal Biden attorney at the Supreme Court",
    "Israel-Hamas war rages as outcry grows over Gaza crisis",
    "He left his White evangelical bubble. Here’s what he says it would take for others to do the same",
    "Two more companies recall cinnamon applesauce pouches due to potential lead contamination",
    "Katy Perry dedicates final Las Vegas show to daughter Daisy Dove "
    ]

In [None]:
# Encode the example headlines as embedding-matrix row indices (known words
# shifted by 2 for the zero and unknown-word rows, anything else -> row 1),
# then pad/truncate each sequence at the end to 15 tokens.
X_manual = pad_sequences(
  [
    [key_to_index.get(token, -1) + 2
     for token in re.findall(TOKENIZER_REGEX, headline.lower())]
    for headline in EJEMPLOS
  ],
  maxlen=15, padding='post', truncating='post', value=0,
)

In [None]:
# Print each example headline next to the score the model assigns to it.
manual_scores = model.predict(X_manual)
for headline, row in zip(EJEMPLOS, manual_scores):
  print(f"\"{headline}\" -> {row[0]}")