# Machine Learning - Word Embedding

**Relator: Felipe Mesa Abraham** 

Correo: femesa@udec.cl

En este problema se realizará un análisis de sentimiento usando un set de datos de Twitter.
El set de datos se encuentra en el siguiente link.
https://www.kaggle.com/kazanova/sentiment140

Para este problema se usará GloVe como word embedding preentrenado; luego se creará la matriz de embedding y se construirá una red neuronal para clasificar el sentimiento de frases.

Por último, se presentan resultados buscando sesgos en frases sexistas.

## Lectura y preprocesamiento de datos

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
import re
import os

In [None]:
# Column names are passed via names=, so pandas reads the file as headerless.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='iso-8859-1',
)

In [None]:
# Keep only the tweet text and its label. The explicit copy makes the result
# an independent frame, so the column assignments that follow do not raise
# pandas' SettingWithCopyWarning.
data = data[['text', 'target']].copy()

In [None]:
# Normalize the tweets: lowercase, then drop every character that is not an
# ASCII letter, a digit or whitespace.
# The previous class used the range "A-z", which also covers [ \ ] ^ _ and `
# and therefore left those characters in the text; "A-Z" is the intended
# range. The pattern is a raw string so "\s" reaches the regex engine intact.
data['text'] = data['text'].str.lower()
data['text'] = data['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
# Collapse the label 4 onto 1 so the targets are the integers 0 and 1.
data['target'] = data['target'].replace({4: 1})

In [None]:
# Show the first rows to check the cleaned text and the remapped target.
data.head()

Unnamed: 0,text,target
0,switchfoot httptwitpiccom2y1zl awww thats a b...,0
1,is upset that he cant update his facebook by t...,0
2,kenichan i dived many times for the ball manag...,0
3,my whole body feels itchy and like its on fire,0
4,nationwideclass no its not behaving at all im ...,0


In [None]:
# Features are the tweet text, labels the target column.
X, Y = data['text'], data['target']
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
# Build the vocabulary from the training text.
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Keep at most 20000 tokens and produce sequences of length 200.
vectorize_layer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# Feed the training strings in batches of 128 so the layer can learn its vocabulary.
text_ds = tf.data.Dataset.from_tensor_slices(x_train).batch(128)
vectorize_layer.adapt(text_ds)

## Descarga y preparación del word embedding

In [None]:
# Download the GloVe 6B archive and unzip it quietly into the working directory.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2021-09-26 17:01:31--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-09-26 17:01:31--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-26 17:01:32--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... failed: Connection timed out.
Retrying.

--2021-09-26 17:03:45--  (try: 2)  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Conn

In [None]:
# Path of the 100d GloVe file produced by unzipping glove.6B.zip.
# The earlier os.path.join(os.path.expanduser("~"), "/content/...") always
# evaluated to this same path: the second component is absolute, so the home
# directory was silently discarded. It is now written out directly.
path_to_glove_file = "/content/glove.6B.100d.txt"

# Maps each word to its coefficient vector.
embeddings_index = {}
# Explicit encoding so parsing does not depend on the platform's default locale.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <c1> <c2> ..."; split off the word once and
        # parse the remainder as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


Aquí preparamos la matriz de embedding

In [None]:
# Vocabulary learned by the vectorization layer and each token's integer index.
voc = vectorize_layer.get_vocabulary()
word_index = {token: idx for idx, token in enumerate(voc)}
num_tokens = len(voc) + 2
embedding_dim = 100

# Prepare the embedding matrix. Every row starts as zeros, so any token that
# has no pre-trained vector keeps an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Not in the embedding index: leave the row at zero.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 16346 words (3654 misses)


Aquí convertimos los datos en arreglos numpy de acuerdo al vocabulario creado anteriormente

In [None]:
# Turn each raw string into its integer token sequence using the adapted layer.
# Every string is wrapped in a one-element list, giving one string per row.
x_train = vectorize_layer(np.array([[s] for s in x_train])).numpy()
x_val = vectorize_layer(np.array([[s] for s in x_val])).numpy()

# Labels as plain NumPy arrays.
y_train = np.array(y_train)
y_val = np.array(y_val)

## Construcción del modelo y entrenamiento

In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialized from the prepared matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
from tensorflow.keras import layers

# Input: variable-length sequences of integer token ids.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
# Three Conv1D layers (128 filters, kernel size 5); the first two are each
# followed by max pooling with pool size 5.
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
# Global max pooling, then a dense layer with dropout.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# Two-unit softmax output: one probability per class.
preds = layers.Dense(2, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         2000200   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         82048 

In [None]:
# Adam optimizer with learning rate 1e-3.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
# The sparse loss takes integer class ids as labels, which is what the
# target column holds once 4 has been replaced by 1.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
# Train for 5 epochs in batches of 128, evaluating on the validation split each epoch.
model.fit(x_train, y_train, batch_size=128, epochs=5, validation_data=(x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9ebae23d50>

## Pruebas con el modelo

In [None]:
# Position i names integer target i (0 -> Negative, 1 -> Positive).
class_names = ['Negative', 'Positive']
# Wrap the trained model so it accepts raw strings: vectorize first, then classify.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorize_layer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

A continuación se pueden ver algunos comentarios sexistas que son clasificados como positivos por el modelo, demostrando así un sesgo de género en los datos.

In [None]:
# Classify one sentence and show the name of the most probable class.
probabilities = end_to_end_model.predict([["Don’t be such a girl"]])
class_names[np.argmax(probabilities[0])]

'Positive'

In [None]:
# Classify one sentence and show the name of the most probable class.
probabilities = end_to_end_model.predict([["Micromachisms are silly, there are more important things to worry about"]])
class_names[np.argmax(probabilities[0])]

'Positive'

In [None]:
# Classify one sentence and show the name of the most probable class.
probabilities = end_to_end_model.predict([["Always a bridesmaid, never a bride implies a person's goal should be marriage"]])
class_names[np.argmax(probabilities[0])]

'Positive'

In [None]:
probabilities = end_to_end_model.predict([["Man is to Computer Programmer as Woman is to Homemaker"]]) # This is the title of a paper I came across
# Show the name of the most probable class.
class_names[np.argmax(probabilities[0])]

'Positive'

In [None]:
# Classify one sentence and show the name of the most probable class.
probabilities = end_to_end_model.predict([["She has sex with everyone, she's a bitch"]])
class_names[np.argmax(probabilities[0])]

'Positive'