**Imports**

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dense, GRU, Dropout, Flatten, Bidirectional
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from re import X
from google.colab import drive

**Montaje de Google Drive para cargar los archivos GloVe y CSV**

In [2]:
# Expose Google Drive inside the Colab VM; later cells read files below this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


**Carga del embedding**

In [3]:
embeddingdim = 50  # GloVe vector size: can be 50, 100, 200 or 300
print(f'Loading GloVe {embeddingdim}-d embedding... ', end='')
word2vec = {}
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so tokens with non-ASCII characters are read consistently.
with open(f'/content/drive/My Drive/2. clasificación/glove.6B.{embeddingdim}d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is split on whitespace: first field is the token,
        # the remaining fields are parsed as the float32 vector.
        values = line.split()
        word2vec[values[0]] = np.asarray(values[1:], dtype='float32')
print(f'done ({len(word2vec)} word vectors loaded)')

Loading GloVe 50-d embedding... done (400000 word vectors loaded)


**Carga del archivo csv**

In [4]:
# Read the papers table from Drive; later cells use its 'abstract' and 'keywords' columns.
papers_csv_path = '/content/drive/My Drive/2. clasificación/papers.csv'
df = pd.read_csv(papers_csv_path)

**Extracción de los datos relevantes**

In [5]:
# Pull the two columns used by the rest of the notebook out as plain Python lists.
texts, keywords = (df[column].tolist() for column in ('abstract', 'keywords'))

**Creación de un objeto Tokenizer y ajuste de su vocabulario a los textos de entrada**

In [6]:
# Cap the tokenizer at the same vocabulary size that the embedding-matrix cell
# uses (vocab_size = 10000). Without num_words, texts_to_sequences can emit
# indices up to len(word_index), which exceed the number of rows of the
# embedding matrix and cause out-of-range lookups in the Embedding layer.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
# word_index still lists every word seen; only indices below num_words are
# emitted by texts_to_sequences.
word_index = tokenizer.word_index

**Se aplica tokenización a los textos y se convierten las palabras en secuencias de enteros**

In [7]:
# Map every abstract to its list of integer word indices.
sequences = tokenizer.texts_to_sequences(df['abstract'])

# Sequence length = largest whitespace-separated token count over all abstracts.
maxlen = df['abstract'].str.split().str.len().max()
print(maxlen)
# Clip to maxlen, then right-pad with zeros so every row has the same length.
sequences = [token_ids[:maxlen] for token_ids in sequences]
padded_sequences = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

312


**Se convierten las etiquetas en vectores one-hot**

In [None]:
# One indicator column per distinct value of the 'keywords' column.
# NOTE(review): each whole 'keywords' value is treated as a single class here;
# confirm that is intended if a row can hold several keywords.
labels = df.loc[:, 'keywords']
one_hot_labels = pd.get_dummies(data=labels)
print(one_hot_labels)

**Se divide el conjunto de datos en entrenamiento y prueba**

In [9]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    one_hot_labels,
    test_size=0.2,
    random_state=42,
)

**Cargamos sólo las palabras elegidas de nuestro conjunto de datos**

In [10]:
vocab_size = 10000
# Row 0 stays all-zero (index 0 is the padding value), hence the +1.
num_words = min(vocab_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embeddingdim))
for token, row in word_index.items():
    # Skip words whose index falls outside the kept vocabulary.
    if row >= vocab_size:
        continue
    vector = word2vec.get(token)
    # Words without a GloVe vector keep their all-zero row.
    if vector is None:
        continue
    embedding_matrix[row] = vector

**Creación de la capa de embedding**

In [11]:
# Frozen lookup layer initialised from the pre-computed embedding matrix;
# trainable=False keeps the vectors fixed during training.
embedding_layer = Embedding(
    num_words,
    embeddingdim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

**Definición del modelo de redes convolucionales**

In [12]:
def cnn_model(vocab_size, input_length, num_filters, filter_sizes, output_dim, dense_units):
    """Build and compile a multi-branch 1D convolutional classifier.

    The input goes through the module-level ``embedding_layer``. One
    Conv1D -> MaxPooling1D -> Flatten branch is created per entry of
    ``filter_sizes``; the flattened branches are concatenated and fed to a
    sigmoid Dense output layer.

    Args:
        vocab_size: Accepted for interface compatibility; not used here (the
            vocabulary size is fixed by ``embedding_layer``).
        input_length: Number of token indices per input sequence.
        num_filters: Number of filters in each convolutional branch.
        filter_sizes: Iterable of kernel sizes, one branch per size.
        output_dim: Number of output units.
        dense_units: Accepted for interface compatibility; not used here.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the ``binary_accuracy`` metric.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """
    inputs = Input(shape=(input_length,))
    embedding = embedding_layer(inputs)
    pooled_outputs = []
    for filter_size in filter_sizes:
        x = Conv1D(num_filters, filter_size, activation='relu')(embedding)
        x = MaxPooling1D()(x)
        x = Flatten()(x)
        pooled_outputs.append(x)
    if not pooled_outputs:
        raise ValueError('filter_sizes must contain at least one kernel size')
    if len(pooled_outputs) == 1:
        # A single branch needs no merge step.
        x = pooled_outputs[0]
    else:
        # Merge with a Keras layer rather than tf.concat: raw TensorFlow ops
        # are not accepted on symbolic Keras tensors by newer Keras versions.
        x = tf.keras.layers.Concatenate(axis=1)(pooled_outputs)
    outputs = Dense(output_dim, activation='sigmoid')(x)
    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model

**Definición del modelo de redes recurrentes**

In [13]:
def rnn_model(vocab_size, input_length, output_dim, dense_units):
    """Build and compile a two-layer GRU classifier.

    The input goes through the module-level ``embedding_layer``, then a
    128-unit GRU that returns the full sequence, a second 128-unit GRU that
    returns only its final state, and a sigmoid Dense output layer.

    Args:
        vocab_size: Not used by this function.
        input_length: Number of token indices per input sequence.
        output_dim: Number of output units.
        dense_units: Not used by this function.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the ``binary_accuracy`` metric.
    """
    token_ids = Input(shape=(input_length,))
    embedded = embedding_layer(token_ids)
    # First GRU keeps every timestep so the second GRU can consume the sequence.
    per_step_features = GRU(128, return_sequences=True)(embedded)
    final_state = GRU(128)(per_step_features)
    scores = Dense(output_dim, activation='sigmoid')(final_state)
    network = Model(inputs=token_ids, outputs=scores)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    return network

**Ejemplo de uso de la CNN**

In [None]:
# Hyper-parameters of the convolutional model.
num_filters = 128
filter_sizes = [3, 4, 5]
output_dim = len(one_hot_labels.columns)  # one output unit per label column
dense_units = 10
model = cnn_model(vocab_size, maxlen, num_filters, filter_sizes, output_dim, dense_units)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

epochs = 100
batch_size = 64

# Stop once val_loss has not improved by at least 0.001 for 5 epochs, and roll
# back to the best weights seen; otherwise the evaluated model is the one from
# the last (worse) epochs.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    mode='min',
    restore_best_weights=True,
)

# NOTE(review): the test split doubles as the validation set that drives early
# stopping, so the test metrics below are optimistic; consider a separate
# validation split.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), shuffle=True, callbacks=[early_stopping_callback])

loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

**Ejemplo de uso de la RNN**

In [None]:
# Hyper-parameters of the recurrent model.
output_dim = len(one_hot_labels.columns)  # one output unit per label column
dense_units = 10
model = rnn_model(vocab_size, maxlen, output_dim, dense_units)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

epochs = 100
batch_size = 64

# Stop once val_loss has not improved by at least 0.001 for 5 epochs, and roll
# back to the best weights seen; otherwise the evaluated model is the one from
# the last (worse) epochs.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    mode='min',
    restore_best_weights=True,
)

# NOTE(review): the test split doubles as the validation set that drives early
# stopping, so the test metrics below are optimistic; consider a separate
# validation split.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), shuffle=True, callbacks=[early_stopping_callback])

loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

**Se aplica tokenización a los nuevos textos y convierte las palabras en secuencias de enteros**

In [21]:
# Texts to classify and their reference keywords (taken from the same table).
new_texts = df['abstract'].to_list()
new_keywords = df['keywords'].to_list()
# Reuse the already-fitted tokenizer to turn the texts into index sequences.
new_sequences = tokenizer.texts_to_sequences(new_texts)

**Se rellenan o truncan las secuencias de enteros con pad_sequences hasta la longitud maxlen**

In [22]:
# Same length, padding side and truncation side as the training sequences.
X_new = pad_sequences(
    new_sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

**Se predicen las etiquetas correspondientes a los nuevos textos utilizando el modelo entrenado**

In [23]:
# Uses whichever model the most recently executed training cell bound to `model`.
y_pred = model.predict(x=X_new)



**Recupera las palabras clave predichas y las convierte en texto**

In [24]:
# Column position of the highest score in each prediction row.
best_column = np.argmax(y_pred, axis=-1)
# Map those positions back to the label names used as one-hot column headers.
predicted_labels = one_hot_labels.columns[best_column]
predicted_labels = [''.join(label) for label in predicted_labels]

**Se imprimen los resultados**

In [None]:
# Show each abstract next to its reference and predicted keywords.
for abstract, true_keywords, predicted in zip(new_texts, new_keywords, predicted_labels):
    print("Abstract: ", abstract)
    print("Keywords reales: ", true_keywords)
    print("Keywords predichas: ", predicted)