In [31]:
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import fetch_20newsgroups

X_train_text, Y_train = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'), return_X_y=True)
X_test_text, Y_test  = fetch_20newsgroups(subset="test", remove=('headers', 'footers', 'quotes'), return_X_y=True)

In [32]:
# Procese el texto del dataset usando el metodo tokenize visto en clases
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')

classes = np.unique(Y_train)
stop_words = set(stopwords.words('english'))

tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

lemmatizer = WordNetLemmatizer()

def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words and len(t) > 2]
        words += tokens

    text = ' '.join(words)
    return text

[nltk_data] Downloading package wordnet to /home/franco-
[nltk_data]     anfossi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
train_docs = []
test_docs = []

for raw_text in X_train_text:
    text = tokenize(raw_text)
    train_docs.append(text)
    
for raw_text in X_test_text:
    text = tokenize(raw_text)
    test_docs.append(text)

In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_tokens = 50 ## Hyperparameter, input length

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs+test_docs)

X_train_vect = pad_sequences(tokenizer.texts_to_sequences(train_docs), maxlen=max_tokens, padding="post", truncating="post", value=0.)
X_test_vect  = pad_sequences(tokenizer.texts_to_sequences(test_docs), maxlen=max_tokens, padding="post", truncating="post", value=0.)


X_train_vect.shape, X_test_vect.shape

((11314, 50), (7532, 50))

In [35]:
# Particione le data 3, train en los primeros 10000 restantes en validation
X_validation_vect = X_train_vect[10000:]
X_train_vect = X_train_vect[:10000]

Y_validation = Y_train[10000:]
Y_train = Y_train[:10000]

In [36]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, Layer

inputs = Input(shape=(max_tokens, ))
embeddings_layer = Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=50, input_length=max_tokens, trainable=True)
dense1 = Dense(256, activation="relu")
dense2 = Dense(128, activation="relu")
dense3 = Dense(64, activation="softmax")

class ReduceSumLayer(Layer):
    def call(self, inputs):
        return tf.reduce_sum(inputs, axis=1)

x = embeddings_layer(inputs)
x = ReduceSumLayer()(x) # puede usar mean u otras función de agregación de vectores. 
x = dense1(x)
x = dense2(x)
outputs = dense3(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()



In [37]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
model.fit(X_train_vect, Y_train, batch_size=32, epochs=20, callbacks=[callback], validation_data=(X_validation_vect, Y_validation))

Epoch 1/20


In [None]:
labels = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [30]:
from sklearn.metrics import classification_report

Y_preds = model.predict(X_test_vect).argmax(axis=-1)

print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=labels))

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.21      0.22      0.21       319
           comp.graphics       0.52      0.31      0.39       389
 comp.os.ms-windows.misc       0.39      0.55      0.46       394
comp.sys.ibm.pc.hardware       0.29      0.31      0.30       392
   comp.sys.mac.hardware       0.28      0.53      0.37       385
          comp.windows.x       0.70      0.45      0.54       395
            misc.forsale       0.77      0.39      0.52       390
               rec.autos       0.58      0.46      0.51       396
         rec.motorcycles       0.77      0.26      0.39       398
      rec.sport.baseball       0.76      0.35      0.48       397
        rec.sport.hockey       0.61      0.64      0.62       399
               sci.crypt       0.26      0.52      0.35       396
         sci.electronics       0.25     