In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models

In [2]:
def load_data(path = "./datasets"):
    """Load the TF-IDF corpus stored under `path` as a dense matrix plus labels.

    Two files are read from `path`:

    * ``words_idfs.txt`` - only its line count is used; it sets the
      vocabulary size, i.e. the number of feature columns.
    * ``data_tf_idf.txt`` - one document per line, formatted as
      ``<label><fff><doc_id><fff><index>:<tfidf> <index>:<tfidf> ...``.

    Blank lines in the data file are skipped. The doc id field is not used.

    Args:
        path: Directory containing the two files above.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a float array of shape
        ``(n_documents, vocab_size)`` holding the tf-idf value of every
        vocabulary index (0.0 where the document has no entry). ``labels``
        is an integer array of shape ``(n_documents,)``.

    Raises:
        ValueError: If a label, index or tf-idf value is not numeric.
        IndexError: If a line has fewer than three ``<fff>`` fields, a pair
            has no ``:``, or an index is beyond the vocabulary size.
    """
    def fill_row(r_d, sparse_r_d):
        # Write the "index:tfidf index:tfidf ..." pairs of one document into
        # the (already zeroed) dense row r_d, in place.
        for index_and_tfidf in sparse_r_d.split():
            # Split each pair once; field 0 is the vocabulary index, field 1 its tf-idf
            parts = index_and_tfidf.split(':')
            r_d[int(parts[0])] = float(parts[1])

    # One document per line: label, doc id and sparse tf-idf vector
    with open(os.path.join(path, "data_tf_idf.txt")) as f:
        # Ignore blank lines so a stray empty line does not crash int('')
        data_lines = [line for line in f.read().splitlines() if line.strip()]
    # The vocabulary size is the number of lines in the idf file
    with open(os.path.join(path, "words_idfs.txt")) as f:
        vocab_size = len(f.read().splitlines())

    # Allocate the dense matrix once and fill it row by row; this avoids
    # building a Python list of floats per document and a second full copy
    # when the per-document rows are stacked at the end.
    data = np.zeros((len(data_lines), vocab_size))
    labels = []
    for row, d in enumerate(data_lines):
        features = d.split('<fff>')
        labels.append(int(features[0]))
        # data[row] is a view, so fill_row writes straight into the matrix
        fill_row(data[row], features[2])

    # dtype=int keeps the labels integral even when there are no documents
    return data, np.asarray(labels, dtype=int)

In [3]:
# Load the dense TF-IDF feature matrix and the integer labels
X, y = load_data()

# Split into 60% train / 20% validation / 20% test.
# The first call holds out 20% for testing; the second takes 25% of the
# remaining 80% (i.e. 20% of all documents) for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Sanity check: the three splits partition the full data set
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [4]:
# Seed TensorFlow's global RNG before any layer is created so the initial
# weights are the same on every fresh "Restart & Run All" (the data split is
# already seeded with random_state=1).
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reruns are required.
tf.random.set_seed(1)

# Feed-forward classifier: input vector -> Dense(64) -> Dense(32) -> 20-unit softmax.
# The input width is taken from the training matrix (every axis after the batch axis).
inputs = layers.Input(shape=X_train.shape[1:])
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dense(32, activation='relu')(x)
# One softmax output unit per class index
outputs = layers.Dense(20, activation='softmax')(x)

model = models.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 14230)]           0         
                                                                 
 dense (Dense)               (None, 64)                910784    
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 20)                660       
                                                                 
Total params: 913,524
Trainable params: 913,524
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Early stopping: watch validation accuracy (higher is better), give up after
# 5 epochs without improvement, and restore the best weights seen so far
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [6]:
# Configure training: Adam optimiser, sparse categorical cross-entropy
# (labels are integer class ids), accuracy reported each epoch
model.compile(
    optimizer="adam",
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=["accuracy"],
)

# Train for at most 20 epochs, validating on the held-out validation split;
# the early-stopping callback may end the run sooner
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 00009: early stopping


In [7]:
# Score the trained model on the held-out test split; displays [loss, accuracy]
model.evaluate(x=X_test, y=y_test)



[0.3790754973888397, 0.8920424580574036]

In [12]:
# Predicted class index for the first test document:
# arg-max over the model's softmax outputs along the last axis
model.predict(X_test[:1]).argmax(axis=-1)

array([9], dtype=int64)