In [1]:
from sklearn.datasets import fetch_20newsgroups
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
import numpy as np
from gensim.models import KeyedVectors







In [2]:
# Fetch every post of the 20 Newsgroups corpus, dropping the metadata parts
# (headers, footers and quoted replies) from each document.
data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# Unpack the bunch: the documents and their target labels.
texts, labels = data.data, data.target

In [3]:
# --- Labels ------------------------------------------------------------------
# Run the targets through a LabelEncoder so they are integer class ids, then
# expand those ids into categorical (one-hot) rows.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
labels = to_categorical(encoded_labels)

# --- Text --------------------------------------------------------------------
# The tokenizer assigns every word a number; num_words caps the vocabulary
# that is used when converting texts to sequences.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)  # build the word -> index vocabulary by word frequency
word_index = tokenizer.word_index  # dictionary mapping each word to its unique integer
sequences = tokenizer.texts_to_sequences(texts)  # each document becomes a list of integers

# Pad every sequence to a common length of 1000 so the documents can be
# stacked into a single array.
data = pad_sequences(sequences=sequences, maxlen=1000)

# --- Train / test split --------------------------------------------------------
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Data set parameters consumed by the model-building cells.
max_length = 1_000               # length of the input sequences
vocab_size = 10_000              # input dimension handed to the embedding layer
embedding_dim = 64               # dimensionality of the embedding vectors
num_classes = labels.shape[1]    # number of unique classes (columns of the label matrix)

In [5]:
# Baseline CNN text classifier that trains its own embedding layer.
#
# The input shape is declared with an explicit Input entry rather than the
# Embedding `input_length` argument: `input_length` is deprecated (Keras 3
# ignores it with a warning), which would leave the model without a known
# input shape until it is first fitted.
model = Sequential([
    # One integer token id per position, max_length positions per document.
    tf.keras.Input(shape=(max_length,), dtype='int32'),

    # Trainable embedding: token id -> embedding_dim-dimensional vector.
    Embedding(vocab_size, embedding_dim),

    # Convolve over the sequence and pool the result to reduce dimensionality.
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),

    # Flatten to one dimension per sample so the Dense layers can consume it.
    Flatten(),

    # Hidden layer.
    Dense(32, activation='relu'),

    # Classification layer: softmax assigns a probability to each class.
    Dense(num_classes, activation='softmax'),
])





In [6]:


# Configure training: categorical cross-entropy loss, the Adam optimizer,
# and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs in batches of 128 samples; the held-out split is passed
# as validation data and evaluated after each epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test, Y_test),
)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:

# Read the pretrained Word2Vec vectors from the binary GoogleNews file.
# The path is relative, so it is resolved against the current working directory.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='GoogleNews-vectors-negative300.bin',
    binary=True,
)


In [8]:
# Take the embedding width from the loaded Word2Vec vectors instead of
# hard-coding it, so the matrix always matches the vectors that were loaded.
embedding_dim = word_vectors.vector_size

# One row per token id the tokenizer can emit. The tokenizer was created with
# num_words equal to vocab_size, so ids run from 0 (the padding value) up to
# vocab_size - 1 and exactly vocab_size rows are needed. Rows stay all-zero
# for padding and for words that have no Word2Vec vector.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # word_index holds every word seen while fitting; skip ids outside the
    # capped vocabulary and words that Word2Vec does not know.
    if i < vocab_size and word in word_vectors.key_to_index:
        embedding_matrix[i] = word_vectors[word]


# CNN classifier on top of the frozen, pretrained Word2Vec embeddings.
#
# The input shape is declared with an explicit Input entry and the pretrained
# matrix is supplied through `embeddings_initializer`; the legacy
# `input_length` / `weights` constructor arguments are deprecated and are not
# accepted consistently across Keras versions.
model = Sequential([
    # One integer token id per position, max_length positions per document.
    tf.keras.Input(shape=(max_length,), dtype='int32'),

    # Word2Vec embedding layer; trainable=False keeps the vectors fixed.
    Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),

    # Convolutional layer followed by pooling.
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),

    # Flatten the output to feed into a Dense layer.
    Flatten(),

    # Hidden layer.
    Dense(32, activation='relu'),

    # Classification layer.
    Dense(num_classes, activation='softmax'),
])



In [9]:


# Set up the optimisation: Adam minimising categorical cross-entropy, with
# accuracy tracked as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs with a batch size of 128, evaluating on the held-out
# split after every epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test, Y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
