# Neural Network for Text Classification

### Exploring the dataset structure

In [4]:
from datasets import load_dataset

# Load the AG News dataset through the Hugging Face `datasets` library
dataset = load_dataset("ag_news")
# Bare expression: the notebook renders the DatasetDict summary (splits, features, row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

### Exploring the first observation of the dataset

In [8]:

# Load the AG News dataset
dataset = load_dataset("ag_news")

# Pull out the training split and show its first record (a dict with 'text' and 'label')
train_dataset = dataset['train']
first_observation = train_dataset[0]
print(first_observation)

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


### Building the initial neural network (baseline)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from datasets import load_dataset
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

from keras.utils import set_random_seed

# Fix the Python, NumPy and Keras-backend random seeds so that weight initialisation and
# batch shuffling are the same on every run; without this the reported accuracy drifts
# from one execution to the next.
SEED = 42
set_random_seed(SEED)

# Load the AG News dataset
data = load_dataset("ag_news")

train_texts  = data["train"]["text"]
train_labels = data["train"]["label"]
test_texts   = data["test"]["text"]
test_labels  = data["test"]["label"]

# Build TF-IDF features over at most 5000 terms. The vocabulary is learned from the training
# texts only; fit_transform learns it and vectorizes those texts in a single pass.
# float32 halves the memory of the dense matrices created below.
vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors  = vectorizer.transform(test_texts)

# Convert the sparse sklearn matrices to dense numpy arrays for Keras
train_vectors_arrays = train_vectors.toarray()
test_vectors_arrays  = test_vectors.toarray()

# Take the input width from the data: the vectorizer yields fewer than 5000 columns when the
# corpus vocabulary is smaller, and a hard-coded 5000 would then fail with a shape mismatch.
num_features = train_vectors_arrays.shape[1]

# Model architecture: two small hidden layers followed by a 4-way softmax
model = Sequential()
model.add(Dense(10, input_dim=num_features, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(4,  activation='softmax'))  # Output layer with 4 classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# categorical_crossentropy expects one-hot targets, so encode the integer labels
train_labels_onehot = to_categorical(train_labels, num_classes=4)
test_labels_onehot  = to_categorical(test_labels,   num_classes=4)

# Train the model
model.fit(train_vectors_arrays, train_labels_onehot, epochs=10, verbose=1)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(test_vectors_arrays, test_labels_onehot)
print(f"Test accuracy: {accuracy * 100:.2f}%")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 89.82%


## Final and tuned model

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from datasets import load_dataset
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

from keras.utils import set_random_seed

# Fix the Python, NumPy and Keras-backend random seeds so that weight initialisation,
# dropout masks and batch shuffling are the same on every run; without this the reported
# accuracy drifts from one execution to the next.
SEED = 42
set_random_seed(SEED)

# Load the AG News dataset
data = load_dataset("ag_news")

train_texts  = data["train"]["text"]
train_labels = data["train"]["label"]
test_texts   = data["test"]["text"]
test_labels  = data["test"]["label"]

# Tokenization and TF-IDF vectorization over at most 5000 terms. The vocabulary is learned
# from the training texts only; fit_transform learns it and vectorizes those texts in a
# single pass. float32 halves the memory of the dense matrices created below.
vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors  = vectorizer.transform(test_texts)

# Convert the sparse sklearn matrices to dense numpy arrays for Keras
train_vectors_arrays = train_vectors.toarray()
test_vectors_arrays  = test_vectors.toarray()

# categorical_crossentropy expects one-hot targets, so encode the integer labels
train_labels_onehot = to_categorical(train_labels, num_classes=4)
test_labels_onehot  = to_categorical(test_labels, num_classes=4)

# Hold out 20% of the training data as a validation set for early stopping
X_train, X_val, y_train, y_val = train_test_split(
    train_vectors_arrays, train_labels_onehot, test_size=0.2, random_state=SEED
)

# Take the input width from the data: the vectorizer yields fewer than 5000 columns when the
# corpus vocabulary is smaller, and a hard-coded 5000 would then fail with a shape mismatch.
num_features = X_train.shape[1]

# Model architecture
model = Sequential()
model.add(Dense(128, input_dim=num_features, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(4, activation='softmax'))  # Output layer with 4 classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)

# Train the model (100 epochs is an upper bound; early stopping normally ends it sooner)
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(test_vectors_arrays, test_labels_onehot)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

# Cross-check: recompute accuracy from the arg-max class of each prediction against the
# integer labels; it should match the figure reported by model.evaluate above.
y_pred = model.predict(test_vectors_arrays)
y_pred_labels = np.argmax(y_pred, axis=1)
test_accuracy = accuracy_score(test_labels, y_pred_labels)
print(f"Test accuracy (after argmax): {test_accuracy * 100:.2f}%")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test accuracy: 91.03%
Test accuracy (after argmax): 91.03%


### Testing the Neural Network on user prompts

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from datasets import load_dataset
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

from keras.utils import set_random_seed

# NOTE(review): everything up to the accuracy printout repeats the tuned-model pipeline from
# the "Final and tuned model" section so that `vectorizer` and `model` exist in this cell.
# Consider reusing the objects built there instead of retraining.

# Fix the Python, NumPy and Keras-backend random seeds so that weight initialisation,
# dropout masks and batch shuffling are the same on every run; without this each retraining
# yields a slightly different model and accuracy.
SEED = 42
set_random_seed(SEED)

# Load the AG News dataset
data = load_dataset("ag_news")

train_texts  = data["train"]["text"]
train_labels = data["train"]["label"]
test_texts   = data["test"]["text"]
test_labels  = data["test"]["label"]

# Tokenization and TF-IDF vectorization over at most 5000 terms. The vocabulary is learned
# from the training texts only; fit_transform learns it and vectorizes those texts in a
# single pass. float32 halves the memory of the dense matrices created below.
vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors  = vectorizer.transform(test_texts)

# Convert the sparse sklearn matrices to dense numpy arrays for Keras
train_vectors_arrays = train_vectors.toarray()
test_vectors_arrays  = test_vectors.toarray()

# categorical_crossentropy expects one-hot targets, so encode the integer labels
train_labels_onehot = to_categorical(train_labels, num_classes=4)
test_labels_onehot  = to_categorical(test_labels, num_classes=4)

# Hold out 20% of the training data as a validation set for early stopping
X_train, X_val, y_train, y_val = train_test_split(
    train_vectors_arrays, train_labels_onehot, test_size=0.2, random_state=SEED
)

# Take the input width from the data: the vectorizer yields fewer than 5000 columns when the
# corpus vocabulary is smaller, and a hard-coded 5000 would then fail with a shape mismatch.
num_features = X_train.shape[1]

# Model architecture
model = Sequential()
model.add(Dense(128, input_dim=num_features, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(4, activation='softmax'))  # Output layer with 4 classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)

# Train the model (100 epochs is an upper bound; early stopping normally ends it sooner)
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(test_vectors_arrays, test_labels_onehot)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

# Cross-check: recompute accuracy from the arg-max class of each prediction against the
# integer labels; it should match the figure reported by model.evaluate above.
y_pred = model.predict(test_vectors_arrays)
y_pred_labels = np.argmax(y_pred, axis=1)
test_accuracy = accuracy_score(test_labels, y_pred_labels)
print(f"Test accuracy (after argmax): {test_accuracy * 100:.2f}%")


# Human-readable topic name for each integer class id the model can predict
category_mapping = {
    0: 'World',
    1: 'Sports',
    2: 'Business',
    3: 'Sci/Tech',
}

# Function to predict text
def predict_text(input_text):
    """Classify a single piece of text and return its category name.

    Uses the notebook-level `vectorizer`, `model` and `category_mapping`.

    Args:
        input_text: The raw text to classify.

    Returns:
        The name that `category_mapping` holds for the highest-scoring class,
        or 'Unknown' when that class id has no entry in the mapping.
    """
    # The vectorizer works on a collection of documents, so wrap the single text in a list
    features = vectorizer.transform([input_text]).toarray()

    # One row of class scores comes back; pick the index of the largest score in that row
    class_scores = model.predict(features, verbose=0)
    best_class = np.argmax(class_scores, axis=1)[0]

    return category_mapping.get(best_class, 'Unknown')

# Ask the user for a text, classify it with predict_text and print the resulting category.
# NOTE(review): input() waits for keyboard input, so an unattended "Run All" stops at this cell.
input_text = input("Enter your text: ")
predicted_category = predict_text(input_text)
print(f"Predicted category: {predicted_category}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test accuracy: 91.38%
Test accuracy (after argmax): 91.38%
Predicted category: Sci/Tech


## Saving the model results and architecture

In [15]:
import os
from pathlib import Path

# Directory that receives the exported artifacts. It defaults to the original location, so
# behaviour is unchanged on the author's machine, but it can be redirected on any other
# machine by setting the ARTIFACT_DIR environment variable instead of editing the notebook.
ARTIFACT_DIR = Path(os.environ.get(
    'ARTIFACT_DIR',
    '/Users/gonzalovaldenebro/Library/CloudStorage/OneDrive-DrakeUniversity/CS 195/Fortnight 5',
))

# Save the trained model to a .keras file inside the artifact directory
model.save(str(ARTIFACT_DIR / 'NeuralNetwork.keras'))

## Saving the vectorizer

In [14]:
import joblib

import os
from pathlib import Path

# Directory that receives the exported artifacts. It defaults to the original location, so
# behaviour is unchanged on the author's machine, but it can be redirected on any other
# machine by setting the ARTIFACT_DIR environment variable instead of editing the notebook.
ARTIFACT_DIR = Path(os.environ.get(
    'ARTIFACT_DIR',
    '/Users/gonzalovaldenebro/Library/CloudStorage/OneDrive-DrakeUniversity/CS 195/Fortnight 5',
))

# Save the fitted vectorizer so the same vocabulary and weights can be reused at inference time
joblib.dump(vectorizer, str(ARTIFACT_DIR / 'vectorizer.pkl'))

['/Users/gonzalovaldenebro/Library/CloudStorage/OneDrive-DrakeUniversity/CS 195/Fortnight 5/vectorizer.pkl']