In [1]:
# This is just to make sure your machine has the same versions as mine. if it matches, you can delete this.
import numpy as np
import tensorflow as tf

print("NumPy version:", np.__version__)
print("TensorFlow version:", tf.__version__)

NumPy version: 1.26.4
TensorFlow version: 2.18.0


In [16]:
# Import necessary libraries
import nltk
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import reuters, stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, accuracy_score
import json

# Download NLTK resources
nltk.download('reuters')
nltk.download('stopwords')

# Unified Tokenizer Class
class UnifiedTokenizer:
    def __init__(self, stop_words, min_length=4, use_stemming=False):
        self.stop_words = stop_words
        self.min_length = min_length
        self.use_stemming = use_stemming
        self.stemmer = PorterStemmer() if use_stemming else None

    def tokenize(self, text):
        text = text.lower()
        text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
        text = re.sub(r'\d+', '', text)
        tokens = text.split()
        tokens = [
            self.stemmer.stem(word) if self.use_stemming else word
            for word in tokens
            if word not in self.stop_words and len(word) >= self.min_length
        ]
        return tokens

# Reuters Preprocessor Class
class ReutersPreprocessor:
    def __init__(self, tokenizer, max_features=10000, max_sequence_length=500):
        self.tokenizer = tokenizer
        self.max_features = max_features
        self.max_sequence_length = max_sequence_length
        self.vocabulary = {}

    def preprocess(self):
        documents = reuters.fileids()
        texts = [reuters.raw(doc_id) for doc_id in documents]
        labels = [reuters.categories(doc_id) for doc_id in documents]

        data = pd.DataFrame({'document_id': documents, 'text': texts, 'labels': labels})
        multi_label_data = data[data['labels'].apply(len) > 1]

        multi_label_data.loc[:, 'tokens'] = multi_label_data['text'].apply(self.tokenizer.tokenize)

        # Create BoW features
        vectorizer = CountVectorizer(max_features=self.max_features, tokenizer=lambda x: x, preprocessor=lambda x: x)
        X_bow = vectorizer.fit_transform(multi_label_data['tokens']).toarray()

        # Perform LDA for topic extraction
        lda = LatentDirichletAllocation(n_components=15, random_state=42)
        X_lda = lda.fit_transform(X_bow)

        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(multi_label_data['labels'])

        print("Preprocessing complete.")
        return X_bow, X_lda, y, mlb.classes_

# Initialize tokenizer and preprocessor
stop_words = set(stopwords.words('english')) | {"reuters", "news"}
tokenizer = UnifiedTokenizer(stop_words=stop_words, min_length=4, use_stemming=True)
preprocessor = ReutersPreprocessor(tokenizer=tokenizer)
X_bow, X_lda, y, class_labels = preprocessor.preprocess()

# Combine BoW and LDA features
X_combined = np.hstack((X_bow, X_lda))

# Save preprocessed data for future use
np.save("X_combined.npy", X_combined)
np.save("y.npy", y)
np.save("class_labels.npy", class_labels)

# Model Building
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_combined.shape[1],)),
    BatchNormalization(),
    Dropout(0.2),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(len(class_labels), activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

# Train the model
history = model.fit(
    X_combined, y,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Save the model
model.save("best_reuters_model_with_lda.h5")

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_combined, y, verbose=2)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Predict and generate classification report
y_pred = model.predict(X_combined)
y_pred_binary = (y_pred > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y, y_pred_binary, target_names=class_labels))

overall_accuracy = accuracy_score(y, y_pred_binary)
print(f"\nOverall Test Accuracy: {overall_accuracy:.4f}")

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/rebeccavannostrand/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebeccavannostrand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multi_label_data.loc[:, 'tokens'] = multi_label_data['text'].apply(self.tokenizer.tokenize)


Preprocessing complete.
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0208 - loss: 0.8699 - val_accuracy: 0.0276 - val_loss: 0.6954 - learning_rate: 1.0000e-04
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0557 - loss: 0.8317 - val_accuracy: 0.0491 - val_loss: 0.6944 - learning_rate: 1.0000e-04
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1000 - loss: 0.8071 - val_accuracy: 0.0828 - val_loss: 0.6932 - learning_rate: 1.0000e-04
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1324 - loss: 0.7881 - val_accuracy: 0.1104 - val_loss: 0.6918 - learning_rate: 1.0000e-04
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1855 - loss: 0.7700 - val_accuracy: 0.1258 - val_loss: 0.6898 - learning_rate: 1.0000e-04
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0



51/51 - 0s - 3ms/step - accuracy: 0.4337 - loss: 0.1766
Test Loss: 0.1766, Test Accuracy: 0.4337
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Classification Report:
                 precision    recall  f1-score   support

            acq       0.94      0.62      0.75        77
           alum       0.00      0.00      0.00         8
         barley       0.91      0.57      0.70        51
            bop       0.86      0.82      0.84        74
        carcass       0.98      0.89      0.94        57
     castor-oil       0.50      0.50      0.50         2
          cocoa       1.00      0.75      0.86        12
        coconut       1.00      0.20      0.33         5
    coconut-oil       0.71      0.71      0.71         7
         coffee       1.00      0.67      0.80        27
         copper       0.93      0.62      0.74        21
     copra-cake       0.33      0.67      0.44         3
           corn       0.95      0.96      0.96       237
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
