In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

BERT embeddings, stemming, stop-words removal

In [30]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize NLTK's PorterStemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = nltk.word_tokenize(text)
    words = [stemmer.stem(w) for w in words if w.lower() not in stop_words]
    return ' '.join(words)

# Load dataset
df = pd.read_json('smaller_dataset.json')
df = df.dropna(subset=['headline'])
df = df.dropna(subset=['short_description'])


# Preprocess the text data
df['preprocessed_headline'] = df['headline'].apply(preprocess_text)
df['preprocessed_description'] = df['short_description'].apply(preprocess_text)

# BERT Embedding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)
    outputs = bert_model(inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Apply BERT embeddings
df['headline_embeddings'] = df['preprocessed_headline'].apply(get_bert_embeddings)
df['description_embeddings'] = df['preprocessed_description'].apply(get_bert_embeddings)

# Combine the embeddings
df['combined_embeddings'] = df.apply(lambda x: np.concatenate((x['headline_embeddings'], x['description_embeddings'])), axis=1)

[nltk_data] Downloading package punkt to /Users/galgantar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/galgantar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenc

Predictions (deep learning model)

In [31]:
# Define X and Y
X = np.array(df['combined_embeddings'].tolist())
X = np.array(tf.reshape(X, [174, 2*768]))
Y = pd.get_dummies(df['category']).values  # One-hot encoding of categories

# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(Y.shape[1], activation='softmax')  # Prediction head
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# Make predictions
predictions = model.predict(X_test)
predicted_categories = np.argmax(predictions, axis=1)

# Display predictions
for pred, y in zip(predicted_categories, np.argmax(Y_test, axis=1)):
    print(f"Correct: {df['category'].unique()[y]}, Prediction: {df['category'].unique()[pred]}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Correct: SPORTS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Correct: SPORTS, Prediction: SPORTS
Correct: WELLNESS, Prediction: WELLNESS
Correct: SPORTS, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Correct: SPORTS, Prediction: SPORTS
Correct: SPORTS, Prediction: SPORTS
Correct: WELLNESS, Prediction: WELLNESS
Correct: WELLNESS, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Correct: STYLE & BEAUTY, Prediction: WELLNESS
Correct: SPORTS, Prediction: SPORTS
Cor