In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory (in the order os.walk yields them), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-embeddings/second_batch_embeddings.csv
/kaggle/input/final-embeddings/second_batch_embeddings.csv
/kaggle/input/final-embeddings/Final_BERT_Embeddings_NN.csv


In [2]:
import ast

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model

In [3]:
# Location of the labelled training embeddings CSV
train_path = "/kaggle/input/final-embeddings/Final_BERT_Embeddings_NN.csv"

# Parse the CSV into a DataFrame and show it as the cell's output
train_df = pd.read_csv(filepath_or_buffer=train_path)
train_df

Unnamed: 0,SampleID,Category,Sentence_Embedding
0,1,Sports,"[0.10802699625492096, -0.29468590021133423, 0...."
1,2,STEM,"[0.15774837136268616, -0.27033138275146484, 0...."
2,3,STEM,"[0.09479232132434845, -0.026669710874557495, 0..."
3,4,Sports,"[-0.05896478891372681, 0.03615675866603851, 0...."
4,5,Politics,"[0.12355563789606094, -0.0836852490901947, -0...."
...,...,...,...
24984,24985,Sports,"[0.04971267655491829, -0.2155923992395401, 0.3..."
24985,24986,Market & Economy,"[0.4561083912849426, -0.05284585803747177, 0.2..."
24986,24987,Market & Economy,"[0.2979573607444763, -0.21786299347877502, 0.1..."
24987,24988,Politics,"[0.03884076699614525, 0.04084499180316925, 0.2..."


In [4]:
# Define label mapping for categories
label_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}

# Map the 'Category' column to numeric labels.
# Series.map yields NaN for every value that is not a key of label_mapping
# (this also happens if the cell is re-run on the already-encoded column), so
# fail loudly here instead of silently training on NaN labels.
category_codes = train_df['Category'].map(label_mapping)
unmapped_rows = category_codes.isna()
if unmapped_rows.any():
    unmapped_values = sorted(
        train_df.loc[unmapped_rows, 'Category'].astype(str).unique()
    )
    raise ValueError(
        f"Category values without a label mapping: {unmapped_values} "
        "(reload train_df before re-running this cell)"
    )
train_df['Category'] = category_codes.astype('int64')

# Parse each stringified embedding with ast.literal_eval rather than eval:
# literal_eval only accepts Python literals, so the CSV contents can never be
# executed as code.
embeddings = np.array(
    train_df['Sentence_Embedding'].apply(ast.literal_eval).tolist()
)

# Split train data into train and validation sets (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    train_df['Category'],
    test_size=0.2,
    random_state=42
)

# Reshape the data to (samples, sequence_length, embedding_dim)
# In this case, sequence_length = 1 (since each sentence has a single embedding)
X_train = np.expand_dims(X_train, axis=1)  # Shape becomes (num_samples, 1, 768)
X_val = np.expand_dims(X_val, axis=1)      # Shape becomes (num_samples, 1, 768)

In [5]:
from keras.saving import register_keras_serializable


# Registered like TokenAndPositionEmbedding so a saved model containing this
# layer can be deserialized without passing custom_objects by hand.
@register_keras_serializable()
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Feature width of the inputs; also used as ``key_dim`` of the
            attention layer and as the output width of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the attention output and to the
            feed-forward output.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply the block to ``inputs`` and return a tensor of the same width."""
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate
        })
        return config

# Position-encoding layer: adds a learned positional vector to every timestep
@register_keras_serializable()
class TokenAndPositionEmbedding(layers.Layer):
    """Add learned position embeddings to the incoming tensor.

    No token lookup is performed in this layer; the input is added directly to
    the position embeddings, so its last dimension has to be compatible with
    ``embed_dim``.

    Args:
        maxlen: Number of rows in the position-embedding table.
        embed_dim: Width of each position embedding.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.maxlen, self.embed_dim = maxlen, embed_dim
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the embeddings of positions 0..seq_len-1."""
        # The sequence length is read from the second-to-last axis at run time.
        seq_len = tf.shape(x)[-2]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        return x + self.pos_emb(position_ids)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "maxlen": self.maxlen,
            "embed_dim": self.embed_dim,
        }


# Model hyper-parameters
embed_dim = 768   # width of each input embedding vector
maxlen = 16556    # number of positions the positional-embedding table holds
num_heads = 2     # attention heads in the transformer block
ff_dim = 32       # hidden units of the transformer's feed-forward sub-layer

# Input: sequences of unspecified length whose timesteps are embed_dim wide
inputs = layers.Input(shape=(None, embed_dim))

# Add learned positional information to the incoming embeddings
embedding_layer = TokenAndPositionEmbedding(maxlen, embed_dim)
x = embedding_layer(inputs)

# One transformer encoder block
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average over the sequence axis, then a small dense
# layer with dropout on either side of it.
head_layers = [
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
]
for head_layer in head_layers:
    x = head_layer(x)

# Softmax over the 5 output classes
outputs = layers.Dense(5, activation="softmax")(x)

In [6]:
from tensorflow.keras.callbacks import EarlyStopping

# Assemble the functional model from the graph's input and output tensors
model = keras.Model(inputs=inputs, outputs=outputs)

# Callback configured to stop once val_loss has not improved for 3 epochs and
# to restore the best epoch's weights. It only takes effect when it is handed
# to model.fit through the `callbacks` argument.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Adam optimizer; sparse categorical cross-entropy expects integer class labels
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [7]:
# Train for at most 10 epochs. The early_stopping callback has to be passed
# here to have any effect: it stops training once val_loss stops improving and
# restores the weights of the best epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

Epoch 1/10




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 489ms/step - accuracy: 0.4902 - loss: 1.2228 - val_accuracy: 0.7111 - val_loss: 0.7837
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 478ms/step - accuracy: 0.6829 - loss: 0.8325 - val_accuracy: 0.7241 - val_loss: 0.7491
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 487ms/step - accuracy: 0.7036 - loss: 0.7893 - val_accuracy: 0.7173 - val_loss: 0.7788
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 483ms/step - accuracy: 0.7160 - loss: 0.7611 - val_accuracy: 0.7299 - val_loss: 0.7219
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m307s[0m 491ms/step - accuracy: 0.7226 - loss: 0.7375 - val_accuracy: 0.7211 - val_loss: 0.7617
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 490ms/step - accuracy: 0.7251 - loss: 0.7313 - val_accuracy: 0.7195 - val_loss: 0.7600
Epoch 7/10
[1m

In [10]:
# Load the test embeddings that predictions will be generated for
test_path = '/kaggle/input/test-embeddings/second_batch_embeddings.csv'
test_df = pd.read_csv(test_path)

In [11]:
# Show the loaded test frame as the cell's output
test_df

Unnamed: 0,SampleID,Sentence_Embedding
0,1,"[0.15690432488918304, -0.060619909316301346, 0..."
1,2,"[0.3401837944984436, -0.2515227794647217, 0.17..."
2,3,"[0.09620901942253113, -0.008898760192096233, -..."
3,4,"[0.27392739057540894, -0.07054565101861954, 0...."
4,5,"[0.19433511793613434, -0.1849549561738968, 0.2..."
...,...,...
10552,10553,"[0.1671169400215149, -0.24635660648345947, 0.4..."
10553,10554,"[0.1798195242881775, 0.031179074198007584, 0.0..."
10554,10555,"[0.05961940437555313, -0.2453116476535797, 0.4..."
10555,10556,"[0.390685498714447, -0.009291000664234161, 0.2..."


In [14]:
import numpy as np
import pandas as pd
import ast

# Peek at the first five raw embedding entries to see how they are stored
print(test_df['Sentence_Embedding'].iloc[:5])

def process_embeddings(embedding, embed_dim=768):
    """Convert one raw embedding value into a NumPy array.

    Parameters
    ----------
    embedding : str, list, tuple or numpy.ndarray
        The embedding as stored in the DataFrame cell. A string is parsed as a
        Python literal (e.g. ``"[0.1, 0.2]"``).
    embed_dim : int, default 768
        Length of the zero vector returned when the value cannot be used.

    Returns
    -------
    numpy.ndarray
        The embedding as an array. If a string cannot be parsed, or the value
        is of an unsupported type, a message is printed and a zero vector of
        length ``embed_dim`` is returned instead (best-effort behaviour).
    """
    # If the embedding is a string representation of a list, convert it
    if isinstance(embedding, str):
        try:
            embedding = ast.literal_eval(embedding)
        # These are the error types ast.literal_eval is documented to raise on
        # malformed input; anything else is a genuine bug and should surface.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            print(f"Error converting string to list: {e}")
            return np.zeros(embed_dim)  # Default zero vector in case of error

    # Already an array: hand it back untouched
    if isinstance(embedding, np.ndarray):
        return embedding
    # Lists and tuples are both plain sequences of numbers
    if isinstance(embedding, (list, tuple)):
        return np.array(embedding)

    # Anything else (e.g. None or NaN from an empty cell) is unusable
    print("Unexpected embedding format")
    return np.zeros(embed_dim)

# Convert every test embedding to an array and stack them into one array
parsed_embeddings = test_df['Sentence_Embedding'].apply(process_embeddings)
X_test = np.array(parsed_embeddings.tolist())

# Insert a length-1 sequence axis so the input is (num_samples, 1, 768)
X_test = np.expand_dims(X_test, axis=1)

# Sanity-check the final input shape
print(X_test.shape)

# Run inference with the model object currently in memory
# (no saved model is loaded from disk in this cell)
test_predictions = model.predict(X_test)

# The predicted class is the index of the highest score in each row
test_predicted_classes = np.argmax(test_predictions, axis=1)

# Pair each SampleID with its predicted class index
test_predictions_df = pd.DataFrame(
    {'SampleID': test_df['SampleID'], 'Prediction': test_predicted_classes}
)

# Write the predictions to a CSV file without the index column
test_predictions_df.to_csv('test_predictions__epochs.csv', index=False)

# Print the predictions DataFrame
print(test_predictions_df)


0    [0.15690432488918304, -0.060619909316301346, 0...
1    [0.3401837944984436, -0.2515227794647217, 0.17...
2    [0.09620901942253113, -0.008898760192096233, -...
3    [0.27392739057540894, -0.07054565101861954, 0....
4    [0.19433511793613434, -0.1849549561738968, 0.2...
Name: Sentence_Embedding, dtype: object
(10557, 1, 768)
[1m  5/330[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 14ms/step   



[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step




       SampleID  Prediction
0             1           3
1             2           0
2             3           1
3             4           4
4             5           3
...         ...         ...
10552     10553           4
10553     10554           2
10554     10555           3
10555     10556           3
10556     10557           3

[10557 rows x 2 columns]


In [15]:
# Count how many test samples were assigned to each predicted class index
test_predictions_df['Prediction'].value_counts()

Prediction
4    2423
3    2407
2    2187
1    1775
0    1765
Name: count, dtype: int64