In [1]:
import re
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.metrics import accuracy_score
import ast

In [7]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder by name.
# `tokenizer` turns raw text into model inputs; `embedding_model` is only used
# for inference (under torch.no_grad()) in the embedding cells further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# Train Preprocessing

In [None]:
# Read the training split. The cells below use its 'SampleID', 'Category' and
# 'Discussion' columns.
df = pd.read_csv('/kaggle/input/nn-text-classfication/train.csv')

In [None]:
# Replace missing discussions with the literal placeholder 'No Text', so the
# column holds no NaN before the text processing that follows.
df['Discussion'] = df['Discussion'].fillna('No Text')


In [None]:
def replace_dates(text):
    """Return ``text`` with every date-like match replaced by ``[DATE]``.

    Two shapes are recognised, each bounded by word boundaries:

    * 1-2 digits, a hyphen and exactly three letters, e.g. ``12-Jan``;
    * a run of letters, one space and 1-2 digits, optionally followed by two
      more word characters, e.g. ``January 5th`` or ``May 2024``.

    NOTE(review): the second shape accepts *any* alphabetic word, not only
    month names, so ``top 10`` is masked as well. Because the leftmost match
    wins, a word in front of a ``12-Jan`` style date claims the digits:
    ``on 12-Jan`` becomes ``[DATE]-Jan``.

    Args:
        text: The string to scan.

    Returns:
        A new string with each match substituted by ``[DATE]``.
    """
    matcher = re.compile(r'\b(\d{1,2}-[A-Za-z]{3}|\b[A-Za-z]+ \d{1,2}(\w{2})?)\b')
    return matcher.sub('[DATE]', text)

# Mask date-like substrings in every discussion; the column is overwritten, so
# the unmasked text is no longer available on `df` after this cell.
df['Discussion'] = df['Discussion'].apply(replace_dates)

In [None]:
# Embed the training discussions with BERT and save the vectors to CSV.
#
# Each slice of `batch_size` rows is tokenized and sent through the model in a
# single forward pass, so `batch_size` sets both the forward-pass size and the
# interval at which the CSV is rewritten.
batch_size = 50
total_samples = len(df)

embedding_data = []

# Only rows from this position onward are embedded by this cell.
# NOTE(review): presumably rows 0-8499 come from the separate "first batch"
# CSV that is concatenated with this output later in the notebook -- confirm.
start_idx = 8500

for idx in range(start_idx, total_samples, batch_size):
    end_idx = min(idx + batch_size, total_samples)
    batch = df.iloc[idx:end_idx]

    # padding='max_length' together with truncation pads/cuts every text to
    # exactly 512 tokens, so the encoded rows stack into one tensor per slice.
    inputs = tokenizer(
        batch['Discussion'].tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )

    with torch.no_grad():  # inference only; no gradients are needed
        outputs = embedding_model(**inputs)
        last_hidden_states = outputs.last_hidden_state

    # One vector per row: the mean of the last hidden states over the token axis.
    # NOTE(review): the attention mask is not applied, so padding positions are
    # averaged in as well. Left unchanged so these vectors stay consistent with
    # embeddings already produced by the same procedure.
    sentence_embeddings = last_hidden_states.mean(dim=1).tolist()

    records = zip(
        batch['SampleID'].tolist(),
        batch['Category'].tolist(),
        sentence_embeddings,
    )
    for sample_id, category, sentence_embedding in records:
        embedding_data.append({
            'SampleID': sample_id,
            'Category': category,
            'Sentence_Embedding': sentence_embedding
        })

    # Rewrite the cumulative file after every slice, so the rows finished so
    # far are on disk even if a later slice fails.
    embedding_df = pd.DataFrame(embedding_data)
    embedding_df.to_csv('/kaggle/working/second_batch_embeddings.csv', index=False)

    print(f"Finished processing {end_idx} samples.")

print(f"Processing completed for {total_samples} samples.")

In [None]:
# Merge the two partial embedding exports into one training table.
# Both inputs are read from attached Kaggle input datasets, not from
# /kaggle/working, so this cell uses previously saved files.
df_first = pd.read_csv('/kaggle/input/embeddings-1/kaggle_first_batch_embeddings.csv')  # first-batch embeddings
df_second = pd.read_csv('/kaggle/input/second-embeddings/second_batch_embeddings.csv')  # second-batch embeddings

# Stack the rows of both frames; ignore_index=True renumbers the result 0..n-1.
df_combined = pd.concat([df_first, df_second], ignore_index=True)

# Save the combined dataset to a new CSV (without the index column)
df_combined.to_csv('/kaggle/working/Final_BERT_Embeddings_NN.csv', index=False)

print("Datasets have been successfully combined and saved to 'Final_BERT_Embeddings_NN.csv'.")

# Test Preprocessing

In [2]:
# Read the test split. This rebinds `df`, replacing the training frame loaded
# earlier in the notebook.
df = pd.read_csv('/kaggle/input/nn-text-classfication/test.csv')

In [3]:
# Preview the first rows of the test frame (columns: SampleID, Discussion).
df.head()

Unnamed: 0,SampleID,Discussion
0,1,Managing cash flow effectively is crucial for ...
1,2,Civic engagement plays a key role in a democra...
2,3,Proper warm-ups and cool-downs are essential t...
3,4,Data security is a growing concern as more peo...
4,5,"Investing in stocks can be risky, but with car..."


In [4]:
# Replace missing discussions with the literal placeholder 'No Text', so the
# column holds no NaN before the text processing that follows.
df['Discussion'] = df['Discussion'].fillna('No Text')

In [5]:
def replace_dates(text):
    """Substitute ``[DATE]`` for each date-like match found in ``text``.

    A match is either 1-2 digits, a hyphen and exactly three letters
    (``12-Jan``), or a run of letters, a single space and 1-2 digits with an
    optional two-character word suffix (``January 5th``, ``May 2024``). Both
    forms must sit on word boundaries.

    NOTE(review): the second form is not restricted to month names, so text
    such as ``top 10`` is masked too, and ``on 12-Jan`` comes out as
    ``[DATE]-Jan`` because the leftmost match (``on 12``) wins.

    Args:
        text: The string to scan.

    Returns:
        A new string with each match substituted by ``[DATE]``.
    """
    date_like = re.compile(r'\b(\d{1,2}-[A-Za-z]{3}|\b[A-Za-z]+ \d{1,2}(\w{2})?)\b')
    masked = date_like.sub('[DATE]', text)
    return masked

# Mask date-like substrings in every test discussion; the column is
# overwritten, so the unmasked text is no longer available on `df` afterwards.
df['Discussion'] = df['Discussion'].apply(replace_dates)

In [9]:
# Embed the test discussions with BERT and save the vectors to CSV.
#
# Each slice of `batch_size` rows is tokenized and sent through the model in a
# single forward pass, so `batch_size` sets both the forward-pass size and the
# interval at which the CSV is rewritten.
batch_size = 50
total_samples = len(df)

embedding_data = []

# Start from the first row: the whole test frame is embedded here.
start_idx = 0

for idx in range(start_idx, total_samples, batch_size):
    end_idx = min(idx + batch_size, total_samples)
    batch = df.iloc[idx:end_idx]

    # padding='max_length' together with truncation pads/cuts every text to
    # exactly 512 tokens, so the encoded rows stack into one tensor per slice.
    inputs = tokenizer(
        batch['Discussion'].tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )

    with torch.no_grad():  # inference only; no gradients are needed
        outputs = embedding_model(**inputs)
        last_hidden_states = outputs.last_hidden_state

    # One vector per row: the mean of the last hidden states over the token axis.
    # NOTE(review): the attention mask is not applied, so padding positions are
    # averaged in as well. Left unchanged so the test vectors are produced the
    # same way as the training vectors.
    sentence_embeddings = last_hidden_states.mean(dim=1).tolist()

    records = zip(batch['SampleID'].tolist(), sentence_embeddings)
    for sample_id, sentence_embedding in records:
        embedding_data.append({
            'SampleID': sample_id,
            'Sentence_Embedding': sentence_embedding
        })

    # Rewrite the cumulative file after every slice, so the rows finished so
    # far are on disk even if a later slice fails.
    # NOTE(review): the test embeddings are written to the same path the
    # training cell uses ('second_batch_embeddings.csv'); running both cells in
    # one session overwrites one output with the other. The path is kept as is
    # because code outside this cell may read it -- consider a test-specific name.
    embedding_df = pd.DataFrame(embedding_data)
    embedding_df.to_csv('/kaggle/working/second_batch_embeddings.csv', index=False)

    print(f"Finished processing {end_idx} samples.")

print(f"Processing completed for {total_samples} samples.")

Finished processing 50 samples.
Finished processing 100 samples.
Finished processing 150 samples.
Finished processing 200 samples.
Finished processing 250 samples.
Finished processing 300 samples.
Finished processing 350 samples.
Finished processing 400 samples.
Finished processing 450 samples.
Finished processing 500 samples.
Finished processing 550 samples.
Finished processing 600 samples.
Finished processing 650 samples.
Finished processing 700 samples.
Finished processing 750 samples.
Finished processing 800 samples.
Finished processing 850 samples.
Finished processing 900 samples.
Finished processing 950 samples.
Finished processing 1000 samples.
Finished processing 1050 samples.
Finished processing 1100 samples.
Finished processing 1150 samples.
Finished processing 1200 samples.
Finished processing 1250 samples.
Finished processing 1300 samples.
Finished processing 1350 samples.
Finished processing 1400 samples.
Finished processing 1450 samples.
Finished processing 1500 samples.
