In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
import pickle
import numpy as np
print("Libraries are Loaded")

Libraries are Loaded


In [2]:
# Read the preprocessed CSV into a DataFrame; the cells below take the 'Title' and 'Abstract' columns from it
data = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

In [3]:
# Replace missing 'Title' / 'Abstract' entries with '' so neither column contains NaN
for text_column in ('Title', 'Abstract'):
    data[text_column] = data[text_column].fillna('')

In [4]:
# Pull both text columns out of the DataFrame as plain Python lists
titles, abstracts = [data[column].tolist() for column in ('Title', 'Abstract')]

In [5]:
# Load the pretrained 'bert-base-uncased' checkpoint: tokenizer first, then the model
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [6]:
def get_bert_embeddings(texts):
    """Embed each text as the mean of its BERT last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model``. Every string is
    tokenized on its own (truncated to at most 512 tokens), passed through the
    model, and its last hidden states are averaged over the token axis.

    Args:
        texts: Iterable of items to embed. Items that are not ``str`` are not
            sent to the model; they receive an all-zero placeholder vector.

    Returns:
        A list with one 1-D NumPy array per input item, in input order.
        Placeholder vectors have ``model.config.hidden_size`` elements so they
        line up with the embeddings produced for real strings.
    """
    # Size placeholders from the loaded model instead of a hard-coded (and
    # previously wrong) number, so every row ends up with the same width.
    hidden_size = model.config.hidden_size
    embeddings = []
    # Inference only: disabling autograd avoids building a graph per forward pass.
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                # Non-string item: keep its position with a zero vector.
                embeddings.append(np.zeros(hidden_size, dtype=np.float32))
                continue
            inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
            outputs = model(**inputs)
            # One text per call, so the batch axis has length 1:
            # average over the token axis, then take the single row.
            mean_pooled = outputs.last_hidden_state.mean(dim=1)[0].numpy()
            embeddings.append(mean_pooled)
    return embeddings


In [7]:
# Get BERT embeddings for titles and abstracts
title_embeddings = get_bert_embeddings(titles)
abstract_embeddings = get_bert_embeddings(abstracts)

In [12]:
# Save the BERT embeddings as pickle files
with open('title_bert_embeddings.pkl', 'wb') as f:
    pickle.dump(title_embeddings, f)

with open('abstract_bert_embeddings.pkl', 'wb') as f:
    pickle.dump(abstract_embeddings, f)

print("Data saved successfully as ")

Data saved successfully as 


In [13]:
# Create DataFrames for titles and abstracts with embeddings
title_data = pd.DataFrame({'Title': titles})
title_data['Embeddings'] = pd.DataFrame(title_embeddings).values.tolist()

abstract_data = pd.DataFrame({'Abstract': abstracts})
abstract_data['Embeddings'] = pd.DataFrame(abstract_embeddings).values.tolist()


In [14]:
# Concatenate the embeddings with the original DataFrame
titleData = pd.concat([title_data['Title'], pd.DataFrame(title_data['Embeddings'].tolist(), columns=[f'embedding_{i}' for i in range(768)])], axis=1)
abstractData = pd.concat([abstract_data['Abstract'], pd.DataFrame(abstract_data['Embeddings'].tolist(), columns=[f'embedding_{i}' for i in range(768)])], axis=1)


In [15]:
# Save DataFrames to CSV files
titleData.to_csv('title_data_with_embeddings.csv', index=False)
abstractData.to_csv('abstract_data_with_embeddings.csv', index=False)
