In [45]:
import pandas as pd
import string  # For remove_punctuation
from nltk.corpus import stopwords  # For remove_stop_words
from nltk.stem import PorterStemmer  # For stem_words
from nltk.stem import WordNetLemmatizer  # For lemmatize_words
import nltk
from transformers import BertTokenizer, BertModel



In [46]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 636kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 440M/440M [00:17<00:00, 25.1MB/s] 


In [48]:
# Function to remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punct = text.translate(translator)
    return text_without_punct

# Function to lowercase text
def lowercase_text(text):
    return text.lower()

# Function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to get BERT embeddings
def get_bert_embeddings(text):
    # Tokenize the text and convert it to input IDs
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Extract the embeddings for the [CLS] token (you can also use [CLS], [SEP], or average over all tokens)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.tolist()


In [35]:
# Read the CSV file
data = pd.read_csv('FinalDataF.csv', encoding='latin-1')

In [43]:
def preprocess_and_get_embeddings(row):
    title = row['Title']
    abstract = row['Abstract']

    # Check for NaN values and return them unchanged
    if pd.isna(title):
        title = ""
    if pd.isna(abstract):
        abstract = ""

    title = remove_punctuation(title)
    title = lowercase_text(title)
    title = remove_stop_words(title)
    title_embeddings = get_bert_embeddings(title)

    abstract = remove_punctuation(abstract)
    abstract = lowercase_text(abstract)
    abstract = remove_stop_words(abstract)
    abstract_embeddings = get_bert_embeddings(abstract)

    return pd.Series({'Title_Embeddings': title_embeddings, 'Abstract_Embeddings': abstract_embeddings})


In [44]:
# Apply the preprocessing and embedding function to each row of the DataFrame
embeddings_data = data.apply(preprocess_and_get_embeddings, axis=1)

# Concatenate the embeddings with the original DataFrame
data = pd.concat([data, embeddings_data], axis=1)

# Save the DataFrame with embeddings to a new CSV file
data.to_csv('thesis_dataset_with_embeddings.csv', index=False)

NameError: name 'torch' is not defined