In [3]:
!pip install opendatasets
!pip install pandas

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import re
from keras.layers import TextVectorization
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
import opendatasets as od
from nltk.corpus import stopwords
from typing import Tuple
import os
from transformers import BertTokenizer, BertModel



In [4]:
# Fetch the Kaggle "history-of-philosophy" dataset; opendatasets unpacks it
# into a folder named after the dataset under the current working directory.
od.download("https://www.kaggle.com/datasets/kouroshalizadeh/history-of-philosophy")

Dataset URL: https://www.kaggle.com/datasets/kouroshalizadeh/history-of-philosophy
Downloading history-of-philosophy.zip to ./history-of-philosophy


100%|██████████| 55.1M/55.1M [00:00<00:00, 66.1MB/s]





In [5]:
# Class names for the 'school' column. The position of each name in this list
# is used later as its integer label, so the order must stay stable.
SCHOOLS = [
    'analytic',
    'aristotle',
    'german_idealism',
    'plato',
    'continental',
    'phenomenology',
    'rationalism',
    'empiricism',
    'feminism',
    'capitalism',
    'communism',
    'nietzsche',
    'stoicism',
]

In [6]:
# Load your datasets
def getData(validation_size:float=0.2,
            test_size:float=0.1,
            from_folder:str='',
            min_chars:int=None,
            max_chars:int=None)-> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load ``philosophy_data.csv`` and split it into train/validation/test frames.

    The splits are stratified on the ``school`` column and use a fixed
    ``random_state`` so repeated calls yield the same partition.

    Args:
        validation_size: Fraction of the *whole* dataset to use for validation.
            Pass ``0`` to skip the validation split entirely.
        test_size: Fraction of the whole dataset to hold out for testing.
        from_folder: Directory that contains ``philosophy_data.csv``.
        min_chars: If given, drop rows whose ``sentence_length`` is below this.
        max_chars: If given, drop rows whose ``sentence_length`` is above this.

    Returns:
        ``(train, validation, test)`` in that order. ``validation`` is ``None``
        when ``validation_size == 0``; in that case ``train`` is everything
        that is not in the test split.
    """
    df = pd.read_csv(os.path.join(from_folder,'philosophy_data.csv'))
    development, test = train_test_split(df, test_size=test_size, stratify=df['school'], random_state=42, shuffle=True)
    test = reduceDataset(test, min_chars, max_chars)

    if validation_size == 0:
        # No validation split requested. The length filter must still be
        # applied here, otherwise min_chars/max_chars are silently ignored.
        return reduceDataset(development, min_chars, max_chars), None, test

    # Split validation and train out of the development set. validation_size is
    # expressed relative to the full dataset, so rescale it to the size of the
    # development set that remains after removing the test rows.
    train, validation = train_test_split(development,
                                         test_size=validation_size/(1 - test_size),
                                         stratify=development['school'],
                                         random_state=42,
                                         shuffle=True)

    train = reduceDataset(train, min_chars, max_chars)
    validation = reduceDataset(validation, min_chars, max_chars)
    return train, validation, test

def reduceDataset(df:pd.DataFrame,
                  min_chars:int=None,
                  max_chars:int=None)-> pd.DataFrame:
    """Keep only the rows whose ``sentence_length`` lies within the given bounds.

    Args:
        df: Frame with a numeric ``sentence_length`` column.
        min_chars: Inclusive lower bound; ``None`` disables the lower bound.
        max_chars: Inclusive upper bound; ``None`` disables the upper bound.

    Returns:
        The filtered frame. When both bounds are ``None`` the input frame is
        returned unchanged.
    """
    result = df
    if min_chars is not None:
        long_enough = result['sentence_length'] >= min_chars
        result = result.loc[long_enough]
    if max_chars is not None:
        short_enough = result['sentence_length'] <= max_chars
        result = result.loc[short_enough]
    return result

In [7]:
# Tokenizer and encoder must come from the same checkpoint so that the
# vocabulary ids produced by one match the embedding table of the other.
BERT_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [8]:
import torch

def get_bert_embeddings(texts, tokenizer, model, batch_size=16, device='cpu'):
    """Encode texts in batches and return the first-token ([CLS]) embedding of each.

    Args:
        texts: Sliceable sequence of strings (list, tuple, ...).
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors when called with ``return_tensors="pt"``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
            It is moved to ``device`` and switched to eval mode as a side effect.
        batch_size: Number of texts encoded per forward pass.
        device: Device on which the forward passes run.

    Returns:
        A list with one NumPy vector per input text, in input order. An empty
        ``texts`` yields an empty list.
    """
    embeddings = []
    model.to(device)
    # Disable dropout and similar train-time layers; otherwise a model that
    # was left in train mode would yield noisy, non-reproducible embeddings.
    model.eval()

    for start in range(0, len(texts), batch_size):
        # Materialise the slice as a plain list so that tuples or other
        # sequence types are accepted by the tokenizer as a batch.
        batch = list(texts[start:start + batch_size])
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state
            cls_embeddings = last_hidden_states[:, 0, :]  # Position 0 holds the [CLS] token

        # Move back to host memory before converting to NumPy.
        embeddings.extend(cls_embeddings.cpu().numpy())

    return embeddings

# Pick the compute device in order of preference: CUDA GPU, then Apple's
# Metal backend (MPS), then plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# getData returns its splits as (train, validation, test); unpack them in that
# same order so the names match their contents.
train, validation, test = getData(from_folder = 'history-of-philosophy')

# The embeddings below are computed over the full, unsplit dataset.
df = pd.read_csv(os.path.join('history-of-philosophy','philosophy_data.csv'))

embeddings = get_bert_embeddings(df['sentence_str'].tolist(), tokenizer, model, device = device)

# inputs = tokenizer(df['sentence_str'].tolist(), return_tensors='tf', padding=True, truncation=True, max_length = 512)

In [None]:
# outputs = model(**inputs)
# embeddings = outputs.last_hidden_state

In [None]:
import pickle

# Cache the computed embeddings on disk so the expensive BERT pass does not
# have to be repeated in a later session.
with open('embeddings.pickle', 'wb') as pickle_file:
    pickle.dump(embeddings, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from torch.utils.data import Dataset, DataLoader

class BERTEmbeddingsDataset(Dataset):
    """Dataset pairing precomputed embedding vectors with integer class labels.

    Indexing returns ``(embedding, label)`` where the embedding is a float32
    tensor and the label is an int64 tensor.
    """

    def __init__(self, embeddings, labels):
        """Store private tensor copies of ``embeddings`` and ``labels``.

        Args:
            embeddings: Tensor, array, or sequence of equally sized vectors.
            labels: Tensor, array, or sequence of integer class ids, one per
                embedding.

        Raises:
            ValueError: If the number of labels differs from the number of
                embeddings.
        """
        self.embeddings = self._to_tensor(embeddings, torch.float32)
        self.labels = self._to_tensor(labels, torch.long)
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"embeddings and labels must have the same length, "
                f"got {len(self.embeddings)} and {len(self.labels)}"
            )

    @staticmethod
    def _to_tensor(values, dtype):
        """Return a copy of ``values`` as a tensor of the requested dtype."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct warning;
            # detach().clone() is the recommended way to copy a tensor.
            return values.detach().clone().to(dtype)
        # Collapse lists of ndarrays into a single ndarray first: building a
        # tensor directly from such a list is very slow and triggers a warning.
        return torch.tensor(np.asarray(values), dtype=dtype)

    def __len__(self):
        """Number of (embedding, label) pairs."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

# Integer id of each school = its position in SCHOOLS.
label_to_index = dict(zip(SCHOOLS, range(len(SCHOOLS))))

# Encode the 'school' column as a tensor of class ids. A school that is not
# listed in SCHOOLS raises KeyError here.
labels = df['school']
labels = torch.tensor([label_to_index[school] for school in labels], dtype=torch.long)

# Wrap embeddings and labels so they can be served in shuffled mini-batches.
dataset = BERTEmbeddingsDataset(embeddings, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
# torch.nn.LSTM has two required arguments (input_size, hidden_size); calling
# it with none raises TypeError, so the sizes are spelled out here.
BERT_EMBEDDING_DIM = 768  # width of a bert-base embedding vector; verify against model.config.hidden_size
LSTM_HIDDEN_SIZE = 128    # placeholder starting value, to be tuned

# batch_first=True makes the layer take (batch, seq_len, features) input.
# NOTE(review): the DataLoader yields one vector per sentence, i.e.
# (batch, features); add a sequence axis (e.g. unsqueeze(1)) before feeding it.
rnn = torch.nn.LSTM(input_size=BERT_EMBEDDING_DIM,
                    hidden_size=LSTM_HIDDEN_SIZE,
                    batch_first=True)