<a href="https://colab.research.google.com/github/gamidirohan/MachineLearning-Lab/blob/main/Extracted_Features_using_BERT%2C_RoBERTa%2C_and_XLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install transformers



Initializations

In [10]:
# Path to your dataset folder
dataset_folder = r"/content/drive/MyDrive/Datasets/My Dataset"

# List of poem categories
poem_categories = ["acrostic", "ballad", "epigram", "haiku", "limerick", "sestina", "sonnet", "villanelle"]

Extract features using BERT

In [11]:
import csv
from transformers import BertModel, BertTokenizer
import os

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract features from a poem using the tokenizer
def extract_features(poem_path, tokenizer, model):
    with open(poem_path, 'r', encoding='utf-8') as file:
        text = file.read()
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        # Use the last hidden state (embeddings) of the [CLS] token for sentence representation
        embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
        return embeddings

# Open CSV file to write embeddings
csv_filename = "poems_embeddings_bert.csv"  # Specify only the filename

# Prepare the header of the CSV file with the appropriate number of feature columns
header = ['feature_' + str(i) for i in range(768)]  # Assuming BERT-base with 768 features
header.append('label')

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:  # Open the file in the current directory
    writer = csv.writer(csvfile)
    writer.writerow(header)

    # Iterate through each poem category
    for category in poem_categories:
        category_folder = os.path.join(dataset_folder, category)

        # Iterate through each poem file in the category folder
        for filename in os.listdir(category_folder):
            if filename.endswith(".txt"):
                poem_path = os.path.join(category_folder, filename)

                # Extract features (embeddings) from the poem
                features = extract_features(poem_path, tokenizer, model)

                # Flatten the features array and append the category label
                row = features.flatten().tolist()
                row.append(category)

                # Write the features and label to the CSV file
                writer.writerow(row)

print("Embeddings saved to:", csv_filename)

Embeddings saved to: poems_embeddings_bert.csv


Extract features using RoBERTa

In [12]:
import os
import csv
from transformers import RobertaModel, RobertaTokenizer

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to extract features from a poem using the tokenizer
def extract_features(poem_path, tokenizer, model):
    with open(poem_path, 'r', encoding='utf-8') as file:
        text = file.read()
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        # Use the last hidden state (embeddings) of the [CLS] token for sentence representation
        embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
        return embeddings

# Open CSV file to write embeddings
csv_filename = "poems_embeddings_roberta.csv"

# Prepare the header of the CSV file with the appropriate number of feature columns
header = ['feature_' + str(i) for i in range(768)]  # Assuming RoBERTa-base with 768 features
header.append('label')

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    # Iterate through each poem category
    for category in poem_categories:
        category_folder = os.path.join(dataset_folder, category)

        # Iterate through each poem file in the category folder
        for filename in os.listdir(category_folder):
            if filename.endswith(".txt"):
                poem_path = os.path.join(category_folder, filename)

                # Extract features (embeddings) from the poem
                features = extract_features(poem_path, tokenizer, model)

                # Flatten the features array and append the category label
                row = features.flatten().tolist()
                row.append(category)

                # Write the features and label to the CSV file
                writer.writerow(row)

print("Embeddings saved to:", csv_filename)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings saved to: poems_embeddings_roberta.csv


Extract features using XLNet

In [None]:
import os
import csv
from transformers import XLNetModel, XLNetTokenizer

# Initialize XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

# Function to extract features from a poem using the tokenizer
def extract_features(poem_path, tokenizer, model):
    with open(poem_path, 'r', encoding='utf-8') as file:
        text = file.read()
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        # Use the last hidden state (embeddings) of the [CLS] token for sentence representation
        embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
        return embeddings

# Open CSV file to write embeddings
csv_filename = "poems_embeddings_xlnet.csv"

# Prepare the header of the CSV file with the appropriate number of feature columns
header = ['feature_' + str(i) for i in range(768)]  # Assuming XLNet-base with 768 features
header.append('label')

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    # Iterate through each poem category
    for category in poem_categories:
        category_folder = os.path.join(dataset_folder, category)

        # Iterate through each poem file in the category folder
        for filename in os.listdir(category_folder):
            if filename.endswith(".txt"):
                poem_path = os.path.join(category_folder, filename)

                # Extract features (embeddings) from the poem
                features = extract_features(poem_path, tokenizer, model)

                # Flatten the features array and append the category label
                row = features.flatten().tolist()
                row.append(category)

                # Write the features and label to the CSV file
                writer.writerow(row)

print("Embeddings saved to:", csv_filename)

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
