In [1]:
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier


In [2]:
df = pd.read_csv('vectorized_data.zip')

In [3]:
train_dataset = df["Poll Responses Response"].sample(n=1000).reset_index(drop=True).values
test_dataset = df["Poll Responses Response"].sample(n=100).reset_index(drop=True).values

In [4]:
import tensorflow as tf

tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [10]:
import torch

torch.has_mps
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
   return tokenizer(examples["Poll Responses Response"], truncation=True)
 
tokenized_train = tokenizer(list(train_dataset), truncation=True, padding=True, return_tensors = 'tf', max_length=128)
tokenized_test = tokenizer(list(test_dataset), truncation=True, padding=True, return_tensors = 'tf', max_length=128)

True

In [6]:
# from transformers import DataCollatorWithPadding
# import torch
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# from transformers import AutoModelForSequenceClassification
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


# import numpy as np
# from datasets import load_metric
 
# def compute_metrics(eval_pred):
#    load_accuracy = load_metric("accuracy")
#    load_f1 = load_metric("f1")
  
#    logits, labels = eval_pred
#    predictions = np.argmax(logits, axis=-1)
#    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
#    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
#    return {"accuracy": accuracy, "f1": f1}


In [11]:
import transformers
from tokenizers import BertWordPieceTokenizer
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased' , lower = True)
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=True)
fast_tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [12]:
# Encode the training data
encoded_train_data = tokenizer(list(train_dataset), padding=True, truncation=True, return_tensors='tf')
encoded_test_data = tokenizer(list(test_dataset), padding=True, truncation=True, return_tensors='tf')

# Print the encoded data
print(encoded_train_data)

{'input_ids': <tf.Tensor: shape=(1000, 160), dtype=int32, numpy=
array([[ 101, 2057, 4342, ...,    0,    0,    0],
       [ 101, 1999, 1996, ...,    0,    0,    0],
       [ 101, 1045, 4342, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2052, ...,    0,    0,    0],
       [ 101, 1996, 6950, ...,    0,    0,    0],
       [ 101, 2065, 2057, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1000, 160), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}


In [20]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import TFBertModel
import numpy as np 
from tqdm import tqdm

# Load the pre-trained BERT model
bert_model = TFBertModel.from_pretrained('distilbert-base-uncased')

# Define batch size
batch_size = 32

# Get the number of samples in the data
num_samples = len(encoded_train_data['input_ids'])

# Create a list to store the embeddings
embeddings = []

# Loop through the data in batches
for i in tqdm(range(0, num_samples, batch_size)):
    # Get a batch of data
    batch_input_ids = encoded_train_data['input_ids'][i:i+batch_size]
    batch_attention_mask = encoded_train_data['attention_mask'][i:i+batch_size]
    
    # Get the embeddings for the batch
    batch_outputs = bert_model(batch_input_ids, attention_mask=batch_attention_mask)
    
    # Add the embeddings to the list
    embeddings.extend(batch_outputs[0].numpy())

# Convert the embeddings list to a numpy array
embeddings = np.array(embeddings)


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFBertModel: ['vocab_transform', 'vocab_projector', 'distilbert', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stre

In [21]:
embeddings

array([[[ 5.7048255e-01,  7.5101060e-01,  1.1159126e+00, ...,
         -2.2140479e+00,  2.6243791e-01, -1.3024724e+00],
        [-4.5024175e-02,  1.8007126e-01,  1.0318813e+00, ...,
         -1.6748793e+00, -3.4144036e-02, -2.7090931e-01],
        [-9.7379518e-01,  5.4173652e-02,  1.3830606e+00, ...,
         -5.0433254e-01,  9.2972231e-01,  4.1275772e-01],
        ...,
        [ 1.2142802e-01,  1.1661167e+00, -2.6225573e-01, ...,
         -2.2971077e+00,  3.7783045e-01, -7.6307744e-01],
        [ 3.8621145e-01,  3.0223271e-01, -1.4362383e-01, ...,
         -2.5260911e+00,  4.2233166e-01,  2.6217338e-01],
        [-4.6997750e-01,  1.8405275e-01,  5.8947802e-02, ...,
         -2.6523879e+00, -1.4358427e-01, -1.1274283e+00]],

       [[ 6.5191102e-01,  8.3716387e-01,  1.1096551e+00, ...,
         -2.2670887e+00,  3.5694817e-01, -1.0991659e+00],
        [ 7.0750540e-01,  1.2565295e+00,  1.1829517e+00, ...,
         -1.7433158e+00,  1.9523279e-01,  1.5556608e-01],
        [-3.3808988e-01, 

In [19]:
embeddings_test = []
for i in tqdm(range(0, num_samples, batch_size)):
    # Get a batch of data
    batch_input_ids = encoded_test_data['input_ids'][i:i+batch_size]
    batch_attention_mask = encoded_test_data['attention_mask'][i:i+batch_size]
    
    # Get the embeddings for the batch
    batch_outputs = bert_model(batch_input_ids, attention_mask=batch_attention_mask)
    
    # Add the embeddings to the list
    embeddings_test.extend(batch_outputs[0].numpy())

# Convert the embeddings list to a numpy array
embeddings_test = np.array(embeddings_test)


100%|██████████| 32/32 [00:05<00:00,  6.00it/s]


In [25]:
# import keras modules
from keras.models import Sequential
from tensorflow.keras.models import Model
from keras.layers import Dense, Dropout, Input

In [None]:
# turn the grade class from 1-5 to 0-4
y_train = y_train - 1
y_test = y_test -1

In [None]:
# Define the neural network architecture
num_classes = 5
inputs = Input(shape=(embeddings.shape[1],))
x = Dense(64, activation='relu')(inputs)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(embeddings, train_labels_one_hot, epochs=10, batch_size=32)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_embeddings, test_labels_one_hot, batch_size=32)

In [14]:
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# from transformers import TFBertModel

# # Load the pre-trained BERT model
# bert_model = TFBertModel.from_pretrained('distilbert-base-uncased')

# # Get the embeddings for the encoded data
# outputs = bert_model(encoded_train_data['input_ids'], attention_mask=encoded_train_data['attention_mask'])


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFBertModel: ['vocab_transform', 'vocab_projector', 'distilbert', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
bert_model.summary()