In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128 (u can change if u want)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Preprocessing training data 
# Tokenize the 'Phrase' column of the training data using the BERT tokenizer
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized training data into a hashable format (dictionary of TensorFlow tensors), so we can use .fit() later on
X_train_hashable = {
    'input_ids': X_train_tokenized['input_ids'],
    'token_type_ids': X_train_tokenized['token_type_ids'],
    'attention_mask': X_train_tokenized['attention_mask'],
}

y_train = train_data['Sentiment'].values


# --- Preprocess testing data ---
# Tokenize the 'Phrase' column of the testing data using the same BERT tokenizer
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized testing data into a hashable format (dictionary of TensorFlow tensors)
X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'],
    'token_type_ids': X_test_tokenized['token_type_ids'],
    'attention_mask': X_test_tokenized['attention_mask'],
}


# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0 1 2 3 or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5), 
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])
# Train the model
bert_model.fit(X_train_hashable, y_train, epochs=3, batch_size=32)


# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_hashable) # will return unnormalized probability predictions (aka decimal values)
logits = predictions.logits
probabilities = tf.nn.softmax(logits) # use softmax() to convert raw scores (logits) into a probability distribution (class labels 0 1 2 3 4)
predicted_class = tf.argmax(probabilities, axis=-1)

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())



2023-12-11 12:51:13.527089: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 12:51:13.831399: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 12:51:13.831446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 12:51:13.894913: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 12:51:14.034943: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 12:51:14.037224: I tensorflow/core/platform/cpu_feature_guard.cc:1

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128 (u can change if u want)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# --- Preprocessing training data ---
# Tokenize the 'Phrase' column of the training data using the BERT tokenizer
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)
# Convert the tokenized training data into a hashable format (dictionary of TensorFlow tensors), so we can use .fit() later on
X_train_hashable = {
    'input_ids': X_train_tokenized['input_ids'],
    'token_type_ids': X_train_tokenized['token_type_ids'],
    'attention_mask': X_train_tokenized['attention_mask'],
}

y_train = train_data['Sentiment'].values


# --- Preprocess testing data ---
# Tokenize the 'Phrase' column of the testing data using the same BERT tokenizer
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized testing data into a hashable format (dictionary of TensorFlow tensors)
X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'],
    'token_type_ids': X_test_tokenized['token_type_ids'],
    'attention_mask': X_test_tokenized['attention_mask'],
}


# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0 1 2 3 or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5), 
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])

# Train the model
bert_model.fit(X_train_hashable, y_train, epochs=3, batch_size=32)


# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_hashable) # will return unnormalized probability predictions (aka decimal values)
logits = predictions.logits
probabilities = tf.nn.softmax(logits) # use softmax() to convert raw scores (logits) into a probability distribution (class labels 0 1 2 3 4)
predicted_class = tf.argmax(probabilities, axis=-1)

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())

2023-12-07 00:25:05.236887: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-07 00:25:05.508681: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 00:25:05.508750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 00:25:05.581535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-07 00:25:05.719716: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-07 00:25:05.721500: I tensorflow/core/platform/cpu_feature_guard.cc:1

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [15]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# --- Preprocessing training data ---
# Tokenize the 'Phrase' column of the training data using the BERT tokenizer within the model
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

y_train = train_data['Sentiment'].values

# --- Preprocess testing data ---
# Tokenize the 'Phrase' column of the testing data using the BERT tokenizer within the model
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf',
    return_token_type_ids=False,  # Ensure token type IDs are not returned
    return_attention_mask=False  # Ensure attention mask is not returned
)

X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'],
}
# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0, 1, 2, 3, or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])

# Train the model
bert_model.fit(X_train_tokenized, y_train, epochs=3, batch_size=32)

# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_tokenized)
predicted_class = tf.argmax(predictions.logits, axis=-1)

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())


ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [3]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Tokenize the 'Phrase' column of the training data using the BERT tokenizer
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

y_train = train_data['Sentiment'].values

# Tokenize the 'Phrase' column of the testing data using the BERT tokenizer
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized testing data into a hashable format (dictionary of TensorFlow tensors)
X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'][:, 0, :],
    'token_type_ids': X_test_tokenized['token_type_ids'][:, 0, :],
    'attention_mask': X_test_tokenized['attention_mask'][:, 0, :],
}

# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0, 1, 2, 3, or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])

# Train the model
bert_model.fit(X_train_tokenized, y_train, epochs=3, batch_size=32)

# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_hashable)
predicted_class = tf.argmax(predictions.logits, axis=-1)

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())


ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [4]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Tokenize the 'Phrase' column of the training data using the BERT tokenizer
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

y_train = train_data['Sentiment'].values

# Tokenize the 'Phrase' column of the testing data using the BERT tokenizer
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized testing data into a hashable format (dictionary of TensorFlow tensors)
X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'][:, 0, :],
    'token_type_ids': X_test_tokenized['token_type_ids'][:, 0, :],
    'attention_mask': X_test_tokenized['attention_mask'][:, 0, :],
}

# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0, 1, 2, 3, or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[SparseCategoricalAccuracy()])

# Train the model
bert_model.fit(X_train_tokenized, y_train, epochs=3, batch_size=32)

# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_hashable)
predicted_class = tf.argmax(predictions.logits, axis=-1)

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())


ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [10]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf

# Load the training dataset
train_data = pd.read_csv('train.tsv', delimiter='\t')  # Replace with the actual path

# Load the testing dataset
test_data = pd.read_csv('test.tsv', delimiter='\t')  # Replace with the actual path
test_data = test_data[test_data['Phrase'].apply(lambda x: isinstance(x, str))]

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences to 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Tokenize the 'Phrase' column of the training data using the BERT tokenizer
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

y_train = train_data['Sentiment'].values

# Tokenize the 'Phrase' column of the testing data using the BERT tokenizer
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Convert the tokenized testing data into a hashable format (dictionary of TensorFlow tensors)
X_test_hashable = {
    'input_ids': X_test_tokenized['input_ids'],
    'token_type_ids': X_test_tokenized['token_type_ids'],
    'attention_mask': X_test_tokenized['attention_mask'],
}

# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0, 1, 2, 3, or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
optimizer = Adam(learning_rate=2e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')

bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
bert_model.fit(X_train_tokenized, y_train, epochs=3, batch_size=32)

# Predict the sentiment labels of the testing data
predictions = bert_model.predict(X_test_hashable)
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

print("Predicted labels for testing data: ", np.array(predicted_class).tolist())


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


ValueError: Cannot generate a hashable key for IteratorSpec(({'input_ids': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None)),) because the _serialize() method returned an unsupproted value of type <class 'transformers.tokenization_utils_base.BatchEncoding'>

In [1]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf

# Load the training dataset (replace with your actual path)
train_data = pd.read_csv('train.tsv', delimiter='\t')

# Load the testing dataset (replace with your actual path)
test_data = pd.read_csv('test.tsv', delimiter='\t')

# Remove rows with non-string values in the 'Phrase' column
train_data = train_data[train_data['Phrase'].apply(lambda x: isinstance(x, str))]
test_data = test_data[test_data['Phrase'].apply(lambda x: isinstance(x, str))]

# Initialize a BERT tokenizer from the pre-trained model 'bert-base-uncased' and set the maximum length of the tokenized sequences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# Tokenize the 'Phrase' column of the training data
X_train_tokenized = tokenizer(
    train_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Tokenize the 'Phrase' column of the testing data
X_test_tokenized = tokenizer(
    test_data['Phrase'].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf'
)

# Extract labels from training data
y_train = train_data['Sentiment'].values

# Load the pre-trained BERT model for sequence classification, set num_labels to 5 since it can be 0, 1, 2, 3, or 4
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Compile the model
optimizer = Adam(learning_rate=2e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')

bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
bert_model.fit(X_train_tokenized, y_train, epochs=3, batch_size=32)

# Predict the sentiment labels for the test data
predictions = bert_model.predict(X_test_tokenized)
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

# Print the final predictions
print(predicted_labels)


2023-12-07 03:06:26.879487: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-07 03:06:27.186175: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 03:06:27.186215: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 03:06:27.256222: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-07 03:06:27.409565: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-07 03:06:27.412061: I tensorflow/core/platform/cpu_feature_guard.cc:1

Epoch 1/3


ValueError: Cannot generate a hashable key for IteratorSpec(({'input_ids': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 80), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None)),) because the _serialize() method returned an unsupproted value of type <class 'transformers.tokenization_utils_base.BatchEncoding'>