In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf


data = pd.read_csv('output.csv')


X_train, X_test, y_train, y_test = train_test_split(data['Sentence'], data['sentiment'], test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

X_train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128, return_tensors='tf')
X_test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128, return_tensors='tf')

label_mapping = {'positive': 1, 'neutral': 0, 'negative': 2}
y_train = y_train.map(label_mapping)
y_test = y_test.map(label_mapping)

train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train_encodings), y_train.values)).shuffle(len(X_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test_encodings), y_test.values)).batch(32)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_dataset, epochs=3)

results = model.evaluate(test_dataset)
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

y_pred_prob = model.predict(dict(X_test_encodings))
y_pred = np.argmax(y_pred_prob['logits'], axis=1)

inverse_label_mapping = {v: k for k, v in label_mapping.items()}
y_pred_labels = pd.Series(y_pred).map(inverse_label_mapping)

print("Classification Report:")
print(classification_report(y_test, y_pred_labels))


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [5]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sentence,sentiment
0,0,0,The GeoSolutions technology will leverage Bene...,Positive
1,1,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",Negative
2,2,2,"For the last quarter of 2010 , Componenta 's n...",Positive
3,3,3,According to the Finnish-Russian Chamber of Co...,Neutral
4,4,4,The Swedish buyout firm has sold its remaining...,Neutral
