In [1]:
!pip install transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/21/02/ae8e595f45b6c8edee07913892b3b41f5f5f273962ad98851dc6a564bbb9/transformers-4.31.0-py3-none-any.whl.metadata
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.14.1 from https://files.pythonhosted.org/packages/7f/c4/adcbe9a696c135578cabcbdd7331332daad4d49b7c43688bc2d36b3a47d2/huggingface_hub-0.16.4-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Load data
# The file is read without a header row, so the two columns are named
# explicitly right after loading.
df = pd.read_csv(
    "all-data.csv",
    header=None,
    encoding="ISO-8859-1",
)
df.columns = ['Sentiment', 'Text']

# Basic list of stopwords
# Built by whitespace-splitting one literal; the result is a plain `set` used
# for membership tests during preprocessing.
# NOTE(review): the list contains negations ('not', 'no', 'nor') and direction
# words ('up', 'down', 'above', 'below', 'more') -- confirm that dropping them
# is intended for a sentiment task.
basic_stopwords = set(
    """
    ourselves hers between yourself but
    again there about once during out
    very having with they own an be
    some for do its yours such into
    of most itself other off is s am
    or who as from him each the themselves
    until below are we these your his through
    don nor me were her more himself this
    down should our their while above both
    up to ours had she all no when at
    any before them same and been have in
    will on does yourselves then that because
    what over why so can did not now under
    he you herself has just where too only
    myself which those i after few whom t
    being if theirs my against a by doing
    it how further was here than
    """.split()
)

# Text preprocessing
def preprocess_text(text, stopwords=None):
    """Normalise one document into a space-joined string of kept tokens.

    The text is split on whitespace. A token is kept only if it is purely
    alphabetic (so tokens containing digits or attached punctuation such as
    "10%" or "rose," are dropped); kept tokens are lower-cased and then
    filtered against the stopword collection.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing cells, or ``None``) is
            treated as an empty document instead of raising.
        stopwords: Optional collection of lower-case words to remove. When
            omitted, the module-level ``basic_stopwords`` is used, which is
            the original behaviour.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or the input is not a string.
    """
    # Missing/non-text cells have no .split(); map them to an empty document.
    if not isinstance(text, str):
        return ''

    # Resolve the default lazily so the global is only needed when it is used.
    if stopwords is None:
        stopwords = basic_stopwords

    # isalpha() is checked on the original token, before lower-casing.
    lowered = (token.lower() for token in text.split() if token.isalpha())
    return ' '.join(token for token in lowered if token not in stopwords)




df['Processed_Text'] = df['Text'].apply(preprocess_text)

# Sentiment encoding
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# .map() turns any label missing from label_map into NaN (unlike .replace(),
# which would silently leave the original string in place), so unexpected
# values can be detected and reported before they reach the model.
encoded_sentiment = df['Sentiment'].map(label_map)
unmapped_labels = df.loc[encoded_sentiment.isna(), 'Sentiment'].unique()
if len(unmapped_labels):
    raise ValueError(
        f"Unexpected sentiment labels: {sorted(map(str, unmapped_labels))}"
    )

# Explicit integer dtype: the labels are later converted to an int64 tensor.
df['Encoded_Sentiment'] = encoded_sentiment.astype('int64')

In [19]:
# Create the tokenizer in this cell: nothing earlier in the notebook defines
# `tokenizer`, so a fresh kernel would otherwise stop here with a NameError.
# The checkpoint name matches the one the model is loaded from.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Split data first
# A fixed random_state makes the 80/20 split identical on every run.
# NOTE(review): consider stratify=<labels> if the classes are imbalanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Processed_Text'],
    df['Encoded_Sentiment'].values,
    test_size=0.2,
    random_state=42,
)

# Tokenize the split data
# Both splits share the same settings: truncate to at most 256 tokens, pad,
# and return TensorFlow tensors.
tokenize_options = dict(truncation=True, padding=True, max_length=256, return_tensors='tf')
train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)


In [20]:
import tensorflow as tf
from tensorflow import keras

In [21]:
# Convert encodings to tf.Tensor
def encode_tf_tensors(encodings):
    """Build the model-input dict of int32 tensors from tokenizer output.

    Only the ``'input_ids'`` and ``'attention_mask'`` entries of ``encodings``
    are read; any other entries are ignored. Each is passed through
    ``tf.convert_to_tensor`` with ``dtype=tf.int32``.

    Returns:
        A dict with exactly those two keys, in that order.
    """
    wanted_keys = ('input_ids', 'attention_mask')
    return {
        key: tf.convert_to_tensor(encodings[key], dtype=tf.int32)
        for key in wanted_keys
    }

# Each dataset is an (inputs, labels) pair: the int32 input dict produced by
# encode_tf_tensors and the matching labels as an int64 tensor.
train_inputs = encode_tf_tensors(train_encodings)
train_targets = tf.convert_to_tensor(train_labels, dtype=tf.int64)
train_data = (train_inputs, train_targets)

val_inputs = encode_tf_tensors(val_encodings)
val_targets = tf.convert_to_tensor(val_labels, dtype=tf.int64)
val_data = (val_inputs, val_targets)

# Seed TensorFlow's global RNG before any weights are created. The
# classification head is newly initialised (see the load-time log below), so
# without a seed every run starts from different weights.
# NOTE(review): this should make runs repeatable on the same software/hardware
# stack; GPU kernels may still introduce small nondeterminism.
tf.random.set_seed(42)

# Load DistilBERT model for TensorFlow
# The head size is derived from label_map so it cannot drift from the encoding.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_map),
)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Integer class ids as targets; from_logits=True tells the loss that the
# model's outputs are unnormalised scores rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
# Three epochs in mini-batches of eight, with the held-out split supplied as
# validation data.
model.fit(
    x=train_data[0],
    y=train_data[1],
    batch_size=8,
    epochs=3,
    validation_data=val_data,
)

# Evaluate the model
# `results` holds the loss followed by the compiled accuracy metric.
results = model.evaluate(x=val_data[0], y=val_data[1])
print(results)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
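You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.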

Epoch 1/3
Epoch 2/3
Epoch 3/3
[0.5897536873817444, 0.8051546216011047]


In [22]:
# Persist the fine-tuned weights and the tokenizer files side by side in the
# same directory. The tokenizer call stays last so its return value (the
# written file paths) is displayed as the cell output.
save_dir = "./saved_model/"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')