In [5]:
import pandas as pd

df = pd.read_csv('/kaggle/input/tweet-extraction-train/train.csv')

In [6]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [7]:
df = df.dropna(subset=['text'])

In [8]:
print('max tweet:', df['text'].apply(len).max())


max tweet: 141


In [9]:
!pip install datasets



In [10]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

In [11]:
train_dataset = dataset.train_test_split(test_size=0.1)['train']
eval_dataset = dataset.train_test_split(test_size=0.1)['test']

#Tokenize the data

In [12]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def prepare_train_features(examples):
    sentiment = examples['sentiment']
    tweet = examples['text']
    span = examples['selected_text']

    tokenized_qa = tokenizer(sentiment, # question
                             tweet, # context
                             padding='max_length',
                             return_offsets_mapping=True)

    qa_tokens = tokenized_qa["input_ids"]

    span_tokens = tokenizer(span)["input_ids"]

    tokenized_qa["start_positions"] = []
    tokenized_qa["end_positions"] = []

    start_char = tweet.find(span)
    end_char = start_char + len(span)

    offsets = tokenized_qa.pop("offset_mapping")
    start_token = end_token = None
    for idx, (start, end) in enumerate(offsets):
      if start <= start_char < end:
        start_token = idx
      if start < end_char <= end:
        end_token = idx
        break

    tokenized_qa["start_positions"].append(start_token)
    tokenized_qa["end_positions"].append(end_token)

    if start is None or end is None:
      print(tweet + ' -> ' + span)
      print(f' {qa_tokens} -> {span_tokens}')
      return None

    return tokenized_qa


In [15]:
tokenized_train_dataset = train_dataset.map(prepare_train_features)


Map:   0%|          | 0/24732 [00:00<?, ? examples/s]

In [16]:
tokenized_eval_dataset = eval_dataset.map(prepare_train_features)

Map:   0%|          | 0/2748 [00:00<?, ? examples/s]

In [17]:
tokenized_train_dataset

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 24732
})

In [18]:
tokenized_eval_dataset

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 2748
})

In [19]:
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_eval_dataset.shuffle(seed=42).select(range(1000))

In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [21]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"finetune-BERT-tweet",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [22]:

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [23]:

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [24]:
import os
os.environ["WANDB_MODE"] = "disabled"

In [25]:
import torch

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.755234
2,No log,1.47894
3,No log,1.455392




TrainOutput(global_step=189, training_loss=1.984864734468006, metrics={'train_runtime': 222.1353, 'train_samples_per_second': 13.505, 'train_steps_per_second': 0.851, 'total_flos': 783890270208000.0, 'train_loss': 1.984864734468006, 'epoch': 3.0})

In [29]:
from transformers import AutoModelForTokenClassification, AutoTokenizer


# Load your trained model and tokenizer (replace 'path_to_save_model' with the actual path)
model = AutoModelForQuestionAnswering.from_pretrained('/kaggle/working/finetune-BERT-tweet/checkpoint-189')
tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/finetune-BERT-tweet/checkpoint-189')



In [32]:
# Sample input tweet and sentiment (question)
tweet = " you guys didn`t say hi or answer my questions yesterday  but nice songs."
sentiment = "positive"

# Tokenize the inputs
inputs = tokenizer(
    sentiment,  # the sentiment question
    tweet,      # the tweet context
    padding=True,
    truncation=True,
    return_tensors="pt"
)


In [33]:
import torch

# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the predicted start and end positions
    start_position = torch.argmax(start_logits, dim=-1)
    end_position = torch.argmax(end_logits, dim=-1)

    # Convert token IDs to actual text spans
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    
    # Extract the sentiment span from the tweet
    sentiment_span = tokens[start_position:end_position+1]
    sentiment_text = tokenizer.convert_tokens_to_string(sentiment_span)
    
    print(f"Predicted sentiment span: {sentiment_text}")


Predicted sentiment span: nice songs.
