In [None]:
from datasets import load_dataset
import pandas as pd

dataset_name = "carblacac/twitter-sentiment-analysis"
train_len = 30000
test_len = 50

dataset = load_dataset(dataset_name)
train_df = pd.DataFrame(dataset['train'][:train_len])
test_df = pd.DataFrame(dataset['test'][:test_len])

In [2]:
train_df.head()

Unnamed: 0,text,feeling
0,@fa6ami86 so happy that salman won. btw the 1...,0
1,@phantompoptart .......oops.... I guess I'm ki...,0
2,@bradleyjp decidedly undecided. Depends on the...,1
3,@Mountgrace lol i know! its so frustrating isn...,1
4,@kathystover Didn't go much of any where - Lif...,1


In [3]:
instructions = {
    'instruction_1': "Carefully read the following tweet. Assess the overall sentiment expressed by considering the choice of words and the tone. Label the tweet with '1' if the overall sentiment is positive, reflecting happiness or satisfaction, and '0' if it reflects a negative sentiment such as sadness or anger.",
    'instruction_2': "Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent."
}

def format_instruction(row, instruction):
    return f"{instruction}\n\n### Tweet: '{row['text']}'\n\n### Sentiment: {row['feeling']}"

train_df['input_1'] = train_df.apply(lambda x: format_instruction(x, instructions['instruction_1']), axis=1)
train_df['input_2'] = train_df.apply(lambda x: format_instruction(x, instructions['instruction_2']), axis=1)

train_df.head()

Unnamed: 0,text,feeling,input_1,input_2
0,@fa6ami86 so happy that salman won. btw the 1...,0,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
1,@phantompoptart .......oops.... I guess I'm ki...,0,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
2,@bradleyjp decidedly undecided. Depends on the...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
3,@Mountgrace lol i know! its so frustrating isn...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
4,@kathystover Didn't go much of any where - Lif...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."


In [4]:
print(train_df.iloc[0]['input_1'])

Carefully read the following tweet. Assess the overall sentiment expressed by considering the choice of words and the tone. Label the tweet with '1' if the overall sentiment is positive, reflecting happiness or satisfaction, and '0' if it reflects a negative sentiment such as sadness or anger.

### Tweet: '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser'

### Sentiment: 0


# Fine-tuning

In [5]:
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from tqdm.auto import tqdm

In [6]:
base_model = 'mistralai/Mistral-7B-v0.1'
new_model = 'models/Mistral-7B-sentiment-tuned'
output_dir = 'results/Mistral-7B-sentiment-tuned'

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.02s/it]


In [8]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [9]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [10]:
training_params = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="all",
)


In [11]:
from datasets import Dataset
hf_dataset = Dataset.from_pandas(train_df)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=hf_dataset,
    peft_config=peft_params,
    dataset_text_field="input_2",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

In [13]:
trainer.train()

Step,Training Loss
100,0.827
200,0.7033
300,0.6853
400,0.6917


TrainOutput(global_step=468, training_loss=0.7216785382001828, metrics={'train_runtime': 2059.9567, 'train_samples_per_second': 14.563, 'train_steps_per_second': 0.227, 'total_flos': 1.486091600168878e+17, 'train_loss': 0.7216785382001828, 'epoch': 0.9984})

In [14]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('models/Mistral-7B-sentiment-tuned/tokenizer_config.json',
 'models/Mistral-7B-sentiment-tuned/special_tokens_map.json',
 'models/Mistral-7B-sentiment-tuned/tokenizer.json')