In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd

twitter_dataset = "carblacac/twitter-sentiment-analysis"
alpace_dataset = "tatsu-lab/alpaca"

train_len = 30000
test_len = 50

dataset = load_dataset(twitter_dataset)

train_twitter = pd.DataFrame(dataset['train'][:train_len])
test_twitter = pd.DataFrame(dataset['test'][:test_len])
train_twitter['feeling'] = train_twitter['feeling'].astype(str)
test_twitter['feeling'] = test_twitter['feeling'].astype(str)


dataset_original = load_dataset(alpace_dataset)
train_alpaca = dataset_original['train'][:train_len]


In [2]:
train_twitter.head()

Unnamed: 0,text,feeling
0,@fa6ami86 so happy that salman won. btw the 1...,0
1,@phantompoptart .......oops.... I guess I'm ki...,0
2,@bradleyjp decidedly undecided. Depends on the...,1
3,@Mountgrace lol i know! its so frustrating isn...,1
4,@kathystover Didn't go much of any where - Lif...,1


In [3]:
instructions = {
    'instruction_1': "Carefully read the following tweet. Assess the overall sentiment expressed by considering the choice of words and the tone. Label the tweet with '1' if the overall sentiment is positive, reflecting happiness or satisfaction, and '0' if it reflects a negative sentiment such as sadness or anger.",
    'instruction_2': "Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent."
}

def format_instruction(row, instruction):
    # Adding the instruction before the tweet and appending the sentiment analysis result
    return f"{instruction}\n\n### Tweet: '{row['text']}'\n\n### Sentiment: {row['feeling']}"

# Applying the function to the DataFrame
train_twitter['input_1'] = train_twitter.apply(lambda x: format_instruction(x, instructions['instruction_1']), axis=1)
train_twitter['input_2'] = train_twitter.apply(lambda x: format_instruction(x, instructions['instruction_2']), axis=1)

train_twitter.head()

Unnamed: 0,text,feeling,input_1,input_2
0,@fa6ami86 so happy that salman won. btw the 1...,0,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
1,@phantompoptart .......oops.... I guess I'm ki...,0,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
2,@bradleyjp decidedly undecided. Depends on the...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
3,@Mountgrace lol i know! its so frustrating isn...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."
4,@kathystover Didn't go much of any where - Lif...,1,Carefully read the following tweet. Assess the...,"Analyze the mood conveyed in the tweet below, ..."


In [4]:
print(train_twitter.iloc[0]['input_2'])

Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent.

### Tweet: '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser'

### Sentiment: 0


In [5]:
hf_dataset = Dataset.from_pandas(train_twitter[["input_2", "feeling"]])
hf_dataset = hf_dataset.rename_column("input_2", "input").rename_column("feeling", "response")

if isinstance(train_alpaca, dict):
    train_alpaca = Dataset.from_dict({
        'input': train_alpaca['text'],
        'response': train_alpaca['output']  # Assuming 'response' is the column you want to keep
    })  # Directly rename while converting and keep additional column

combined_dataset = concatenate_datasets([hf_dataset, train_alpaca]).shuffle(seed=42)
print(combined_dataset)


Dataset({
    features: ['input', 'response'],
    num_rows: 60000
})


# Fine-tuning

In [6]:
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from tqdm.auto import tqdm

In [7]:
base_model = 'mistralai/Mistral-7B-v0.1'
new_model = 'models/Mistral-7B-instruct-sentiment-tuned'
output_dir = 'results/Mistral-7B-instruct-finetuned'

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.95s/it]


In [9]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [11]:
training_params = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="all",
)


In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=combined_dataset,
    peft_config=peft_params,
    dataset_text_field="input",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

Map: 100%|██████████| 60000/60000 [00:04<00:00, 12503.49 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train()

Step,Training Loss
100,0.9134
200,0.7846
300,0.7635
400,0.763
500,0.7754
600,0.7648
700,0.7616
800,0.7598
900,0.7543
1000,0.7611


TrainOutput(global_step=1875, training_loss=0.7658797587076823, metrics={'train_runtime': 6913.2369, 'train_samples_per_second': 8.679, 'train_steps_per_second': 0.271, 'total_flos': 3.04636049690198e+17, 'train_loss': 0.7658797587076823, 'epoch': 1.0})

In [14]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('models/Mistral-7B-instruct-sentiment-tuned/tokenizer_config.json',
 'models/Mistral-7B-instruct-sentiment-tuned/special_tokens_map.json',
 'models/Mistral-7B-instruct-sentiment-tuned/tokenizer.json')

# Evaluating

In [3]:
from transformers import logging
logging.set_verbosity_error()

from transformers import AutoModelForCausalLM, AutoTokenizer

new_model = 'models/Mistral-7B-instruct-sentiment-tuned'

model = AutoModelForCausalLM.from_pretrained(new_model)
tokenizer = AutoTokenizer.from_pretrained(new_model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]


In [9]:
prompt = "Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent.\n\n### Tweet: '@justineville ...yeahhh. ) i'm 39 tweets from 1,600!'\n\n### Sentiment:"

pipe = pipeline(task="text-generation", model=new_model, tokenizer=new_model)
result = pipe(prompt, max_new_tokens=2)

print(result[0]['generated_text'].split('\n\n### Sentiment: ')[1])

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


1
