# Fine-tuning an Text Generation LLM

This program will try to fine-tune a text generation LLM (facebook/bart-large-cnn) using a combination of object list and short captions to generate long/descriptive captions of the images

For the training purposes, object list will be extracted using text processing on the long captions. Short captions will be generated by summarizing the long captions using OpenAI API. Then the LLM namely `facebook/bart-large-cnn` will be fine-tuned to learn the generation of the long captions.

For the prediction purposes, object list will be generated using the object detection model (`retinanet-resnet50-fpn`) using the provided image. And the a caption generating model (`Salesforce/blip-image-captioning-base`) will be used to generate the short captions. Then the combination of `object list` and `short captions` will be passed as an input to the fine-tuned model to predict the long captions

In [3]:
!pip install accelerate -U
!pip install datasets
!pip install transformers
!pip install openai
!pip install sentencepiece
!pip install rouge_score
!pip install sentence_transformers

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/265.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer
import string
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Check GPU is detected by CUDA
import torch
print(torch.cuda.is_available())

True


In [None]:
# Check GPU is detected by CUDA
import torchvision
print(torch.cuda.is_available())

True


## Download pretrained model

Get the cooperhewitt pre-processed dataset from here:
https://utexas-my.sharepoint.com/personal/ctm2723_my_utexas_edu/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fctm2723%5Fmy%5Futexas%5Fedu%2FDocuments%2Fcooperhewitt%2Dimage%2Ddataset%2Ezip&parent=%2Fpersonal%2Fctm2723%5Fmy%5Futexas%5Fedu%2FDocuments&ga=1

In [None]:
import pandas as pd

df_train = pd.read_csv('cooperhewitt-image-dataset/ids_train.csv')
df_test = pd.read_csv('cooperhewitt-image-dataset/ids_test.csv')

Define a text processing function to extract the object list from the long captions

In [None]:
# Initializing the stop words of the English Language
stop_words = set(stopwords.words('english'))

# Initializaing the Wordnet Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def extract_object_list(long_caption):
  caption_tokens = word_tokenize(long_caption)

  # Converting to lower case
  caption_tokens = [token.lower() for token in caption_tokens]

  # Removing punctuation
  caption_tokens = [token for token in caption_tokens if token not in string.punctuation]

  # Removing stop words
  caption_tokens = [token for token in caption_tokens if token not in stop_words]

  # Perform wordnet lemmatization
  caption_tokens = [wordnet_lemmatizer.lemmatize(token) for token in caption_tokens]

  # Getting unique words
  caption_tokens = list(set(caption_tokens))

  return caption_tokens

In [None]:
df_train['objects_list'] = df_train['description'].apply(extract_object_list)
df_test['objects_list'] = df_test['description'].apply(extract_object_list)

Define a function to get the summary of the long caption using Open AI API. Don't forget to add your OpenAI API key here

In [4]:
from openai import OpenAI

def generate_short_captions_openai(long_caption):
  global short_captions
  global caption_count

  gpt_content = "Summarize the following caption to only 10 words: " + long_caption
  client = OpenAI(api_key = '') #specify your OpenAI Key here
  completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a text summarization assistant, skilled in summarizing long image captions to short captions"},
    {"role": "user", "content": gpt_content}
  ]
  )
  gpt_reply = completion.choices[0].message.content
  short_captions.append(gpt_reply)
  caption_count = caption_count + 1
  print(f"{caption_count}: {gpt_reply}")
  return gpt_reply

In [None]:
short_captions = []
caption_count = 0

df_train['short_caption'] = df_train['description'].apply(generate_short_captions_openai)
df_train.to_csv('llm_text_generation_train_df.csv', index=False)

with open('llm_text_generation_train_short_captions.pkl', 'wb') as f:
  pickle.dump(short_captions, f)

1001: Iridescent glass shade with orange-gold color, diamond pattern, round lip.
1002: Hot spring with colorful water and mountain under blue sky.
1003: Scalloped buttons with daisy design; slight variations; unpolished steel shanks.
1004: French captions depict travelers on road- St. Cloud, Poissy, Boules, Cornemuse.
1005: White clay tile with embossed inscription, rosette and mustard glaze.
1006: Satirical scenes in cotton print: soldiers, horse, woman, camp.
1007: Grid with brown lines and red/white flowers on black ground.
1008: Wild flowers in various colors on gray-white ground with border.
1009: Woven linen towel with rectangles, birds, flowers, and short fringe.
1010: Torus, leaf border, undulated tree stem with flowers and leaves.
1011: Cottage with tree, canal, and windmill in the distance.
1012: Stylized bush clover on stripes, with silk thread for support.
1013: Maple tree in center, trees on slope, distant background, cream margin.
1014: Pointed oval bowl with pierced star

In [None]:
short_captions = []
caption_count = 0

df_test['short_caption'] = df_test['description'].apply(generate_short_captions_openai)
df_test.to_csv('llm_text_generation_test_df.csv', index=False)

with open('llm_text_generation_test_short_captions.pkl', 'wb') as f:
  pickle.dump(short_captions, f)

## Loading the saved files

In [None]:
with open('llm_text_generation_train_short_captions.pkl', 'rb') as f:
  train_short_captions = pickle.load(f)

with open('llm_text_generation_test_short_captions.pkl', 'rb') as f:
  test_short_captions = pickle.load(f)

In [None]:
import pandas as pd
from ast import literal_eval

df_train = pd.read_csv('llm_text_generation_train_df.csv')
df_train['objects_list'] = df_train['objects_list'].apply(lambda x: literal_eval(x))

df_test = pd.read_csv('llm_text_generation_test_df.csv')
df_test['objects_list'] = df_test['objects_list'].apply(lambda x: literal_eval(x))

Define a function to form the input prompt for LLM Text generation model using a combination of short captions and object list

In [None]:
def generate_llm_input(objects_list, short_caption):
  object_list = ', '.join(obj for obj in objects_list)
  llm_input = 'objects: ' + object_list + '; ' + 'caption: ' + short_caption
  return llm_input

In [None]:
df_train['llm_input'] = df_train.apply(lambda x: generate_llm_input(x['objects_list'], x['short_caption']), axis=1)
df_test['llm_input'] = df_test.apply(lambda x: generate_llm_input(x['objects_list'], x['short_caption']), axis=1)

In [None]:
df_train.iloc[0]['llm_input']

'objects: enter, bird, bottom, composition, framing, falcon, hand, one, background, foreground, left, line, standing, hunt, two, landscape, horseman; caption: Two horsemen, falcon, hunting birds, framed line at bottom.'

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
from datasets import Dataset
def create_dataset(df_caption):
  inputs = df_caption['llm_input'].tolist()
  targets = df_caption['description'].tolist()

  dataset_dict = {"input_text": inputs, "target_text": targets}

  # Create a huggingface dataset from dictionary
  dataset = Dataset.from_dict(dataset_dict)

  # Tokenize the data into 1-hot encoded values for both inputs and outputs
  def tokenize_and_encode(examples):
      inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")
      targets = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")
      print ("Dataset input shape", inputs["input_ids"].shape)
      print ("Dataset output shape", targets["input_ids"].shape)
      return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

  dataset = dataset.map(tokenize_and_encode, batched=True)
  return dataset

In [None]:
train_data = create_dataset(df_train)
validation_data = create_dataset(df_test)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset input shape torch.Size([1000, 300])
Dataset output shape torch.Size([1000, 300])
Dataset input shape torch.Size([1000, 300])
Dataset output shape torch.Size([1000, 300])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset input shape torch.Size([100, 300])
Dataset output shape torch.Size([100, 300])


In [None]:
# Fine-tune the model
training_args = TrainingArguments(
    output_dir="bart_large_cnn_checkpoints",          # output directory
    num_train_epochs=3,              # total number of training epochs
    label_names=["labels"],
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=50,                  # number of updates steps before checkpoint saves
    save_total_limit=1,         # limit the total amount of saved checkpoints
    logging_steps = 10          #print losses after 10 steps
    )

trainer = Trainer(
    model=model,                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                   # training arguments, defined above
    train_dataset= train_data,       # training dataset
    eval_dataset = validation_data
)

trainer.train()

# Save the model after training
model_path = "bart_large_cnn_final_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Step,Training Loss,Validation Loss
10,6.572,2.01929
20,1.4543,0.727305
30,0.7111,0.485032
40,0.4915,0.41281
50,0.4192,0.377641
60,0.4143,0.36502
70,0.3648,0.356364
80,0.3626,0.335287
90,0.3369,0.336674
100,0.3557,0.316907


Step,Training Loss,Validation Loss
10,6.572,2.01929
20,1.4543,0.727305
30,0.7111,0.485032
40,0.4915,0.41281
50,0.4192,0.377641
60,0.4143,0.36502
70,0.3648,0.356364
80,0.3626,0.335287
90,0.3369,0.336674
100,0.3557,0.316907


('/content/drive/MyDrive/DLMM_Final_Project/bart_large_cnn_final_model/tokenizer_config.json',
 '/content/drive/MyDrive/DLMM_Final_Project/bart_large_cnn_final_model/special_tokens_map.json',
 '/content/drive/MyDrive/DLMM_Final_Project/bart_large_cnn_final_model/vocab.json',
 '/content/drive/MyDrive/DLMM_Final_Project/bart_large_cnn_final_model/merges.txt',
 '/content/drive/MyDrive/DLMM_Final_Project/bart_large_cnn_final_model/added_tokens.json')

## Getting the list of objects (using ResNet50 - torchvision) and short captions (using BLIP) to generate the captions for real images

In [None]:
import pandas as pd

df_test = pd.read_csv('cooperhewitt-image-dataset/ids_test.csv')
files = [f"cooperhewitt-image-dataset/media/{media_id}.jpg" for media_id in df_test['media_id'].to_list()]
descriptions = df_test['description'].to_list()

In [None]:
test_images_files = files # specify any indexed data because of RAM issues
test_long_captions = descriptions # specify any indexed data because of RAM issues

In [None]:
from pycocotools.coco import COCO
annfile = 'annotations/instances_val2017.json'
coco = COCO(annfile)

loading annotations into memory...
Done (t=0.63s)
creating index...
index created!


Open all the image files using PIL

In [None]:
image_list = []
from PIL import Image

for image in test_images_files:
  image_list.append(Image.open(image).resize((200, 200)))

Generate the captions of all the images

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

input = processor(image_list, return_tensors="pt").to(device)

out = model.generate(**input)

short_captions_list = processor.batch_decode(out, skip_special_tokens=True)



Convert all the raw image files to tensor representations

In [None]:
from torchvision.transforms.functional import pil_to_tensor
image_tensor = []
for image in image_list:
  image_tensor.append(pil_to_tensor(image).unsqueeze(dim=0)/255.0)

In [None]:
del image_list

Loading the `retinanet-resenet50-fpn` model

In [None]:
from torchvision.models.detection import retinanet_resnet50_fpn

object_detection_model = retinanet_resnet50_fpn(pretrained=True, progress=False)
object_detection_model.eval(); ## Setting Model for Evaluation/Prediction



Find the objects list of each image using the `retinanet-resnet50-fpn` model

In [None]:
objects_list = []
count = 0
for image in image_tensor:
  image_preds = object_detection_model(image)
  image_labels = coco.loadCats(image_preds[0]['labels'].numpy())
  object_list = [label['name'] for label in image_labels]
  object_list = list(set(object_list))
  object_list = ', '.join(obj for obj in object_list)
  objects_list.append(object_list)
  count = count + 1
  print(count)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


Store the generated short captions and object list

In [None]:
import pickle

with open('llm_text_generation_predict_objects_list.pkl', 'wb') as f:
  pickle.dump(objects_list, f)

with open('llm_text_generation_predict_short_captions_list.pkl', 'wb') as f:
  pickle.dump(short_captions_list, f)

In [None]:
with open('llm_text_generation_predict_objects_list.pkl', 'rb') as f:
  objects_list = pickle.load(f)

with open('llm_text_generation_predict_short_captions_list.pkl', 'rb') as f:
  short_captions_list = pickle.load(f)

NameError: ignored

Forming the prompt to feed into the pre-trained model

In [None]:
llm_inputs = []
for i in range(len(objects_list)):
  llm_input = 'objects: ' + objects_list[i] + '; ' + 'caption: ' + short_captions_list[i]
  llm_inputs.append(llm_input)

In [None]:
llm_inputs[0]

'objects: dining table, wine glass, vase, bowl, cake, spoon, cup, bottle, sandwich; caption: a small cup and saucer'

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = "bart_large_cnn_final_model"

# Example usage of the saved model for evaluation
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = BartTokenizer.from_pretrained(model_path)

longcaption = pipeline(task = "summarization", model = model, tokenizer = tokenizer, device=0)
tokenizer_kwargs = {'truncation':True,'max_length':300}

outputs = longcaption(llm_inputs,**tokenizer_kwargs)

Your max_length is set to 300, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 300, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 300, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 300, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your

In [None]:
with open('llm_text_generation_prediction_outputs.pkl', 'wb') as f:
  pickle.dump(outputs, f)

In [None]:
with open('llm_text_generation_prediction_outputs.pkl', 'rb') as f:
  outputs = pickle.load(f)

In [None]:
pred_long_captions = [output['summary_text'] for output in outputs]

Calculating the Rouge Score between predicted and actual long captions

In [None]:
from datasets import load_metric
rouge = load_metric("rouge")

actual_captions_split = [caption.split() for caption in test_long_captions]
predicted_captions_split = [caption.split() for caption in pred_long_captions]

rouge_results = rouge.compute(predictions=predicted_captions_split, references=actual_captions_split)
rouge_results

  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.12486008536607278, recall=0.1674232131709424, fmeasure=0.13822217136904041), mid=Score(precision=0.13748604146868498, recall=0.18069757344353, fmeasure=0.1496906961619332), high=Score(precision=0.1506209322621803, recall=0.19479302115048627, fmeasure=0.16139330355287948)),
 'rouge2': AggregateScore(low=Score(precision=0.009419001186073186, recall=0.012403532877804298, fmeasure=0.010361730668956728), mid=Score(precision=0.01279623808747482, recall=0.016492913366203666, fmeasure=0.013790604234583786), high=Score(precision=0.016255091004171277, recall=0.021068851193772713, fmeasure=0.017422094492447127)),
 'rougeL': AggregateScore(low=Score(precision=0.09880426381877735, recall=0.13318158964918383, fmeasure=0.10958080502925897), mid=Score(precision=0.1081332879374192, recall=0.14391277432771354, fmeasure=0.11817387003913621), high=Score(precision=0.11777707320283391, recall=0.15505910182764707, fmeasure=0.12699085144783623)),
 'rougeLsum': A

#### Computing Sentence similarity using BERT based sentence vectors

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
model = SentenceTransformer('bert-base-uncased')

.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)kage/Data/com.apple.CoreML/model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

weight.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

(…)sk/float32_model.mlpackage/Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [None]:
actual_captions = [caption.lower() for caption in test_long_captions]
predicted_captions = [caption.lower() for caption in pred_long_captions]

In [None]:
actual_caption_encodings =  model.encode(actual_captions)
predicted_caption_encodings =  model.encode(predicted_captions)

In [None]:
similarity = []
for i in range(len(actual_captions)):
  sent_similarity = cosine_similarity(actual_caption_encodings[i].reshape(1,-1), predicted_caption_encodings[i].reshape(1,-1))
  similarity.append(sent_similarity[0][0])

In [None]:
import numpy as np
print(f"Bert Base Uncased Sentence Similarity: {np.mean(similarity)}")

Bert Base Uncased Sentence Similarity: 0.7098969221115112


#### Computing Sentence similarity using Wordvec

In [None]:
import gensim.downloader as api

# Load the pre-trained Word2Vec model

# Download a pre-trained word2vec (trained on Google News data)
w2v_model = api.load("word2vec-google-news-300")


In [None]:
# Function to extract sentence vector from word vectors using mean
from scipy.spatial.distance import cosine

def extract_sentence_vector(sentence):
    words = sentence.split()
    word_vectors = [w2v_model[word] for word in words if word in w2v_model]
    if not word_vectors:
        return None  # Return None if no word vectors are found
    sentence_vector = np.mean(word_vectors, axis=0)
    return sentence_vector

In [None]:
actual_captions = [caption.lower() for caption in test_long_captions]
predicted_captions = [caption.lower() for caption in pred_long_captions]

In [None]:
actual_caption_encodings =  [extract_sentence_vector(caption) for caption in actual_captions]
predicted_caption_encodings =  [extract_sentence_vector(caption) for caption in predicted_captions]

In [None]:
def similarity (x1, x2):
  # similarity is the opposite of distance
  return 1 - cosine(x1, x2)

In [None]:
similarity(actual_caption_encodings[0], predicted_caption_encodings[0])

0.7230507135391235

In [None]:
similarity_score = []
for i in range(len(actual_captions)):
  sent_similarity = similarity(actual_caption_encodings[i], predicted_caption_encodings[i])
  similarity_score.append(sent_similarity)

In [None]:
import numpy as np
print(f"Word2Vec Sentence Similarity: {np.mean(similarity_score)}")

Word2Vec Sentence Similarity: 0.6026878106594086


### Calculating the average length of captions

In [None]:
actual_caption_len = [len(split_caption) for split_caption in  actual_captions_split]
print(f"Actual Caption Length: {np.mean(actual_caption_len)}")

predicted_caption_len = [len(split_caption) for split_caption in  predicted_captions_split]
print(f"Predicted Caption Length: {np.mean(predicted_caption_len)}")

Actual Caption Length: 35.84
Predicted Caption Length: 51.38
