In [1]:
!pip install transformers
!pip install langchain
!pip install datasets
!pip install torch torchvision torchaudio
from datasets import load_dataset


Collecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.0-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.59-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=

In [None]:
data_files = {
    "train": "XsumTrain.csv",
    "test": "XsumTest.csv",
    "validation": "XsumValidation.csv"
}

dataset = load_dataset('csv', data_files=data_files)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

def preprocess_function(examples):
    inputs = examples['document']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [None]:
from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

# Load the model
model = transformers.AutoModelForCausalLM.from_pretrained(
  'tiiuae/falcon-7b-instruct',
  trust_remote_code=True
)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,   # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=10_000,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Train the model
trainer.train()


In [None]:
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate,  LLMChain


# Set up the pipeline for text generation
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, max_new_tokens=100, do_sample=True, use_cache=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id)

# Wrap the pipeline in HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0.1})

# Define the prompt template
template = """
Write a concise summary of the following text delimited by triple backquotes.
```{text}```
SUMMARY:
"""

prompt = PromptTemplate(template=template, input_variables=["text"])

# Create LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)


In [None]:
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate,  LLMChain


# Set up the pipeline for text generation
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, max_new_tokens=100, do_sample=True, use_cache=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id)

# Wrap the pipeline in HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0.1})

# Define the prompt template
template = """
Write a concise summary of the following text delimited by triple backquotes.
```{text}```
SUMMARY:
"""

prompt = PromptTemplate(template=template, input_variables=["text"])

# Create LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)


Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting regex!=2019.12.17
  Downloading regex-2024.5.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.1/774.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m705.5/705.5 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollec



config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

  warn_deprecated(


In [None]:
df = pd.DataFrame(dataset['test'])[['document', 'summary','id']]

In [None]:
df.head()

Unnamed: 0,document,summary,id
0,Prison Link Cymru had 1099 referrals in 201516...,"There is a ""chronic"" need for more housing for...",38264402
1,Officers searched properties in the Waterfront...,"A man has appeared in court after firearms, am...",34227252
2,Jordan Hill Brittany Covington and Tesfaye Coo...,Four people accused of kidnapping and torturin...,38537698
3,The 48yearold former Arsenal goalkeeper played...,West Brom have appointed Nicky Hammond as tech...,36175342
4,Restoring the function of the organ which hel...,The pancreas can be triggered to regenerate it...,39070183


In [None]:
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,document,summary
0,Prison Link Cymru had 1099 referrals in 201516...,"There is a ""chronic"" need for more housing for..."
1,Officers searched properties in the Waterfront...,"A man has appeared in court after firearms, am..."
2,Jordan Hill Brittany Covington and Tesfaye Coo...,Four people accused of kidnapping and torturin...
3,The 48yearold former Arsenal goalkeeper played...,West Brom have appointed Nicky Hammond as tech...
4,Restoring the function of the organ which hel...,The pancreas can be triggered to regenerate it...


In [None]:
# Create an empty column 'model_generated' in test_df to store the generated summaries
df['model_generated'] = ""

# Define a function to generate summaries and populate the 'model_generated' column
def generate_and_store_summary(row):
    article_text = row['document']
    summary = llm_chain.run(article_text)
    return summary

In [None]:
df['model_generated'] = df.apply(generate_and_store_summary, axis=1)

In [None]:
print(df[['document', 'model_generated']].head(25))

                                             document                                    model_generated
0   Prison Link Cymru had 1099 referrals in 201516...  \n              Write a concise summary of the...
1   Officers searched properties in the Waterfront...  \n              Write a concise summary of the...
2   Jordan Hill Brittany Covington and Tesfaye Coo...  \n              Write a concise summary of the...
3   The 48yearold former Arsenal goalkeeper played...  \n              Write a concise summary of the...
4   Restoring the function of the organ  which hel...  \n              Write a concise summary of the...
5   But there certainly should be.\nThese are two ...  \n              Write a concise summary of the...
6   Media playback is not supported on this device...  \n              Write a concise summary of the...
7   Its no joke. But Kareem Badr says people did l...  \n              Write a concise summary of the...
8   Relieved that the giant telecoms company would...  

In [None]:
# Apply the logic to all rows of the 'model_generated' column
df['model_generated'] = df['model_generated'].apply(lambda text: text.split('SUMMARY:')[1].strip() if 'SUMMARY:' in text else text)

In [None]:
df['model_generated'][7]

'This article is about The art of public speaking. It discusses the 5 most important tips for aspiring presenters and their first tip is to practice your public speaking skills regularly and take on the challenge of improving you skills.\n1. Practice your speech as much as possible.\n2. Get feedback on your speeches.\n3. Understand your audiences needs.\n4. Engage your audience.\n5. Use media to your advantages.\n\nThe first tip for public speaking is to practice your presentation'

In [None]:
df['document'][7]

'Its no joke. But Kareem Badr says people did laugh in 2009 when he and two friends paid 20000 Â13000 for the Hideout in Austin when it wasnt making money and the previous owner decided not to renew the lease.\nWe took over a sinking ship and each brought a bucket to bail it out says Mr Badr.\nNone of us had any experience of running a business. But we loved what we were doing enough that it carried us through.\nThree years ago he was able to quit his day job and draw a salary from the club.\nMr Badr says its still not as much as he used to make as a programmer about 80000 a year but he now employs around 25 part time and contract workers.\nAnd he recently expanded the premises taking over the adjoining coffee house which sells alcohol and leasing more theatre space.\nMr Badr says I think my background in computer science helped because I can take a big problem break it up into small chunks and figure out how to make it better and more efficient.\nThats basically what we did for every 

In [5]:
!pip install rouge
!pip install nltk
!pip install bert_score

from rouge import Rouge

# Initialize the ROUGE evaluator
rouge = Rouge()

sampled_df = df

generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['summary'].tolist()

# Calculate ROUGE scores
rouge_scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)

from nltk.translate.bleu_score import corpus_bleu

# Extract the generated summaries and reference summaries
generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['summary'].tolist()

# Calculate BLEU score
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

from bert_score import score

# Extract the generated summaries and reference summaries
generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['summary'].tolist()

# Calculate BERT Score
P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)




The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 108.91 seconds, 0.23 sentences/sec


In [6]:
print("ROUGE Scores:", rouge_scores)
print("BLEU Score Summaries:", bleu_score)
print("BERT Precision:", P.mean().item())
print("BERT Recall:", R.mean().item())
print("BERT F1 Score:", F1.mean().item())

ROUGE Scores: {'rouge-1': {'r': 0.27325732278071757, 'p': 0.10613480343655533, 'f': 0.15023619119652162}, 'rouge-2': {'r': 0.027487335987026393, 'p': 0.00971654869268867, 'f': 0.013988781536672719}, 'rouge-l': {'r': 0.22933039710024503, 'p': 0.08878675389736798, 'f': 0.12569667271746987}}
BLEU Score Summaries: 9.212320578498223e-232
BERT Precision: 0.7847375273704529
BERT Recall: 0.8650155663490295
BERT F1 Score: 0.8219874501228333
