In [1]:
!pip install transformers
!pip install langchain
!pip install datasets
!pip install torch torchvision torchaudio
from datasets import load_dataset


Collecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain-cor

In [None]:
data_files = {
    "train": "CNNTrain.csv",
    "test": "CNNTest.csv",
    "validation": "CNNValidation.csv"
}

dataset = load_dataset('csv', data_files=data_files)

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

def preprocess_function(examples):
    inputs = examples['article']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['highlights'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)




tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

# Load the model
model = transformers.AutoModelForCausalLM.from_pretrained(
  'tiiuae/falcon-7b-instruct',
  trust_remote_code=True
)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,   # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=10_000,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Train the model
trainer.train()


In [None]:
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate,  LLMChain


# Set up the pipeline for text generation
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, max_new_tokens=100, do_sample=True, use_cache=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id)

# Wrap the pipeline in HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0.1})

# Define the prompt template
template = """
Write a concise summary of the following text delimited by triple backquotes.
```{text}```
SUMMARY:
"""

prompt = PromptTemplate(template=template, input_variables=["text"])

# Create LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

  self.pid = os.fork()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [None]:
df = pd.DataFrame(dataset['test'])[['article', 'highlights','id']]

In [None]:
df.head()

Unnamed: 0,article,highlights,id
0,CNNThe Palestinian Authority officially became...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01
1,CNNNever mind cats having nine lives. A stray ...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef
2,CNNIf youve been following the news lately the...,Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69
3,CNNFive Americans who were monitored for three...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca
4,CNNA Duke student has admitted to hanging a no...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f


In [None]:
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,article,highlights
0,CNNThe Palestinian Authority officially became...,Membership gives the ICC jurisdiction over all...
1,CNNNever mind cats having nine lives. A stray ...,"Theia, a bully breed mix, was apparently hit b..."
2,CNNIf youve been following the news lately the...,Mohammad Javad Zarif has spent more time with ...
3,CNNFive Americans who were monitored for three...,17 Americans were exposed to the Ebola virus w...
4,CNNA Duke student has admitted to hanging a no...,Student is no longer on Duke University campus...


In [None]:
df['model_generated'] = ""

# Define a function to generate summaries and populate the 'model_generated' column
def generate_and_store_summary(row):
    article_text = row['article']
    summary = llm_chain.run(article_text)
    return summary

In [None]:
df['model_generated'] = df.apply(generate_and_store_summary, axis=1)

In [None]:
# Apply the logic to all rows of the 'model_generated' column
df['model_generated'] = df['model_generated'].apply(lambda text: text.split('SUMMARY:')[1].strip() if 'SUMMARY:' in text else text)

In [5]:
print(df[['article', 'model_generated']].head(10))

                                             article  \
0  CNNThe Palestinian Authority officially became...   
1  CNNNever mind cats having nine lives. A stray ...   
2  CNNIf youve been following the news lately the...   
3  CNNFive Americans who were monitored for three...   
4  CNNA Duke student has admitted to hanging a no...   
5  CNNHes a blue chip college basketball recruit....   
6  CNNGovernments around the world are using the ...   
7  CNNAndrew Getty one of the heirs to billions o...   
8  CNNFilipinos are being warned to be on guard f...   
9  CNNFor the first time in eight years a TV lege...   

                                     model_generated  
0  International Criminal Court has now 123 membe...  
1  The dog, Theia, who was thought dead after bei...  
2  Mr. Ambassador: A memoir of one Iranian in a p...  
3  - 4+ Americans who had been quarantined in Oma...  
4  - College student with noose “hangs” it near c...  
5  In a high school gym, a special needs girl nam... 

In [9]:
df['article'][8]

'CNNFilipinos are being warned to be on guard for flash floods and landslides as tropical storm Maysak approached the Asian island nation Saturday. Just a few days ago Maysak gained super typhoon status thanks to its sustained 150 mph winds. It has since lost a lot of steam as it has spun west in the Pacific Ocean. Its now classified as a tropical storm according to the Philippine national weather service which calls it a different name Chedeng. It boasts steady winds of more than 70 mph 115 kph and gusts up to 90 mph as of 5 p.m. 5 a.m. ET Saturday. Still that doesnt mean Maysak wont pack a wallop. Authorities took preemptive steps to keep people safe such as barring outdoor activities like swimming surfing diving and boating in some locales as well as a number of precautionary evacuations. Gabriel Llave a disaster official told PNA that tourists who arrive Saturday in and around the coastal town of Aurora will not be accepted by the owners of hotels resorts inns and the like ... and 

In [10]:
df['model_generated'][8]

'A super typhoon moving west brought serious concern to Filipinos Sunday morning. Authorities took necessary precautions with evacuations, no outdoor activities, and a specific hotel policy.'

In [14]:
!pip install rouge
!pip install nltk
!pip install bert_score

from rouge import Rouge

# Initialize the ROUGE evaluator
rouge = Rouge()

sampled_df = df.dropna(subset=['model_generated', 'highlights'])

# Extract the generated summaries and reference summaries for the selected samples
generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['highlights'].tolist()

# Calculate ROUGE scores for the selected samples
rouge_scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)

from nltk.translate.bleu_score import corpus_bleu


# Extract the generated summaries and reference summaries for the selected samples
generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['highlights'].tolist()

# Calculate BLEU score for the selected samples
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

from bert_score import score

# Extract the generated summaries and reference summaries for the selected samples
generated_summaries = sampled_df['model_generated'].tolist()
reference_summaries = sampled_df['highlights'].tolist()

# Calculate BERT Score
P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)




The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  self.pid = os.fork()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.72 seconds, 13.85 sentences/sec


In [15]:
# Print BERT Score
print("ROUGE Scores:", rouge_scores)
print("BLEU Score for Summaries:", bleu_score)
print("BERT Precision:", P.mean().item())
print("BERT Recall:", R.mean().item())
print("BERT F1 Score:", F1.mean().item())

ROUGE Scores: {'rouge-1': {'r': 0.2748651562180974, 'p': 0.17366179037714172, 'f': 0.2068370010676258}, 'rouge-2': {'r': 0.04749552615406274, 'p': 0.025647032783723333, 'f': 0.03211648712033251}, 'rouge-l': {'r': 0.23505825688178633, 'p': 0.14947103638238043, 'f': 0.17777519303265638}}
BLEU Score for Summaries: 9.435647963507199e-232
BERT Precision: 0.8139855265617371
BERT Recall: 0.8588894009590149
BERT F1 Score: 0.835396945476532
