In [4]:
! pip install datasets
! pip install -U accelerate
! pip install -U transformers[torch]
! pip install rouge_score
! pip install rouge
! pip install bert-extractive-summarizer
! pip install streamlit
! pip install youtube-transcript-api

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     

In [5]:
from datasets import load_dataset, load_metric, Dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, LEDTokenizer, LEDForConditionalGeneration
import accelerate
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
from rouge import Rouge

In [6]:
dataset = load_dataset("potsawee/podcast_summary_assessment")

Downloading readme:   0%|          | 0.00/982 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating evaluation split:   0%|          | 0/3580 [00:00<?, ? examples/s]

In [7]:
dataset

DatasetDict({
    evaluation: Dataset({
        features: ['transcript', 'summary', 'score', 'attributes', 'episode_id', 'system_id'],
        num_rows: 3580
    })
})

In [8]:
dataset = dataset['evaluation'].map(lambda example: {'transcript': example['transcript'], 'summary': example['summary']})

temp_df = pd.DataFrame(dataset)
df = pd.DataFrame()

df = temp_df[['transcript', 'summary']]
df = df.sample(n=500, random_state=42)

train_temp_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

Map:   0%|          | 0/3580 [00:00<?, ? examples/s]

In [9]:
train_dataset = Dataset.from_pandas(train_temp_data)
val_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(test_data)

In [10]:
#train_dataset = load_dataset('bakhitovd/data_science_arxiv', split='train')
#val_dataset = load_dataset('bakhitovd/data_science_arxiv', split='validation')
#test_dataset = load_dataset('bakhitovd/data_science_arxiv', split='test')

In [11]:
#test_data = load_dataset('bakhitovd/data_science_arxiv', split='test')
#test_data

In [12]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [13]:
# set generate hyperparameters
led.config.num_beams = 2
led.config.max_length = 512
led.config.min_length = 100
led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 3

In [14]:
max_input_length = 7168 # it is calculated
max_output_length = 512
batch_size = 1

In [15]:
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["transcript"],
        padding='max_length',
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # create 0 global_attention_mask lists
    batch["global_attention_mask"] = len(batch["input_ids"]) * [
        [0 for _ in range(len(batch["input_ids"][0]))]
    ]

    # since above lists are references, the following line changes the 0 index for all samples
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids

    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [16]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [17]:
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["transcript", "summary"],
)

train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

val_dataset = val_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["transcript", "summary"],
)

val_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [18]:
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [19]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    output_dir="./",
    logging_steps=250,
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
)

In [20]:
trainer = Seq2SeqTrainer(
    model=led,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [21]:
trainer.train()

You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=6, training_loss=2.8768091201782227, metrics={'train_runtime': 49.5944, 'train_samples_per_second': 0.484, 'train_steps_per_second': 0.121, 'total_flos': 113408603062272.0, 'train_loss': 2.8768091201782227, 'epoch': 3.0})

In [22]:
led.save_pretrained("LED_7k_epoch_3")

In [None]:
'''
tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
model = LEDForConditionalGeneration.from_pretrained("patrickvonplaten/led-large-16384-pubmed").to("cuda").half()


def generate_answer(batch):
  inputs_dict = tokenizer(batch["transcript"], padding="max_length", max_length=8192, return_tensors="pt", truncation=True)
  input_ids = inputs_dict.input_ids.to("cuda")
  attention_mask = inputs_dict.attention_mask.to("cuda")
  global_attention_mask = torch.zeros_like(attention_mask)
  # put global attention on <s> token
  global_attention_mask[:, 0] = 1

  predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
  batch["predicted_summary"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
  return batch


result = test_dataset.map(generate_answer, batched=True, batch_size=4)

# load rouge
rouge = load_metric("rouge")

print("Result:", rouge.compute(predictions=result["predicted_summary"], references=result["summary"], rouge_types=["rouge2"])["rouge2"].mid)
'''

'\ntokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")\nmodel = LEDForConditionalGeneration.from_pretrained("patrickvonplaten/led-large-16384-pubmed").to("cuda").half()\n\n\ndef generate_answer(batch):\n  inputs_dict = tokenizer(batch["transcript"], padding="max_length", max_length=8192, return_tensors="pt", truncation=True)\n  input_ids = inputs_dict.input_ids.to("cuda")\n  attention_mask = inputs_dict.attention_mask.to("cuda")\n  global_attention_mask = torch.zeros_like(attention_mask)\n  # put global attention on <s> token\n  global_attention_mask[:, 0] = 1\n\n  predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)\n  batch["predicted_summary"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)\n  return batch\n\n\nresult = test_dataset.map(generate_answer, batched=True, batch_size=4)\n\n# load rouge\nrouge = load_metric("rouge")\n\nprint("Result:", rouge.

In [23]:
def summarize(text):
    inputs_dict = tokenizer(text, padding="max_length", max_length=16384, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids.to("cuda")
    attention_mask = inputs_dict.attention_mask.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    # put global attention on <s> token
    global_attention_mask[:, 0] = 1
    predicted_abstract_ids = led.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=512)
    return tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)

In [25]:
exemple = test_dataset['transcript'][0]
print(exemple)

Hey guys, what's up? And welcome back to the art of cooking babka last week. I went to Munich and it was a great weekend. I spend the weekend there to do a keynote talk also was a part of a panel and yeah had a good time in general so good to be back right now and let's start off with a brand new podcast.This time I've invited Matt angleman and he says it's not the first time that he's in this podcast. It's actually the second I guess he is the I think he's the vice president of Atlantic Records. But yeah, I'm not running 100% sure, but I think he is but he's working at Atlantic Records and he's a genius when it comes down to the music industry. So he's a wise man with a lot of value to offer so It's why I decided to invite him again and go over the world of streaming for this time. And we ended up talking a lot about what you can do as an artist to get signed to a label like Atlantic Records because it seems impossible but there still is an opportunity for you guys. So let's get start

In [26]:
summary = summarize(exemple)
print('\n', summary[0])




 This is the first time I've shared a podcast with Matt Anglemann. I hope you enjoy!This is my first time sharing a podcast. I've been in the business for over a year and a half. I spent the weekend in Munich and it was a great weekend. I spend the weekend there to do a keynote talk also was a part of a panel and yeah had a good time in general so I think at the end of the day creativity is going to be what drives The future of the business at the artists that are more creative are going to continue to thrive because you know, it's you have to separate yourself from the rest of the music industry. You know, you can always count on me to help you build your own music. You can always reach out to your friends and family for help. I also hope that you enjoy listening to me talk about my experiences as an artist and how I got started in the music business. I mean, I hope that this podcast helps you make sense of the world of music and the future of your life. I think that it's interesting

In [None]:
led_predicted_summaries = []
for i in range (len(test_dataset['transcript'])):
  sum = summarize(test_dataset['transcript'][i])
  led_predicted_summaries.append(sum[0])

In [None]:
#print(led_predicted_summaries[0])

In [None]:
# Initialize ROUGE
rouge = Rouge()


# Compute ROUGE scores
#scores = rouge.get_scores(predicted_summaries, test_dataset['transcript'])

# Initialize variables to accumulate individual Rouge scores

def evaluate_summaries(predicted_summaries, target_summaries):

  scores = rouge.get_scores(predicted_summaries, target_summaries)

  rouge_1_r = rouge_1_p = rouge_1_f = 0
  rouge_2_r = rouge_2_p = rouge_2_f = 0
  rouge_l_r = rouge_l_p = rouge_l_f = 0

  # Loop through each Rouge score and accumulate the individual Rouge scores
  for score in scores:
      rouge_1_r += score['rouge-1']['r']
      rouge_1_p += score['rouge-1']['p']
      rouge_1_f += score['rouge-1']['f']

      rouge_2_r += score['rouge-2']['r']
      rouge_2_p += score['rouge-2']['p']
      rouge_2_f += score['rouge-2']['f']

      rouge_l_r += score['rouge-l']['r']
      rouge_l_p += score['rouge-l']['p']
      rouge_l_f += score['rouge-l']['f']

  # Calculate average Rouge scores
  total_summaries = len(scores)

  average_rouge_1_r = rouge_1_r / total_summaries
  average_rouge_1_p = rouge_1_p / total_summaries
  average_rouge_1_f = rouge_1_f / total_summaries

  average_rouge_2_r = rouge_2_r / total_summaries
  average_rouge_2_p = rouge_2_p / total_summaries
  average_rouge_2_f = rouge_2_f / total_summaries

  average_rouge_l_r = rouge_l_r / total_summaries
  average_rouge_l_p = rouge_l_p / total_summaries
  average_rouge_l_f = rouge_l_f / total_summaries

  # Display the average Rouge scores
  print("Rouge-1 Recall : ", average_rouge_1_r)
  print("Rouge-1 Precision : ", average_rouge_1_p)
  print("Rouge-1 F1 Score : ", average_rouge_1_f)

  print("Rouge-2 Recall : ", average_rouge_2_r)
  print("Rouge-2 Precision : ", average_rouge_2_p)
  print("Rouge-2 F1 Score : ", average_rouge_2_f)

  print("Rouge-L Recall : ", average_rouge_l_r)
  print("Rouge-L Precision : ", average_rouge_l_p)
  print("Rouge-L F1 Score : ", average_rouge_l_f)


In [None]:
evaluate_summaries(led_predicted_summaries, test_dataset['summary'])

Rouge-1 Recall :  0.42149869247081273
Rouge-1 Precision :  0.2714034593494864
Rouge-1 F1 Score :  0.2971811306449458
Rouge-2 Recall :  0.1404647471325547
Rouge-2 Precision :  0.08247997357966062
Rouge-2 F1 Score :  0.09191354959538951
Rouge-L Recall :  0.3622276168096147
Rouge-L Precision :  0.2412242703357908
Rouge-L F1 Score :  0.2599233613232824


#Comparaison avec d'autres modèles de résumé

In [30]:
#input_text = str(exemple)
import textwrap

input_text =  test_dataset['transcript'][0]
#input_text = textwrap.fill(input_text, width=150)
input_text = f'''{input_text}'''
print(input_text)

Hey guys, what's up? And welcome back to the art of cooking babka last week. I went to Munich and it was a great weekend. I spend the weekend there to do a keynote talk also was a part of a panel and yeah had a good time in general so good to be back right now and let's start off with a brand new podcast.This time I've invited Matt angleman and he says it's not the first time that he's in this podcast. It's actually the second I guess he is the I think he's the vice president of Atlantic Records. But yeah, I'm not running 100% sure, but I think he is but he's working at Atlantic Records and he's a genius when it comes down to the music industry. So he's a wise man with a lot of value to offer so It's why I decided to invite him again and go over the world of streaming for this time. And we ended up talking a lot about what you can do as an artist to get signed to a label like Atlantic Records because it seems impossible but there still is an opportunity for you guys. So let's get start

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration


bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Tokenize input text
input_ids = bart_tokenizer.encode(input_text, return_tensors="pt")

# Generate summary using BART
summary_ids = bart_model.generate(input_ids, max_length=1024, num_beams=4, length_penalty=2.0, early_stopping=True)
summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("BART Summary:", summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (5838 > 1024). Running this sequence through the model will result in indexing errors


IndexError: ignored

In [None]:
from summarizer import TransformerSummarizer

GPT2_model3 = TransformerSummarizer(transformer_type="GPT2",transformer_model_key="gpt2-large")
Summary = ''.join(GPT2_model3(input_text, min_length=50))
print('\n', Summary)

In [None]:
GPT2_model2 = TransformerSummarizer(transformer_type="GPT2",transformer_model_key="gpt2-medium")
Summary = ''.join(GPT2_model2(input_text, min_length=50))
print('\n', Summary)

In [None]:
pred_BART = []

for i in range(len(test_dataset['transcript'])):

  input_text = test_dataset['transcript'][i]
  input_ids = bart_tokenizer.encode(input_text, return_tensors="pt")
  summary_ids = bart_model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
  summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  pred_BART.append(summary)

evaluate_summaries(pred_BART, test_dataset['summary'])

In [None]:
pred_GPT_large = []

for i in range(len(test_dataset['transcript'])):

  input_text = test_dataset['transcript'][i]
  summary = ''.join(GPT2_model3(input_text, min_length=50))
  pred_GPT_large.append(summary)

evaluate_summaries(pred_GPT_large, test_dataset['summary'])

# Interface web pour YouTube

In [32]:
from youtube_transcript_api import YouTubeTranscriptApi


def get_transcript(video_url):
    try:
        video_id = video_url.split("v=")[1]
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ""
        for entry in transcript:
            text += entry['text'] + " "
        return text
    except Exception as e:
        return str(e)

In [33]:
url  = str(input("Entrez l'url de la vidéo à résumer (en anglais) : "))

Entrez l'url de la vidéo à résumer (en anglais) : https://www.youtube.com/watch?v=gUmagAluXpk&ab_channel=BorisMeinardus


In [34]:
transcript = get_transcript(url)
sum = summarize(transcript)[0]
sum =  textwrap.fill(sum, width=150)
print('\n Résumé de la vidéo : \n', sum)


 Résumé de la vidéo : 
  goes through all the mths of back propagation and so on I cannot recommend this series enough since in Andrew NS and Andre kath's courses you already
get some practical experience with the taught ml Concepts I would then continue on to the next more advanced and practical course the Deep learning
specialization this course focuses more on implementing and training new Nets and the absolutely amazing thing here is that they also include hugging
face which is a library that you pretty much cannot avoid it's really amazing and if you feel like this course doesn't teach you enough about hugging
face you can also just go through the hugging face NLP course directly there


In [35]:
%%writefile transcript.py

import streamlit as st
import torch
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, LEDTokenizer, LEDForConditionalGeneration

def get_transcript(video_url):
    try:
        video_id = video_url.split("v=")[1]
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ""
        for entry in transcript:
            text += entry['text'] + " "
        return text
    except Exception as e:
        return str(e)


tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)


def summarize(text):
    inputs_dict = tokenizer(text, padding="max_length", max_length=16384, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids.to('cpu')
    attention_mask = inputs_dict.attention_mask.to('cpu')
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1
    predicted_abstract_ids = led.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length=512)
    return tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)

def main():
    st.title("YouTube Video Summary Generator")

    # Get YouTube video URL from the user
    video_url = st.text_input("Enter YouTube Video URL:")

    if st.button("Generate Summary"):
        if video_url:
            st.info("Please wait. Generating transcript...")
            transcript = get_transcript(video_url)
            if transcript:
                st.success("Transcript Generated Successfully!")
                st.write("Transcript:")
                st.write(transcript)
                sum = summarize(transcript)
                if sum:
                  st.success("Summary generated successfully!")
                  st.write('Summary of the video: ')
                  st.write(sum)

            else:
                st.error("Error fetching transcript. Please check the video URL.")
        else:
            st.warning("Please enter a valid YouTube Video URL.")

if __name__ == "__main__":
    main()

Writing transcript.py


In [36]:
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 3.694s

3 packages are looking for funding
  run `npm fund` for details

found 1 [93mmoderate[0m severity vulnerability
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

In [37]:
import urllib
print("Password/Enpoint IP for localtunnel is:",urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip("\n"))

Password/Enpoint IP for localtunnel is: 34.142.236.28


In [38]:
!streamlit run transcript.py &>/content/logs.txt &

In [None]:
!npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 1.176s
your url is: https://eighty-kings-turn.loca.lt
