In [58]:
!pip install youtube_transcript_api



In [65]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
nltk.download('punkt_tab')
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [66]:
link = "https://www.youtube.com/watch?v=Y8Tko2YC5hA"
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)
subtitle = " ".join([x['text'] for x in sub])

In [68]:
from nltk.tokenize import sent_tokenize
subtitle = subtitle.replace("n","")
sentences = sent_tokenize(subtitle)

In [69]:
organized_sent = {k:v for v,k in enumerate(sentences)}
tf_idf = TfidfVectorizer(min_df=2,
                                    strip_accents='unicode',
                                    max_features=None,
                                    lowercase = True,
                                    token_pattern=r'w{1,}',
                                    ngram_range=(1, 3),
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True,
                                    stop_words = 'english')

In [70]:
import numpy as np

sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()


In [72]:
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]

In [73]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)


In [74]:
summary

"You ca also use Pytho to build  web, mobile ad desktop applicatios as well as software  testig or eve hackig. Let's say we wat to extract the first three  letters of the text Hello World. It's cross platform which meas  you ca build ad ru Pytho applicatios o Widows, Mac,  ad Liux."

In [75]:
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

In [76]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # Checks if GPU support is enabled

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')


2.5.1+cu124
False


In [77]:
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [78]:
outputs_tensor = model.generate(input_tensor, max_length=160, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
outputs_tensor

tensor([[    2,     0,   510, 20436,   139,    16,     5,  1437,   232,    18,
          6273,  1733,  1023,  2329,   144,  1406,   586,   119,  1023,   784,
         11993,  1580,     4,    85,    16,   341,    30,  2257,  5485,     6,
         45324,  5003,     6,   414,    10,  7776,  5019,     6,  2850,  5810,
          1952,     6,  7678,   995,  2923,     6,  4400,  6014,  1023,   364,
         25004,   268,     6,  2329, 15330,  1159,     4,   590, 46439,   139,
            47,    64,  6136,  2632,  1272,   939,   540,    86,    19,  4163,
          5738,     9,  3260,     4,   370,    64,    67,   304, 46439,   139,
             7,  1119,  1437,  3748,     6,  1830,  2329, 14050, 40967,   415,
          4544,    25,   157,    25,  2257,  1437,  1296,  1023,    50, 15330,
         14157,  1023,     4,    85,    18,    10,   182, 21422,   906, 16708,
           352, 28644,   119,  1023,  1437,   784, 11993,  3443,     4,   407,
            82,    31, 47850,    90, 41352,   918,  

In [79]:
print(tokenizer.decode(outputs_tensor[0]))

</s><s>Pytho is the  world's fastest growig ad most popular programmig laguage. It is used by software developers, mathematicias, data aalysts, scietists, accoutats, etworkig egieers, ad eve kids. With Pytho you can solve complex problems i less time with fewer lies of code. You can also use Pytho to build  web, mobile ad desktop applicatios as well as software  testig or eve hackig. It's a very begier friedlyprogrammig  laguages. So people from differet disciplies use it for a variety of tasks.</s>


In [80]:
import textwrap
summary =tokenizer.decode(outputs_tensor[0])
wrapped_text = textwrap.fill(summary, width=50)  # 50 characters per line
print(wrapped_text)

</s><s>Pytho is the  world's fastest growig ad
most popular programmig laguage. It is used by
software developers, mathematicias, data aalysts,
scietists, accoutats, etworkig egieers, ad eve
kids. With Pytho you can solve complex problems i
less time with fewer lies of code. You can also
use Pytho to build  web, mobile ad desktop
applicatios as well as software  testig or eve
hackig. It's a very begier friedlyprogrammig
laguages. So people from differet disciplies use
it for a variety of tasks.</s>


In [29]:
from transformers import pipeline

In [30]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu
