In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transcriptor import get_transcript
import math
import torch
import textwrap
from tqdm import tqdm
import re 


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch the transcript for the video under study; `t` is consumed by the cells below.
video_url = 'https://www.youtube.com/watch?v=u_6NUXKe65A'
t = get_transcript(video_url)
print('Transcription Completed!')

[youtube] Extracting URL: https://www.youtube.com/watch?v=u_6NUXKe65A
[youtube] u_6NUXKe65A: Downloading webpage
[youtube] u_6NUXKe65A: Downloading ios player API JSON
[youtube] u_6NUXKe65A: Downloading mweb player API JSON
[youtube] u_6NUXKe65A: Downloading m3u8 information
[info] u_6NUXKe65A: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of   14.63MiB in 00:00:01 at 11.02MiB/s    
[ExtractAudio] Destination: audio.mp3
Deleting original file audio.webm (pass -k to keep)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Transcription Completed!


In [None]:
# Preview the raw transcript, hard-wrapped at 150 columns so it is readable in the output area.
wrapped_lines = textwrap.wrap(t, width=150)
t_wrap = '\n'.join(wrapped_lines)
print(t_wrap)

 Hi, this is Matt Baker. Every now and then on this channel, I like to take a look at a really old chart rather than one that was designed on a
computer. Personally, I find these vintage charts beautiful and there's often things that we can still learn from them even if they are out of date.
So far, we've looked at Adam's Synchronological Chart of History, The Bird's Eye View of the Life of Christ, and The Histamap of Religion by John B.
Sparks. Well, today, since the US election is coming up, I thought I'd take a look at this timeline of US political parties. It was published back in
1880 as a fold-out chart in this book called Conspectus of the History of Political Parties and the Federal Government by Walter R. Houghton of
Indiana University. Of course, nowadays US politics is dominated by the Republicans and and the Democrats. But this was not always the case. Prior to
the emergence of the current parties, there were actually several other older parties such as the Federalists, the

In [None]:
# Candidate summarization checkpoints (alternative values for `model_name` below):
# 'facebook/bart-large-cnn'
# 'knkarthick/MEETING_SUMMARY'
# 'suriya7/bart-finetuned-text-summarization'


def _generation_lengths(curr_char_len, target_char_len):
    """Pick the ``max_length`` / ``min_length`` generation bounds for one pass.

    A sigmoid of the relative overshoot ``(curr - target) / target`` is used as
    a scaling factor in (0, 1): the further the current text is above the
    target, the closer the factor gets to 1 and the smaller both bounds become
    (down to floors of 80 and 50 respectively).

    Returns:
        tuple: ``(max_length, min_length, scaling_factor)``.
    """
    k = 1.25  # sigmoid steepness
    # max(..., 1) avoids ZeroDivisionError when the target rounds down to 0.
    overshoot = (curr_char_len - target_char_len) / max(target_char_len, 1)
    scaling_factor = 1 / (1 + math.exp(-k * overshoot))

    min_max_len = 80  # floor for max_length
    min_min_len = 50  # floor for min_length
    max_length = max(min_max_len, int(300 - scaling_factor * (300 - min_max_len)))
    min_length = max(min_min_len, int(150 - scaling_factor * (150 - min_min_len)))
    return max_length, min_length, scaling_factor


def summarize(transcript: str, percent_len: float = .5,
              model_name: str = 'suriya7/bart-finetuned-text-summarization') -> str:
    """Iteratively summarize ``transcript`` down to a fraction of its length.

    The text is split into ~800-character chunks, each chunk is summarized with
    a BART checkpoint, and the chunk summaries are joined. The joined summary
    is re-chunked and summarized again until its character count is at most
    110% of the target.

    Args:
        transcript: Text to summarize.
        percent_len: Target length as a fraction of ``len(transcript)``.
            Must be positive.
        model_name: Checkpoint passed to ``from_pretrained``. Other
            checkpoints previously tried in this notebook:
            'sshleifer/distilbart-cnn-12-3', 'facebook/bart-large-cnn',
            'knkarthick/MEETING_SUMMARY'.

    Returns:
        The summary text. The transcript itself is returned unchanged when it
        is already within the target length (e.g. ``percent_len >= 1`` or an
        empty transcript). If a pass fails to shorten the text, the most recent
        summary is returned even though it is still above the target.

    Raises:
        ValueError: If ``percent_len`` is not positive.
    """
    if percent_len <= 0:
        raise ValueError(f'percent_len must be positive, got {percent_len!r}')

    curr_char_len = len(transcript)
    target_char_len = int(curr_char_len * percent_len)

    print(f'Transcript Length:\t{curr_char_len}\tTarget Char Length:\t{target_char_len}')

    # Already short enough: return before paying for the model load.
    # (Previously this path fell off the end of the function and returned None.)
    if curr_char_len <= target_char_len:
        print(f'Output Char Length:\t{curr_char_len}')
        return transcript

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

    chunk_size = 800
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,  # integer overlap (was the float 80.0)
    )

    chunks = text_splitter.split_text(transcript)
    print(f'Num Initial Chunks:\t{len(chunks)}')

    count = 0
    while True:
        max_length, min_length, scaling_factor = _generation_lengths(curr_char_len, target_char_len)
        print(f'Max Length:\t{max_length}\tMin Length:\t{min_length}\tScaling Factor:\t{round(scaling_factor, 2)}')

        chunk_summaries = []
        for chunk in tqdm(chunks):
            inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                min_length=min_length,
                length_penalty=2,
                num_beams=6,
                no_repeat_ngram_size=3
            )
            chunk_summaries.append(tokenizer.decode(outputs[0], skip_special_tokens=True).strip())

        prev_char_len = curr_char_len
        # Join with a space so adjacent chunk summaries do not run together.
        # This replaces the old "add a space after every period" fix-up, which
        # also split abbreviations and decimals ("U.S." -> "U. S.").
        summary = ' '.join(chunk_summaries)
        summary = re.sub(r'\s+\.', '.', summary)  # drop stray whitespace before periods
        curr_char_len = len(summary)
        print(f'Curr Char Length:\t{curr_char_len}')

        count += 1
        # At or below target, or within +10% of it: good enough.
        if curr_char_len <= target_char_len * 1.1:
            print(f'Output Char Length:\t{curr_char_len}')
            return summary

        # A pass that does not shrink the text would repeat forever; stop here.
        if curr_char_len >= prev_char_len:
            print(f'No further reduction after {count} iterations; returning current summary.')
            print(f'Output Char Length:\t{curr_char_len}')
            return summary

        chunks = text_splitter.split_text(summary)
        print(f'Num chunks after {count} iterations:\t{len(chunks)}\tCurr Char Length:\t{curr_char_len}\tTarget Char Length:\t{target_char_len}')

In [None]:
# Condense the transcript to roughly half of its original character count.
final_output = summarize(transcript=t, percent_len=0.5)

Transcript Length:	15913	Target Char Length:	7956
Num Initial Chunks:	22
Max Length:	128	Min Length:	72	Scaling Factor:	0.78


100%|██████████| 22/22 [00:28<00:00,  1.29s/it]

Curr Char Length:	7714
Output Char Length:	7714





In [None]:
# Show the final summary wrapped at the same 150-column width as the transcript preview.
summary_wrapper = textwrap.TextWrapper(width=150)
text = summary_wrapper.fill(final_output)
print(text)

It's been a while since we've looked at a vintage chart, but it's time to get back to the old school again with a look at the US political party chart
from the 1880s, which was published in the book Conspectus of the History of Political Parties and the Federal Government by Walter R. Houghton of
Indiana University. The US has always had two main political parties, the Republicans and the Democrats, but there have been several other older
parties as well, including the Whigs, the Federalists, the Democratic Republicans, and the Whig Party, according to a new study by Professor J. R.
Houghton of Indiana University, who has been studying the history of US politics. During the American Revolution, the Whigs and the Tories were the
two main political parties in the United States, with the Whig party being the party that supported the monarchy and the loyalist Tories those who
opposed it, but the two parties have been at odds for much of the history of the country since the end of the Revol