In [12]:
from transformers import BartForConditionalGeneration, BartTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transcriptor import get_transcript
import math
import torch
import textwrap
from tqdm import tqdm
import re


In [13]:
t = get_transcript('https://www.youtube.com/watch?v=0QTULO9R83E')
print('Transcription Completed!')

[youtube] Extracting URL: https://www.youtube.com/watch?v=0QTULO9R83E
[youtube] 0QTULO9R83E: Downloading webpage
[youtube] 0QTULO9R83E: Downloading ios player API JSON
[youtube] 0QTULO9R83E: Downloading mweb player API JSON
[youtube] 0QTULO9R83E: Downloading m3u8 information
[info] 0QTULO9R83E: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of   36.37MiB in 00:00:01 at 20.51MiB/s    
[ExtractAudio] Destination: audio.mp3
Deleting original file audio.webm (pass -k to keep)




Transcription Completed!


In [14]:
t_wrap = textwrap.fill(t, width=150)
print(t_wrap)

 I had a dream which was not all a dream. The bright sun was extinguished and the stars did wander darkling in the eternal space, rayless and
pathless, and the icy earth swung blind and blackening in the moonless air. Morn came and went and came and brought no day, and all hearts were
chilled into a selfish prayer for light, and men were gathered round their blazing homes to look once more into each other's face. Happy were those
who dwelt within the eye of the volcanoes and their mountain torch. A fearful hope was all the world contained. Forests were set on fire, but hour by
hour they fell and faded, and the crackling trunks extinguished with a crash, and all was black. It was the disaster that shook the 1930s. One of
America's finest and newest ocean liners had set to sea. Aboard were hundreds of holidaymakers and families, newlyweds and businessmen all coming back
from Cuba. The voyage had been routine, fun, but then a shock. The captain, a well-liked veteran of the sea, died sudde

In [15]:
# 'facebook/bart-large-cnn'
# 'knkarthick/MEETING_SUMMARY'
# 'suriya7/bart-finetuned-text-summarization'


def summarize(transcript, percent_len=.5):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # model_name = 'sshleifer/distilbart-cnn-12-3'
    # model_name = 'facebook/bart-large-cnn'
    # model_name = 'knkarthick/MEETING_SUMMARY'
    model_name = 'suriya7/bart-finetuned-text-summarization'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
    
    # transcript = get_transcript(url) 
    
    target_char_len = int(len(transcript) * percent_len)
    curr_char_len = len(transcript)
    
    print(f'Transcript Length:\t{len(transcript)}\tTarget Char Length:\t{target_char_len}')
      
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=800 * .1
    )

    chunks = text_splitter.split_text(transcript)
    print(f'Num Initial Chunks:\t{len(chunks)}')
    
    count = 0
    while curr_char_len > target_char_len:
        chunk_summaries = []
        
        # linear
        # scaling_factor = (curr_char_len - target_char_len) / target_char_len
        # exponential
        # scaling_factor = math.exp(-0.05 * (curr_char_len - target_char_len) / target_char_len)
        
        # sigmoid
        k = 1.25  # sharp transition
        scaling_factor = 1 / (1 + math.exp(-k * (curr_char_len - target_char_len) / target_char_len))
        
        min_max_len = 80
        min_min_len = 50
        
        max_length = max(min_max_len, int(300 - scaling_factor * (300 - min_max_len)))  # Ensure max_length doesn't go below 100
        min_length = max(min_min_len, int(150 - scaling_factor * (150 - min_min_len))) 
        

        print(f'Max Length:\t{max_length}\tMin Length:\t{min_length}\tScaling Factor:\t{round(scaling_factor, 2)}')
        
        for chunk in tqdm(chunks):
            inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
            outputs = model.generate(
                inputs['input_ids'],
                max_length=max_length, 
                min_length=min_length, 
                length_penalty=2, 
                num_beams=6,
                no_repeat_ngram_size=3
            )
            chunk_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

            chunk_summaries.append(chunk_summary)
            
        summary = ''.join([summary for summary in chunk_summaries])
        summary = re.sub(r'\s+\.', '.', summary)
        curr_char_len = len(summary)
        print(f'Curr Char Length:\t{curr_char_len}')
        
        count += 1
        if curr_char_len <= target_char_len or (target_char_len * .9) <= curr_char_len <= (target_char_len * 1.1):
            summary = re.sub(r'\.(\S)', r'. \1', summary)
            print(f'Output Char Length:\t{curr_char_len}')
            return summary
        
        chunks = text_splitter.split_text(summary)
        print(f'Num chunks after {count} iterations:\t{len(chunks)}\tCurr Char Length:\t{curr_char_len}\tTarget Char Length:\t{target_char_len}')

In [16]:
final_output = summarize(t, .5)

Transcript Length:	34733	Target Char Length:	17366
Num Initial Chunks:	49
Max Length:	128	Min Length:	72	Scaling Factor:	0.78


100%|██████████| 49/49 [01:25<00:00,  1.74s/it]

Curr Char Length:	15754
Output Char Length:	15754





In [17]:
text = textwrap.fill(final_output, width=150)
print(text)

"I had a dream which was not all a dream. The bright sun was extinguished and the stars did wander darkling in the eternal space, rayless and
pathless, and the icy earth swung blind and blackening in the moonless air," writes the author, "and all was black, and men were gathered round their
blazing homes to look once more into each other's face". The SS Morrow Castle was one of the most famous ships in the history of the world when it was
destroyed by fire in the early hours of the morning on the morning of December 31, 1969, off the coast of New York City, New York. The ship's captain
had been killed in a fire that destroyed the ship, and the passengers were left to burn to death. It's been 90 years since the tragic sinking of the
ship the Morrow Castle off the coast of New York City, but the story of what really happened on board is still being pieced together by experts in the
field of marine archaeology and shipwreck research, and it's time to find out if it really was an accident 