# Constants

In [1]:
class Parameters():
    """Namespace of pipeline constants; attributes are read directly from the class (e.g. `Parameters.CONTEXT_LENGTH`)."""
    batch_size:int=16 # NOTE(review): not referenced in the visible code - confirm where it is consumed
    EMBEDDING_MODEL:str="text-embedding-ada-002"
    COMPLETION_MODEL:str="gpt-3.5-turbo-16k-0613"
    AVERAGE_TOPIC_WORDS:int=500 # NOTE(review): presumably a target topic size in words - not used in the visible code
    CONTEXT_LENGTH:int=16000 # The permitted context length in tokens
    SUMMARIZE_TOPIC_TOKENS:int=14000 # Token budget for a topic that is about to be summarized
    PHASE_1_MODEL:str="gpt-3.5-turbo-0613"
    PHASE_1_CHUNK_LEN:int=1500 # Number of tokens
    PHASE_1_PARA_LEN:int=200 # Number of tokens per paragraph
    PHASE_1_PARAGRAPH_NUM:int=PHASE_1_CHUNK_LEN//PHASE_1_PARA_LEN # Minimum paragraph count requested by the cleanup prompt (1500 // 200 = 7 with the values above)
    PHASE_1_CHUNK_LEN_IN_WORD:int=0 # Number of words, updated by function `get_phase_1_chunks`
    PHASE_1_CHUNK_OVERLAY:int=30 # Number of words shared by two consecutive phase-1 chunks
    PHASE_1_CONTEXT_LENGTH:int=4096 # Phase 1 is for cleanup text. It might use a different model
    TOPIC_SUMMARY_LENGTH:int=800 # The length of topic summary in words
    FINAL_SUMMARY_LENGTH:int=1200 # The length of final summary in words


## Little utilities

In [2]:
import time, re, os, json
import asyncio
import concurrent.futures

from typing import List, Tuple, Dict

from openaikey import OPENAI_API_KEY
import openai
openai.api_key = OPENAI_API_KEY

import random

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
import tiktoken

# llm = ChatOpenAI(model_name=Parameters.COMPLETION_MODEL)

In [3]:
def split_text(txt:str)->List[str]:
    """Split `txt` into a list of "words".

    Characters are classified by code point:
      * 0x21-0x7E and 0x80-0xFE are word characters and accumulate into the current word;
      * code points above 0xFF (e.g. CJK) each become a one-character word of their own;
      * everything else (0x00-0x20, 0x7F, 0xFF) is a separator that ends the current word.

    Returns:
        The words in order of appearance; an empty list for empty or separator-only input.
    """
    words = []
    current = ''
    for c in txt:
        code = ord(c)
        if code > 0xFF:
            # A wide character terminates any pending word and stands alone.
            if current:
                words.append(current)
                current = ''
            words.append(c)
        elif 0x21 <= code <= 0x7E or 0x80 <= code <= 0xFE:
            current += c
        elif current:
            # Separator: flush the pending word (consecutive separators are ignored).
            words.append(current)
            current = ''
    if current:
        words.append(current)
    return words

def join_texts(txt_list:List[str])->str:
    """Join words into one string, the counterpart of `split_text`.

    Consecutive items whose first character has a code point above 0xFF are
    concatenated without a separator; all resulting pieces are then joined
    with a single space. Empty items are skipped.

    Args:
        txt_list: The words to join. A plain string is returned unchanged.

    Returns:
        The joined text ('' for an empty list).
    """
    if isinstance(txt_list, str):
        return txt_list

    pieces = []
    wide_run = ''  # pending run of consecutive wide (code point > 0xFF) items
    for w in txt_list:
        if not w:
            continue
        if ord(w[0]) > 0xFF:
            wide_run += w
            continue
        if wide_run:
            pieces.append(wide_run)
            wide_run = ''
        pieces.append(w)
    if wide_run:
        pieces.append(wide_run)
    return ' '.join(pieces)

# Get the number of words
def strlen(txt:str)->int:
    """Return the number of words in `txt`.

    Each run of characters in 0x21-0x7E / 0x80-0xFE is collapsed to a single
    placeholder, the characters 0x00-0x20 and 0xFF are dropped, and whatever
    remains is counted one unit per character.
    """
    word_run = re.compile(r"[\x21-\x7E\x80-\xFE]+")
    blank = re.compile(r"[\x00-\x20\xFF]")
    return len(blank.sub('', word_run.sub('A', txt)))

# Get the number of tokens
def get_num_tokens(txt:str, model:str=Parameters.COMPLETION_MODEL)->int:
    """Return the number of tokens `txt` encodes to for `model`.

    The tiktoken encoding is looked up once per model name and cached in the
    module-level dict `llm_encodings`, so that a call with a different `model`
    gets that model's encoding rather than whichever one was requested first.
    """
    global llm_encodings
    if 'llm_encodings' not in globals():
        llm_encodings = {}
    encoding = llm_encodings.get(model)
    if encoding is None:
        encoding = tiktoken.encoding_for_model(model)
        llm_encodings[model] = encoding
    return len(encoding.encode(txt))


# Concurrent OpenAI Call

#### Batch generate using OpenAI

```python
def batch_generate(prompts:List[str], batch_size:int=8, temperature:float=0.2)->List[str]:
```

In [4]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

def _num_tokens_from_messages(messages:List[Dict[str, str]], model:str)->int:
    """Returns the number of tokens used by a list of messages.

    Args:
        messages: Chat messages, each a dict of string fields such as
            {"role": ..., "content": ...} (optionally "name").
        model: Model name; must start with "gpt-3.5-turbo" or "gpt-4".

    Returns:
        Tokens of every field value plus the per-message / per-name overhead
        and 3 tokens for the reply priming.

    Raises:
        NotImplementedError: for any other model family.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model.startswith("gpt-3.5-turbo"):
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model.startswith("gpt-4"):
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

def openai_usage_stat()->Dict[str, object]:
    """Aggregate the usage records collected in the global `openai_usages`.

    Returns:
        * When no usage has been recorded: a single zeroed record with the keys
          'object', 'model', 'prompt_tokens', 'completion_tokens', 'total_tokens'
          (this shape differs from the populated case and is kept for compatibility).
        * Otherwise: {"TokenUsage": [...], "Dollar_Usage": float}, where
          "TokenUsage" holds one summed record per distinct (object, model)
          pair in first-seen order, and "Dollar_Usage" is the cost computed
          from the per-1K-token prices hard-coded below for gpt-3.5 chat models.

    Records with a missing 'model'/'object' or missing token counts are
    tolerated: they are grouped under None and contribute 0 to the cost.
    """
    usages = globals().get('openai_usages')
    if not usages:
        return {'object': '', 'model': '', 'prompt_tokens':0, 'completion_tokens': 0, 'total_tokens': 0}

    token_fields = ('prompt_tokens', 'completion_tokens', 'total_tokens')
    totals = {}  # (object, model) -> summed record; single pass instead of re-scanning per group
    cost = 0.
    for usage in usages:
        object_name = usage.get('object')
        model_name = usage.get('model')
        record = totals.get((object_name, model_name))
        if record is None:
            record = {'object': object_name, 'model': model_name}
            record.update({field: 0 for field in token_fields})
            totals[(object_name, model_name)] = record
        for field in token_fields:
            record[field] += usage.get(field) or 0

        # Only chat-completion objects of the gpt-3.5 family are priced.
        if not re.match(r'^chat\.', object_name or '', re.IGNORECASE):
            continue
        prompt_tokens = usage.get('prompt_tokens') or 0
        completion_tokens = usage.get('completion_tokens') or 0
        if re.match(r'^gpt-3.5.*16k.*$', model_name or '', re.IGNORECASE): # Chat model, 16K context
            cost += prompt_tokens * 0.003 / 1000
            cost += completion_tokens * 0.004 / 1000
        elif re.match(r'^gpt-3.5.*$', model_name or '', re.IGNORECASE): # Chat model, 4K context
            cost += prompt_tokens * 0.0015 / 1000
            cost += completion_tokens * 0.002 / 1000

    return {"TokenUsage": list(totals.values()), "Dollar_Usage": cost}

def _get_completion_func(model=Parameters.COMPLETION_MODEL, context_length:int=Parameters.CONTEXT_LENGTH, temperature:float=0.3, request_timeout:int=300): # Andrew mentioned that the prompt/ completion paradigm is preferable for this class
    """Build a callable that sends one prompt to the OpenAI chat completion API.

    Args:
        model: Chat model name.
        context_length: Context window of `model` in tokens; the completion is
            allowed to use whatever the prompt leaves free.
        temperature: Sampling temperature forwarded to the API.
        request_timeout: Request timeout forwarded to the API.

    Returns:
        A function `prompt -> str` returning the content of the first choice.
        It is retried with random exponential backoff (up to 3 attempts) and
        appends the usage of every successful call to the global `openai_usages`.
    """
    # help: https://platform.openai.com/docs/api-reference/chat/create
    global openai_usages
    if 'openai_usages' not in globals():
        openai_usages = []

    MAX_RETRY_TIMES = 3
    @retry(wait=wait_random_exponential(multiplier=1, min=1, max=120), stop=stop_after_attempt(MAX_RETRY_TIMES))
    def _openai_completion(prompt:str):
        openai.api_key = OPENAI_API_KEY
        system_content = "You're a meticulous and careful AI assistant who pays extreme attention to details and does not overlook any important information, especially for technical steps."
        # NOTE(review): the system message is sent AFTER the user message; the order is kept as it was - confirm it is intentional.
        messages = [{"role": "user", "content": prompt}, {"role": "system", "content": system_content},]
        num_prompt_tokens = _num_tokens_from_messages(messages, model)
        # Give the completion the remainder of the context window.
        max_tokens=context_length - num_prompt_tokens
        print(f"Inside _get_completion_func: num_prompt_tokens = {num_prompt_tokens:,d}, max_tokens = {max_tokens:,d}, model = {model}, context_length = {context_length:,d}")
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens, # do we really need this "max_tokens"?
            request_timeout=request_timeout,
            temperature=temperature, # this is the degree of randomness of the model's output
        )
        usage = response.get('usage')
        if usage:
            # Tag the usage record with model/object so `openai_usage_stat` can group and price it.
            usage = dict(usage)
            usage["model"] = response.get('model')
            usage["object"] = response.get('object')
            openai_usages.append(usage)
        return response.choices[0].get('message').get('content')
    return _openai_completion


def _batch_generate(prompts:List[str], batch_size:int=4, temperature:float=0.3, model:str=Parameters.COMPLETION_MODEL, context_length:int=Parameters.CONTEXT_LENGTH, request_timeout:int=300):
    """Run every prompt through the chat model, `batch_size` prompts at a time.

    Each batch is submitted to its own thread pool; leaving the `with` block
    waits for that batch to finish before the next one starts.

    A prompt whose token/word ratio exceeds 1.7 is sent to
    `Parameters.COMPLETION_MODEL` with `Parameters.CONTEXT_LENGTH` instead of
    the requested `model` / `context_length`.

    Returns:
        The completions, in the same order as `prompts`.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (it previously looped forever).
        Any exception raised by a completion call is re-raised when its result is collected.
    """
    if not prompts:
        return []
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    futures = []
    for start_pos in range(0, len(prompts), batch_size):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for prompt in prompts[start_pos:start_pos+batch_size]:
                num_words = strlen(prompt)
                num_prompt_tokens = get_num_tokens(prompt)
                # `num_words > 0` guards the ratio against a prompt without any word.
                if num_words > 0 and num_prompt_tokens / num_words > 1.7: # We have too much tokens from the words and thus we need to turn to the model with longer context
                    print(f"num_prompt_tokens / num_words = {num_prompt_tokens / num_words:.2f}, let's switch to a model with longer context!!")
                    completion_func = _get_completion_func(model=Parameters.COMPLETION_MODEL, temperature=temperature, request_timeout=request_timeout, context_length=Parameters.CONTEXT_LENGTH)
                else:
                    completion_func = _get_completion_func(model=model, temperature=temperature, request_timeout=request_timeout, context_length=context_length)
                futures.append(executor.submit(completion_func, prompt))

    return [future.result() for future in futures]

def batch_cleanup(chunks:List[str], temperature:float=0.1, batch_size:int=4, model:str=Parameters.PHASE_1_MODEL, context_length:int=Parameters.PHASE_1_CONTEXT_LENGTH):
    """Ask the model to punctuate, spell-correct and paragraph-split every chunk.

    One prompt is built per chunk (the chunk is wrapped in triple backticks)
    and the whole list is handed to `_batch_generate`.
    """
    #   Divide it into at least {strlen(chunk)//300 + (1 if strlen(chunk) % 300 != 0 else 0):d} sentimentally coherent sections, using "\n\n" as a delimiter. Translate the text into English, unless it's already in English or Chinese.
    prompts = []
    for raw_text in chunks:
        # The whitespace between the instruction and the fenced text is spelled out explicitly.
        prompts.append(
            f"""Please add (or update with) proper punctuations, like ',', '.', '?', to the following text. Correct obvious spelling or grammatic errors as well if possible. At the same time, divide the text into at least {Parameters.PHASE_1_PARAGRAPH_NUM:d} sentimentally coherent paragraphs, using "\n\n" as a delimiter. You should not give any extra comment."""
            "\n    \n"
            f"    ```{raw_text}```"
        )
    return _batch_generate(prompts, temperature=temperature, batch_size=batch_size, model=model, context_length=context_length)


def batch_summary(topic_groups:List[str], temperature:float=0.3, batch_size:int=4, summary_length:int=Parameters.TOPIC_SUMMARY_LENGTH, model:str=Parameters.COMPLETION_MODEL, context_length:int=Parameters.CONTEXT_LENGTH):
    """Request a verbose summary for every topic group.

    The requested length is `summary_length` words, capped at half of the
    article's own word count and never below 1. A falsy `model` falls back to
    `Parameters.COMPLETION_MODEL`.
    """
    if not model:
        model = Parameters.COMPLETION_MODEL
    print(f"batch_summary model = {model}")

    prompts = []
    for article in topic_groups:
        target_words = max(1, min(summary_length, int(strlen(article)*0.5)))
        prompts.append(
            f"```{article}```\n\n"
            f"You task is to give a verbose summary to the above article. The summary should be about {target_words}-word-long in multi-paragraphs "
            f"Summarize the above article. The summary should be about {target_words}-word-long and should contain sufficient information of each key points. "
            "Do not miss any important technical details (like formulas, algorithms, reasoning steps, etc) or subtle nuances presented in the source material.\n"
        )

    return _batch_generate(prompts, temperature=temperature, batch_size=batch_size, model=model, context_length=context_length)

def batch_rephrase(topic_groups:List[str], temperature:float=0., batch_size:int=1, model:str=Parameters.COMPLETION_MODEL, context_length:int=Parameters.CONTEXT_LENGTH):
    """Ask the model to rewrite each text; one prompt per item, run through `_batch_generate`."""
    # prompts = [f"""Please rewrite the following text. Use English to rewrite if the provided text is not Chinese:
    prompts = []
    for source_text in topic_groups:
        prompts.append(f"Please rewrite the following text:\n    ```\n{source_text}\n```")
    return _batch_generate(prompts, temperature=temperature, batch_size=batch_size, model=model, context_length=context_length)

def batch_highlights(topic_groups:List[str], temperature:float=0.5, batch_size:int=1, num_highlights:int=7, model:str=Parameters.COMPLETION_MODEL, context_length:int=Parameters.CONTEXT_LENGTH):
    """Request a short synopsis plus emoji-prefixed key points for every text.

    Args:
        topic_groups: The texts to highlight; one prompt is built per item.
        num_highlights: Maximum number of bullet points requested in the prompt.
        model: Chat model; a falsy value falls back to `Parameters.COMPLETION_MODEL`.

    Returns:
        The model outputs, one per input text, as returned by `_batch_generate`.
    """
    model = model if model else Parameters.COMPLETION_MODEL
    print(f"batch_highlights model = {model}")
    # Each prompt embeds its own text (`topic_group`); the previous code interpolated the
    # whole `topic_groups` list, so every prompt carried the repr of all texts.
    prompts = [f"""Guidelines: Please proceed in the following manner for your output:

#### Short Synopsis
#### Key Points
- [Emoji] Concise Bulletpoint

Your mission is to create an English summary of the text I've given you, delimited by ```, with a maximum of {num_highlights} succinct bullet points, prefixed by a proper emoji.

```{topic_group}```
""" for topic_group in topic_groups]
    return _batch_generate(prompts, temperature=temperature, batch_size=batch_size, model=model, context_length=context_length)

In [5]:
# final_summary = ['''In this article, the author interviews Jared Zonnerak, the co-founder of PromptLayer, a collaborative platform for teams and individuals to track, debug, and explore their language model application requests. The conversation revolves around the common denominator that all language model practitioners share: the prompt. Zonnerak explains that prompt engineering is a skill set that involves tinkering with the prompt to optimize the output of language models. He believes that prompt engineering is not just for engineers, but also for product managers and non-technical stakeholders.\n\nPromptLayer was launched in January and has since gained popularity among developers and teams. The platform provides a one-line code setup that allows users to track and analyze their logs, save prompt templates, and gain insights through analytics. It caters to both indie developers and teams, with features specifically designed for collaboration between technical and non-technical stakeholders.\n\nZonnerak discusses the journey of PromptLayer since its launch, highlighting the feature updates and the recent addition of support for the anthropic language model. The company has also raised funding to support its growth and is currently looking to hire a founding engineer.\n\nLooking ahead, Zonnerak shares that PromptLayer\'s roadmap includes focusing on improving the product for both hackers and teams. They aim to build workflows that allow teams to work on prompts collaboratively and efficiently. They are also working on features like unit testing and backtesting prompts, as well as A/B testing capabilities.\n\nThe conversation then shifts to the challenges of being a founder in the AI space and dealing with the constant influx of news and information. Zonnerak acknowledges the noise but emphasizes the importance of staying rooted in the needs and feedback of actual users. 
He believes that prompt engineering is a skill set that will continue to evolve and become more important in the future, with product managers potentially taking ownership of it.\n\nWhen asked about prominent prompt engineers in the industry, Zonnerak explains that prompt engineering is still a relatively new field and there are no defined experts or certifications. He suggests following builders and developers who are actively shipping products rather than those who only talk about prompt engineering. He also mentions the dynamics of how teams interact with prompts, explaining that before using PromptLayer, teams often relied on tools like Google Docs or databases to manage prompts.\n\nThe conversation then delves into the future of prompting, with Zonnerak sharing his belief that prompts will always be necessary, even as language models become more intelligent. He argues that humans themselves rely on prompts when communicating with each other, and prompts serve as a starting point for language models. He also highlights the importance of other variables in prompt engineering, such as the choice of model, temperature, and user segmentation.\n\nZonnerak recommends practicing prompt engineering by playing around with language models in playgrounds like OpenAI\'s playground. He also suggests reading Stephen Wolfram\'s articles on language models to gain a better understanding of the underlying technology.\n\nThe article concludes with Zonnerak discussing the serendipitous moments and challenges of attending events in the AI space. He mentions some under-the-radar companies he is excited about, such as Great for code refactoring and DeepTune for podcast dubbing AI. Zonnerak also shares his love for books and recommends Nassim Nicholas Taleb\'s "Antifragile" and "Skin in the Game."\n\nIn closing, Zonnerak advises developers and entrepreneurs to "just ship it" and not spend too much time strategizing. 
He encourages early product releases and invites readers to check out PromptLayer and connect with him on Twitter or via email.''']

# temperature = 0.0
# num_highlights = 5

# final_highlight = batch_highlights(final_summary, temperature=temperature, num_highlights=num_highlights, model=Parameters.PHASE_1_MODEL, context_length=Parameters.PHASE_1_CONTEXT_LENGTH)
# print(final_highlight[0])

# Prepare Data

## From Youtube

In [6]:
from youtube_transcript_api import YouTubeTranscriptApi
from typing import List, Dict

In [7]:
def get_transcript(video_id: str, languages:List[str]=None)->str:
    """Fetch the transcript of a YouTube video and return it as one text.

    Args:
        video_id: The YouTube video id.
        languages: Preferred transcript languages passed to the transcript API;
            defaults to ['en', 'de', 'zh'].

    Returns:
        The 'text' fields of all transcript entries joined with `join_texts`.
    """
    if languages is None:
        # Built per call so the default list is never shared between calls.
        languages = ['en', 'de', 'zh']
    results = YouTubeTranscriptApi.get_transcripts([video_id], languages=languages)[0][video_id]
    return join_texts([sub['text'] for sub in results])

## From text file

In [8]:
def get_txt_from_file(txt_path:str, encoding:str="utf-8")->str:
    """Return the whole content of the text file at `txt_path`.

    Args:
        txt_path: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            result does not depend on the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid in `encoding`.
    """
    with open(txt_path, 'r', encoding=encoding) as f:
        return f.read()

## Cleanup text

### Phase 1 chunks

In [9]:
def get_phase_1_chunks(txt:str, end_pos:int=None)->List[str]:
    """Cut `txt[:end_pos]` into overlapping word-based chunks for the cleanup phase.

    The number of chunks is derived from the token count
    (`Parameters.PHASE_1_CHUNK_LEN` tokens per chunk), but the cutting itself is
    done on words because text is not easy to split by token. Every chunk holds
    `chunk_size` words plus up to `Parameters.PHASE_1_CHUNK_OVERLAY` words that
    are repeated at the start of the next chunk.

    Side effect:
        Stores the computed chunk size (in words) in
        `Parameters.PHASE_1_CHUNK_LEN_IN_WORD`.

    Returns:
        The chunks in document order (an empty list for text without words).
    """
    text = txt[:end_pos]
    txt_word_list = split_text(text)
    # Count words from the very list that is sliced below, so the loop bound and the slices always agree.
    num_words = len(txt_word_list)
    num_tokens = get_num_tokens(text)
    num_chunk = num_tokens // Parameters.PHASE_1_CHUNK_LEN + 1 # use token to calculate the number of chunks
    chunk_size = num_words // num_chunk + 1 # use word number to really calculate chunk size (not easy to split text via token)
    Parameters.PHASE_1_CHUNK_LEN_IN_WORD = chunk_size
    print(f"Total words: {num_words:,d} ({num_tokens:,d} tokens), chunk_size = {chunk_size:,d} words")

    phase_1_chunk_list = []
    for start_pos in range(0, num_words, chunk_size):
        chunk = join_texts(txt_word_list[start_pos:start_pos + chunk_size + Parameters.PHASE_1_CHUNK_OVERLAY])
        phase_1_chunk_list.append(chunk)
        print(f"strlen(phase_1_chunk_list[-1]) = {strlen(chunk):,d}, from {start_pos:,d} to {start_pos + strlen(chunk):,d}")
    return phase_1_chunk_list

### Get chunks


In [10]:
# cleaned_chunk_list = ['aaaa als. ', 'asdkw ', 'sldf 2.' , 'als 2,3. ', 'sal ab.', 'aserl s2. 332aaaa.', 'aserl s2. 223 aaaa.']
# [c.strip()+"\n\n" if re.match(r'.*[\s]+[^0-9]+\.[\s]*$', c) else c.strip() for c in cleaned_chunk_list]

In [11]:
def _split_large_chunks(cleaned_chunk_list0:List[str])->List[str]:
    """Break over-long paragraphs into pieces of roughly `Parameters.PHASE_1_PARA_LEN` tokens.

    Every chunk is split into paragraphs on newlines. A paragraph longer than
    twice `PHASE_1_PARA_LEN` tokens is split on '.' and its sentences are
    regrouped greedily: a group is emitted as soon as adding the next sentence
    would reach `PHASE_1_PARA_LEN` tokens. The last (possibly short) group is
    merged into the previously emitted item when their combined size stays
    below 1.6 * `PHASE_1_PARA_LEN` tokens.

    Finally every item is stripped; items that look like they end a sentence
    (whitespace, a run of non-digit characters, then '.') get a trailing
    "\\n\\n" appended as a paragraph marker.

    Returns:
        The flattened list of paragraph-sized texts.
    """
    para_len = Parameters.PHASE_1_PARA_LEN
    cleaned_chunk_list = []
    for chunk in cleaned_chunk_list0:
        for p in re.split(r'[\n]+', chunk): # p: paragraph
            n_tokens = get_num_tokens(p)
            if n_tokens <= int(para_len * 2):
                cleaned_chunk_list.append(p)
                continue

            n_paraph = n_tokens // para_len + 1
            print(f"We're going to split this sub-chunk as it has {n_tokens:,d} tokens, larger than {int(Parameters.PHASE_1_PARA_LEN * 2):,d}. Words number: {strlen(p)}. It'll be split to {n_paraph} chunks.")
            sub_chunk = []
            for s in re.split(r'[.]', p):
                if s=='':
                    continue
                s += "."
                sub_chunk_text = ''.join(sub_chunk)
                # `sub_chunk` must be non-empty: a first sentence that is already too long
                # used to flush an empty string into the result.
                if sub_chunk and get_num_tokens(sub_chunk_text)+get_num_tokens(s)>=para_len:
                    cleaned_chunk_list.append(sub_chunk_text)
                    sub_chunk = []
                sub_chunk.append(s)
            if len(sub_chunk)>0:
                sub_chunk_text = ''.join(sub_chunk)
                # NOTE(review): the previous item may come from an earlier paragraph or chunk - confirm merging across that boundary is intended.
                if len(cleaned_chunk_list)>0 and get_num_tokens(sub_chunk_text) + get_num_tokens(cleaned_chunk_list[-1]) < int(para_len*1.6):
                    cleaned_chunk_list[-1] = cleaned_chunk_list[-1] + sub_chunk_text
                else:
                    cleaned_chunk_list.append(sub_chunk_text)

    cleaned_chunk_list = [c.strip()+"\n\n" if re.match(r'.*[\s]+[^0-9]+\.[\s]*$', c) else c.strip() for c in cleaned_chunk_list]
    print("We've split large chunks, by words, from:", [strlen(c) for c in cleaned_chunk_list0], "to:", [strlen(c) for c in cleaned_chunk_list], 'by tokens, from:', [get_num_tokens(c) for c in cleaned_chunk_list0], "to:", [get_num_tokens(c) for c in cleaned_chunk_list])
    return cleaned_chunk_list

# Remove the overlap between consecutive cleaned chunks, then regroup the text into paragraph-level chunks
def get_chunks(cleaned_chunk_list:List[str], overlapped:int)->Tuple[List[str], List[str]]:
    """Remove the overlap between consecutive chunks and regroup the text into paragraph-level chunks.

    Args:
        cleaned_chunk_list: Texts in document order.
        overlapped: Number of words consecutive items are expected to share.
            0 means the items do not overlap and each one is kept as a unit
            when the text is re-joined.

    Returns:
        A tuple `(chunks, cleaned_chunk_list)`: `chunks` are the paragraphs
        obtained by splitting the re-joined text on blank lines (short ones
        merged into their predecessor); `cleaned_chunk_list` is the output of
        `_split_large_chunks` on the de-overlapped input.

    Note:
        The overlap search picks its probe words with `random`, so the exact
        cut positions can vary between runs.
    """
    if overlapped != 0:
        # Drop trailing non-word characters from every chunk before matching overlaps.
        cleaned_chunk_list = [re.sub(r'\W*$', '', chunk) for chunk in cleaned_chunk_list]
    
    def _resolve_overlap(t0:str, t1:str)->Tuple:
        """Trim the end of `t0` and/or the start of `t1` so the shared text is not duplicated; returns the trimmed pair."""
        t0_wordlist = split_text(t0)
        # Character length of the last `overlapped` words of t0: the window in which the overlap is searched.
        search_length = len(join_texts(t0_wordlist[-overlapped:]))
        search_start_pos = max(1, len(join_texts(t0_wordlist[-overlapped//5:]))) # Give the "leftOver" enough words
        
        # The slice has step -1, i.e. it walks t0 backwards, so the first regex hit is the
        # punctuation mark closest to the end of t0 inside the search window.
        t0_match = re.search(r'(\n\n )*[\.!?;:,*/-]', t0[-search_start_pos:-search_length:-1])
        if not t0_match:
            search_start_pos = max(1, len(join_texts(t0_wordlist[int(-overlapped//1.5):]))) # Give the "leftOver" more words
            t0_match = re.search(r'(\n\n )*[ ]', t0[-search_start_pos:-search_length:-1]) # Start from any word
        
        t1_match_start = 0

        if t0_match:
            # Map the position found in the reversed slice back to an index in t0.
            t0_endAt = len(t0) - t0_match.start() - search_start_pos + 1
            # Words of t0 after the cut point; they are looked up at the start of t1.
            leftOver = [word for word in split_text(t0[t0_endAt:]) if word != '']
            num_search_word = 8
            print('leftOver: ', leftOver)
            while num_search_word>0 and len(leftOver)>0:
                # Probe with a phrase of up to `num_search_word` words starting at a random position in the first half of leftOver.
                start_from_word_idx = random.randint(0, min(len(leftOver)-1, len(leftOver)//2))
                # print(f"len(leftOver) = {len(leftOver):,d}, start_from_word_idx = {start_from_word_idx:,d}")
                word_to_search = join_texts(leftOver[start_from_word_idx:start_from_word_idx+min(num_search_word, len(leftOver) - start_from_word_idx)])
                # print(f"word_to_search: {word_to_search}")
                search_in = t1[:search_length]
                # Case-insensitive substring search within the first `search_length` characters of t1.
                t1_match = search_in.lower().find(word_to_search.lower())
                # t1_match = re.search(word_to_search, t1[:search_length], re.IGNORECASE)
                if t1_match!=-1:
                    print(f"Matched!!! word_to_search: {word_to_search}")
                    # Find matching in the next chunk - easy to resolve the overlap
                    t1_match_start = t1_match # t1_match.start()
                    # Keep the leftOver words that precede the matched phrase in t0.
                    t0_endAt += len(join_texts(leftOver[:start_from_word_idx])) + 1
                    break
                # On a miss, shorten the probe phrase with probability 1/(10-num_search_word).
                if random.random() < 1/(10-num_search_word):
                    num_search_word -= 1
            if num_search_word == 0:
                # Can't find matching in the next chunk
                print(f"No matching for \"{leftOver}\"")
                t0_endAt = len(t0) - len(join_texts(t0_wordlist[-int(overlapped*2/3):])) # remove 2/3 the overlap
                t1_match_start = 0
        else:
            t0_endAt = len(t0) - len(join_texts(t0_wordlist[-int(overlapped*2/3):])) # remove 2/3 the overlap
            t1_match_start = 0
    
        # Remove overlap
        print("Original end of t0: ", t0[-search_length:])
        print("Original start of t1: ", t1[:search_length])
        t0 = t0[:t0_endAt].strip()
        t1 = t1[t1_match_start:].strip()
        print("End of t0: ", t0[-100:])
        print("Start of t1: ", t1[:100])
        print('#'*50)
        return t0, t1

    # Resolve every adjacent pair in order; the trimmed t1 becomes the t0 of the next pair.
    for i in range(len(cleaned_chunk_list)-1):
        t0 = cleaned_chunk_list[i]
        t1 = cleaned_chunk_list[i+1]
        if overlapped != 0:
            # remove overlaps
            t0, t1 = _resolve_overlap(t0, t1)
        cleaned_chunk_list[i] = t0.strip()
        cleaned_chunk_list[i+1] = t1.strip()

    cleaned_chunk_list = _split_large_chunks(cleaned_chunk_list)
    chunks = []
    n_appended_short_chunk = 0 # number of consecutive short pieces merged into the previous chunk
    joined_chunk_list = join_texts(cleaned_chunk_list) # This is the first round chunking that some items in the "cleaned_chunk_list" might be splitted arbitrarily
    if overlapped == 0: # This is NOT the first round chunking and each item in the "cleaned_chunk_list" should be regarded as a "whole chunk" already
        joined_chunk_list = '\n\n'.join(cleaned_chunk_list)
    # Paragraph boundaries are runs of two or more newlines.
    for c in re.split(r'[\n]{2,}', joined_chunk_list):
        s_c = c.strip()
        if strlen(s_c) < 15 and n_appended_short_chunk<5: # Combined with previous chunk if it's too short
            n_appended_short_chunk+=1
            if len(chunks)>0:
                chunks[-1] = chunks[-1].strip() + ' ' + s_c
            else:
                chunks.append(s_c)
        else:
            n_appended_short_chunk=0
            chunks.append(s_c)
    print("Length of splited chunks, in words:", [strlen(c) for c in chunks], "num_of_tokens:", [get_num_tokens(c) for c in chunks], f", Longest one with length: {max([strlen(c) for c in chunks]):,d} words ({max([get_num_tokens(c) for c in chunks]):,d} tokens)")    
    print("Length of cleaned_chunk_list, in words:", [strlen(c) for c in cleaned_chunk_list], "num_of_tokens:", [get_num_tokens(c) for c in cleaned_chunk_list], f", Longest one with length: {max([strlen(c) for c in cleaned_chunk_list]):,d} words ({max([get_num_tokens(c) for c in cleaned_chunk_list]):,d} tokens)")

    return chunks, cleaned_chunk_list

# Get topic groups

## Get Chunk embeddings from OpenAI

[OpenAI Create Embeddings](https://platform.openai.com/docs/api-reference/embeddings)

## Embedding based text similarity

In [12]:
import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

@retry(wait=wait_random_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(6))
def embedding(texts:List[str], doc_embedding:bool=True): # Andrew mentioned that the prompt/ completion paradigm is preferable for this class
    """Embed text with the OpenAI embedding model named by `Parameters.EMBEDDING_MODEL`.

    Args:
        texts: A list of texts when `doc_embedding` is True; otherwise the
            value is handed as-is to `embed_query`.
        doc_embedding: Use `embed_documents` (True) or `embed_query` (False).

    Returns:
        A NumPy array built from the returned embedding(s).

    The call is retried with random exponential backoff (up to 6 attempts).
    """
    # help: https://platform.openai.com/docs/api-reference/embeddings
    model = Parameters.EMBEDDING_MODEL # https://platform.openai.com/docs/models/embeddings
    tic = time.time()
    # Pass the configured model explicitly; it used to be read but never handed to the client.
    embed_func = OpenAIEmbeddings(model=model, openai_api_key=OPENAI_API_KEY) # https://python.langchain.com/en/latest/modules/models/text_embedding/examples/openai.html
    if doc_embedding:
        emds = embed_func.embed_documents(texts)
        num_items = len(texts)
    else:
        emds = embed_func.embed_query(texts)
        num_items = 1 # a query is a single text; len() of it would count characters
    toc = time.time()
    print(f"Time consumed for embedding {num_items:,d} chunks is: {toc-tic:,.1f} seconds")
    return np.array(emds)

In [13]:
# Get similarity matrix between the embeddings of the chunk summaries
def get_text_similarity(text_embeds, bonus_constant:float=.25, bonus_power:float=1.):
    """Build the pairwise cosine-similarity matrix of `text_embeds` and a proximity-adjusted copy.

    Args:
        text_embeds: 2-D array with one embedding per row.
        bonus_constant: Bonus added for directly adjacent rows.
        bonus_power: Exponent applied to the index distance; the bonus for two
            rows `d` positions apart is `1 / d**bonus_power * bonus_constant`.

    Returns:
        `(similarity, adjusted)`: the symmetric cosine-similarity matrix and
        the same matrix plus the proximity bonus (the diagonal gets no bonus).
    """
    count = text_embeds.shape[0]

    # Fill the upper triangle (diagonal included) and mirror each value.
    similarity = np.full((count, count), np.nan)
    upper_rows, upper_cols = np.triu_indices(count)
    for row, col in zip(upper_rows.tolist(), upper_cols.tolist()):
        score = 1- cosine(text_embeds[row], text_embeds[col])
        similarity[row, col] = score
        similarity[col, row] = score

    # Closer neighbors, even with the same similarity score, are regarded as
    # "more similar": the bonus shrinks with the distance between positions.
    bonus = np.zeros_like(similarity)
    for row, col in np.ndindex(*bonus.shape):
        gap = abs(row - col)
        if gap:
            bonus[row, col] = 1 / gap ** bonus_power * bonus_constant

    adjusted = similarity + bonus
    return similarity, adjusted

## Get Topics

In [14]:
def form_topics_via_topics_title(chunks:List[str], topics_title:List[List[int]]):
    """Build one text per topic by joining the chunks whose indices the topic lists.

    Also prints the size of every topic in words and in tokens.
    """
    topics = [join_texts([chunks[idx] for idx in members]) for members in topics_title]
    word_counts = [strlen(topic) for topic in topics]
    token_counts = [get_num_tokens(topic) for topic in topics]
    print('Leng of topics, in words:', word_counts, "in tokens:", token_counts)
    return topics

## Similarity based Cluster - Topics

In [15]:
import networkx as nx
from networkx.algorithms import community

In [16]:
def plot_heatmap(list_data:list, title:str):
    """Draw `list_data` as a one-row heatmap with a thin black line between neighbouring cells.

    `list_data` is expected to look like [0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5].
    """
    single_row = np.array(list_data).reshape(1, -1)
    plt.figure(figsize = (10, 4))
    plt.title(title)
    plt.imshow(single_row, cmap = 'tab20')
    # Cell k is centred on x = k, so the boundary before cell k sits at k - 0.5.
    boundaries = [cell - 0.5 for cell in range(1, len(list_data))]
    for x_pos in boundaries:
        plt.axvline(x = x_pos, color = 'black', linewidth = 0.5)

def _get_longest_topic_length(topics_title:List[List[int]], chunks:List[str]):
    """Find the largest topic, measured both in tokens and in words.

    Returns:
        `(token_index, max_tokens, word_index, max_words)`: the index of the
        first topic with the most tokens and that token count, then the same
        for words.
    """
    token_totals = [sum(get_num_tokens(chunks[idx]) for idx in members) for members in topics_title]
    word_totals = [sum(strlen(chunks[idx]) for idx in members) for members in topics_title]
    max_tokens = max(token_totals)
    max_words = max(word_totals)
    return token_totals.index(max_tokens), max_tokens, word_totals.index(max_words), max_words

# Run the community detection algorithm to get something like: [{0, 1, 2, 3,}, {4, 5}, {6, 7, 8}, {9, 10, 11}]
def get_topics_title(text_embeds, num_topics:int, chunks:List[str], bonus_constant:float=0.25, min_size:int=3, 
               bonus_power:float=1., resolution:float=1., resolution_step:float=.1):
    """Cluster chunks into topics by running Louvain community detection on their similarity graph.

    Louvain is re-run, with `resolution` adjusted between attempts, until the number
    of communities lies in [num_topics - 1, upper_bar) and the largest topic has no
    more tokens than max(longest single chunk, Parameters.SUMMARIZE_TOPIC_TOKENS).
    `upper_bar` starts at num_topics + 4 and may be widened inside the loop.

    NOTE(review): the loop has no iteration cap and part of its adjustments are
    random, so termination is not guaranteed by the code itself.

    Args:
        text_embeds: Chunk embeddings, handed unchanged to `get_text_similarity`.
        num_topics: Desired number of topics.
        chunks: Chunk texts; used only to measure sizes in tokens and words.
        bonus_constant: Not used anywhere in this function.
        min_size: Not used anywhere in this function.
        bonus_power: Not used anywhere in this function.
        resolution: Starting value of the Louvain `resolution` argument.
        resolution_step: Amount added to `resolution` after every Louvain run.

    Returns:
        The communities of the last Louvain run: a list of topics, each a sorted
        list of chunk indices, e.g. [[0, 1, 2, 3], [4, 5], [6, 7, 8]]. The topics
        themselves are returned in the order Louvain produced them.
    """

    longest_chunk_size = max([get_num_tokens(chunk) for chunk in chunks]) # The num of tokens of the longest chunk
    # Only the second return value is used - presumably the proximity-adjusted similarity matrix; confirm against get_text_similarity
    _, similarity_matrix = get_text_similarity(text_embeds)

    # Weighted graph: one node per chunk, edge weights taken from the matrix
    title_nx_graph = nx.from_numpy_array(similarity_matrix)

    desired_num_topics = num_topics
    # Store the accepted partitionings
    # NOTE(review): this list is never read or written again in this function
    topics_title_accepted = []

    # Find the resolution that gives the desired number of topics
    topics_title = []
    idx = 1
    lower_bar = desired_num_topics -1
    upper_bar = lower_bar + 5
    print(f"number of desired topics, lower_bar = {lower_bar:,d}, upper_bar = {upper_bar:,d}")
    longest_topic_length = 9e9 # TODO
    num_retry_split_large_chunk = 0
    # Keep going while the topic count is out of range OR the largest topic is still too long.
    # A topic may never be shorter than its longest chunk, hence the max(...) on the size limit.
    while len(topics_title) not in range(lower_bar, upper_bar) or longest_topic_length>max(longest_chunk_size, Parameters.SUMMARIZE_TOPIC_TOKENS):
        if len(topics_title) >= upper_bar:
            # Too many topics: pull the resolution back. 90% of the time by 10%, otherwise by 50%;
            # the step shrinks by 10% either way until it reaches 1e-3.
            old_resolution, old_resolution_step = resolution, resolution_step
            if random.random() < 0.9:
                resolution *= 0.9
                if resolution_step > 1e-3:
                    resolution_step *= 0.9
            else:
                resolution *= 0.5
                if resolution_step > 1e-3:
                    resolution_step *= 0.9
            print(f"Adjusted resolution from {old_resolution:.4f} to {resolution:.4f}, resolution_step from {old_resolution_step:.4f} to {resolution_step:.4f}, at step: {idx:,d}, because we have {len(topics_title):,d} topics which is >= upper_bar ({upper_bar:,d}).")
        topics_title = community.louvain_communities(title_nx_graph, weight = 'weight', resolution = resolution)
        topics_title = [sorted(topic_title) for topic_title in topics_title] # Make the topic items sorted from: [{19, 20, 22, 21}, {24, 23}] to [{19, 20, 21, 22}, {23, 24}]
        # print("Sorted: ", topics_title)
        # The resolution is raised after every run, whatever the outcome of that run was
        resolution += resolution_step
        if idx % 100 == 0:
            # import ipdb; ipdb.set_trace()
            print(f"idx = {idx:04d}, len(topics_title) = {len(topics_title):d}, resolution = {resolution:.4f}")
        idx += 1
        _, longest_topic_length, _, _  = _get_longest_topic_length(topics_title, chunks)
        # Topic count is fine but the largest topic is too long: jitter the resolution and retry.
        # NOTE(review): this test uses SUMMARIZE_TOPIC_TOKENS alone, while the loop condition above
        # also tolerates topics up to longest_chunk_size, so this branch can fire on the final iteration.
        if len(topics_title) in range(lower_bar, upper_bar) and longest_topic_length > Parameters.SUMMARIZE_TOPIC_TOKENS:
            old_resolution = resolution
            resolution = random.normalvariate(old_resolution, old_resolution*.1)
            print(f"Adjusted resolution from {old_resolution:.4f} to {resolution:.4f}, at step: {idx:,d}, because, though topics number in range, we have longest_topic_length as: {longest_topic_length:,d}, \
            which is > Parameters.SUMMARIZE_TOPIC_TOKENS ({Parameters.SUMMARIZE_TOPIC_TOKENS:,d}).")
            if num_retry_split_large_chunk > 2: # After a few failed retries, allow one more topic so the largest one can be split
                upper_bar += 1
                print(f"Adjusted upper_bar from: {upper_bar-1:,d} to {upper_bar} because we have tried {num_retry_split_large_chunk:,d} times to split the largets topic smaller. The new upper_bar={upper_bar:,d}.")
                num_retry_split_large_chunk = 0
            else:
                num_retry_split_large_chunk += 1

    def _get_topics(topics_title:list):
        # Builds a per-chunk topic id for plotting. It works on copies, so the outer
        # `topics_title` (the value returned below) is not reordered by this helper.
        topic_id_means = [sum(e)/len(e) for e in topics_title]
        # Arrange title_topics in order of topic_id_means
        topics_title = [list(c) for _, c in sorted(zip(topic_id_means, topics_title), key = lambda pair: pair[0])]
        for t in topics_title:
            t.sort()
        # Create an array denoting which topic each chunk belongs to
        chunk_topics = [None] * similarity_matrix.shape[0]
        for i, c in enumerate(topics_title):
            for j in c:
                chunk_topics[j] = i
        return {'chunk_topics': chunk_topics, 'topics': topics_title}
    
    plot_heatmap(_get_topics(topics_title)['chunk_topics'], title='Chunk similarity based Topics')
    print(f"We have totally {len(topics_title):,d} topics detected: {topics_title}")
    print(f"resolution = {resolution:.4f}, resolution_step = {resolution_step:.4f}")
    longest_topic_index, longest_topic_tokens, longest_word_topic_index, longest_topic_word_num = _get_longest_topic_length(topics_title, chunks)
    # The topic with the most tokens need not be the one with the most words; report both when they differ
    if (longest_topic_index == longest_word_topic_index):
        print(f"Longest topic contains: {longest_topic_tokens:,d} tokens ({longest_topic_word_num:,d} words), at topic No. {longest_topic_index:,d}, starting from index 0.")
    else:
        print(f"Longest topic contains: {longest_topic_tokens:,d} tokens at: No. {longest_topic_index:,d} ({longest_topic_word_num:,d} words at No. {longest_word_topic_index:,d}), starting from index 0.")
    return topics_title

## Get Topic Groups

In [17]:
def get_topic_groups(topics:List[str]):
    """Pack consecutive topics into groups that are small enough to be summarized in one go.

    The total token count is split into roughly equal groups, none larger than
    Parameters.SUMMARIZE_TOPIC_TOKENS. Topics keep their order and are never split.
    If the last two groups together stay below the limit they are merged.

    Args:
        topics: The topic texts, in document order.

    Returns:
        A list of group texts, each being the join of one or more consecutive
        topics. Empty when `topics` is empty.

    Raises:
        RuntimeError: If a single topic has more than
            Parameters.SUMMARIZE_TOPIC_TOKENS tokens and so cannot fit in any group.
    """
    token_limit = Parameters.SUMMARIZE_TOPIC_TOKENS
    # Count every topic once instead of re-tokenizing it inside the packing loops
    topic_tokens = [get_num_tokens(topic) for topic in topics]
    for position, num_tokens in enumerate(topic_tokens):
        if num_tokens > token_limit:
            raise RuntimeError(f"Topic No. {position:,d} has {num_tokens:,d} tokens, which is more than Parameters.SUMMARIZE_TOPIC_TOKENS ({token_limit:,d}).")

    total_tokens = get_num_tokens(join_texts(topics))
    intended_num_of_topic_groups = total_tokens // token_limit + 1
    # Aim for evenly sized groups rather than filling each one up to the hard limit
    MAX_TOPIC_GROUP_LENGTH = min(token_limit, total_tokens // intended_num_of_topic_groups + 1)

    topic_groups = []
    i = 0
    while i < len(topics):
        # Every group takes at least one topic. A topic that is larger than the even-split
        # target (but within the hard limit checked above) gets a group of its own; without
        # this the index would never advance and the loop would never end.
        _topic_group = [topics[i]]
        topic_group_length = topic_tokens[i]
        i += 1
        while i < len(topics) and topic_group_length + topic_tokens[i] <= MAX_TOPIC_GROUP_LENGTH:
            _topic_group.append(topics[i])
            topic_group_length += topic_tokens[i]
            i += 1
        topic_groups.append(join_texts(_topic_group))

    # A small trailing group is folded into its predecessor when both fit under the hard limit
    if len(topic_groups)>=2 and get_num_tokens(topic_groups[-2]) + get_num_tokens(topic_groups[-1]) < token_limit:
        topic_groups[-2] = join_texts(topic_groups[-2:])
        topic_groups = topic_groups[:-1]

    words_topic_groups = [strlen(tg) for tg in topic_groups]
    tokens_topic_groups = [get_num_tokens(tg) for tg in topic_groups]
    if topic_groups:
        print("Length of topic groups, in words:", words_topic_groups, "in tokens:", tokens_topic_groups, f" The longest one has: {max(words_topic_groups):,d} words ({max(tokens_topic_groups):,d} tokens)")
    else:
        # max() of an empty list would raise; there is simply nothing to report
        print("Length of topic groups, in words:", words_topic_groups, "in tokens:", tokens_topic_groups)
    return topic_groups

In [18]:
# Scratch check of the "merge the last two groups" rule from get_topic_groups,
# applied to plain token counts instead of texts.
aaa = [7118, 5818, 7731, 998]
if len(aaa) >= 2 and sum(aaa[-2:]) < Parameters.SUMMARIZE_TOPIC_TOKENS:
    # Replace the last two counts by their sum
    aaa = aaa[:-2] + [sum(aaa[-2:])]
print(aaa)

[7118, 5818, 8729]


# Summarize each topic group

In [19]:
def summarize_topic_groups(topic_groups:List[str], summary_length:int=Parameters.TOPIC_SUMMARY_LENGTH, batch_size:int=8, temperature:float=0.)->List[str]:
    """Summarize every topic group through `batch_summary` and report sizes and timing.

    Args:
        topic_groups: The texts to summarize.
        summary_length: Passed to `batch_summary` as `summary_length`.
        batch_size: Passed to `batch_summary` as `batch_size`.
        temperature: Passed to `batch_summary` as `temperature`.

    Returns:
        Whatever `batch_summary` returns for `topic_groups` (iterated here as one
        summary text per element).
    """
    started = time.time()
    phase_2_summaries = batch_summary(topic_groups, summary_length=summary_length, temperature=temperature, batch_size=batch_size)
    finished = time.time()

    word_counts = [strlen(summary) for summary in phase_2_summaries]
    token_counts = [get_num_tokens(summary) for summary in phase_2_summaries]
    print("The length of summarized-topic-groups, in words:", word_counts, "in tokens:", token_counts, "Summarized total words:", sum(word_counts), f"(total tokens: {sum(token_counts):,d})")
    print(f"Time consumed for topic groups summarization: {finished-started:,.1f}s")
    return phase_2_summaries

# Final Summarization

## [Call OpenAI Functions](https://platform.openai.com/docs/guides/gpt/function-calling)

In [20]:
# to_summary = """In this article, the author discusses the use case for topic modeling and how it can be applied to various types of content, such as YouTube videos, podcasts, meeting notes, legal documents, movie scripts, books, and lecture notes. The author highlights the manual work involved in extracting topics from these sources and emphasizes the value of structured data in these contexts.\n\nThe author suggests that there is an opportunity to create a productionized service for extracting topics from podcasts and videos. They provide an example of a podcast website that does not have topics listed on their episodes, and propose the idea of offering a service to extract and provide topics for these episodes. This approach could be replicated for other podcasts, videos, or any series of information.\n\nThe author then introduces their tutorial on topic modeling, specifically focusing on a two-pass approach. In the first pass, they use a mapreduce method to process the entire document and extract topics and bullet points. They explain that this approach may be a bit expensive in terms of computational resources, but it allows for a comprehensive analysis of the text. In the second pass, they iterate through each topic bullet point and expand on them using a retrieval method. This involves a question and answer-like process to provide more detailed information and context.\n\nThe author makes several assumptions, including the absence of a table of contents or contents, and the desire for more control over the topic modeling process. They emphasize the importance of learning the ins and outs of building with AI and encourage readers to experiment with their own use cases.\n\nThe article then delves into the technical implementation of the topic modeling process. The author imports various packages and sets up two language models, GPT 3.5 Turbo and GPT 4. 
They load the transcript that will be parsed and split it into chunks using a recursive character text splitter. The author explains the reasoning behind the chunk size and overlap parameters, highlighting the need for experimentation based on individual use cases.\n\nNext, the author focuses on extracting topic titles and short descriptions. They create a custom prompt to instruct the language model on the desired output. They provide examples and formatting guidelines to ensure accurate extraction of topics. The author then runs the mapreduce method using the GPT 4 language model and consolidates the results to eliminate duplicates.\n\nThe article continues with the conversion of the extracted topics into structured data. The author defines a schema with properties for topic name, description, and tag. They demonstrate the structured data output for the extracted topics, showcasing the potential value of this approach.\n\nMoving on to the second pass of topic expansion, the author introduces the concept of using a retrieval method and the vector store dance. They explain that this approach allows for generating longer descriptions or summaries based on specific topics. The author sets up a custom prompt for this process and creates embeddings using the OpenAI embeddings engine. They initialize Pinecone, a remote vector store, and set up the retrieval QA process.\n\nThe author demonstrates how to iterate through the structured topics and generate expanded summaries using the retrieval method. They provide examples of the expanded topics, showcasing the additional information and context that can be obtained through this process.\n\nFinally, the author explores the topic of extracting chapters or timestamps from a transcript. They create a custom prompt for this purpose and use the retrieval QA process to find the first timestamp associated with each topic. 
The author provides an example of the retrieved timestamps and highlights the relevance of this information in organizing and navigating through content.\n\nIn conclusion, the article presents a comprehensive overview of topic modeling and its applications in various contexts. The author provides step-by-step instructions and technical details for implementing a two-pass approach to extract and expand on topics. They emphasize the value of structured data and the potential for creating productionized services based on topic modeling."""
# num_highlights = 5
# func_name = "output_key_information"

In [39]:
def highlight_with_openai_function(to_summary:str, num_highlights:int, temperature:float=0.):
    """Extract title, short synopsis and key points from a text via OpenAI function calling.

    The text is sent as a single user message together with the schema of one
    function, `output_key_information`; the arguments the model supplies for that
    function are the result. Token usage of the request is appended to the global
    list `openai_usages` (created on first use).

    Args:
        to_summary: The text to extract the highlights from.
        num_highlights: Maximum number of bullet points requested in the schema.
        temperature: Sampling temperature passed to the API.

    Returns:
        A tuple (function_args, response): the function-call arguments decoded
        from JSON, and the raw ChatCompletion response.

    Raises:
        ValueError: If the model replied without calling the function, or if the
            arguments it supplied are not valid JSON.
    """
    tic = time.time()

    messages = [{"role": "user", "content": to_summary}]
    num_tokens = _num_tokens_from_messages(messages, model=Parameters.COMPLETION_MODEL)
    # The short-context model is used only when the prompt leaves at least 1,300 tokens of
    # its context free (presumably for the function schema plus the completion).
    if num_tokens > Parameters.PHASE_1_CONTEXT_LENGTH-1300:
        model = Parameters.COMPLETION_MODEL
    else:
        model = Parameters.PHASE_1_MODEL

    print(f"Number of tokens for final highlight: {num_tokens:,d} (words: {strlen(to_summary):,d}). Highlight model: {model}")

    # NOTE: two property names contain a literal backslash before the space. It is written
    # here as an explicit `\\` (same runtime value as before) because a bare `\ ` is an
    # invalid escape sequence that newer Python versions warn about.
    functions = [
        {
            "name": "output_key_information",
            "description": "Output key information including: title, short synopsis and bulleted key points from the given text.",
            "parameters": {
                "type": "object",
                "properties": {
                    "Title": {"type": "string", "description": "The title of the given text"},
                    "Short\\ Synopsis": {"type": "string", "description": "A very short and concise summary of the given text."},
                    "Key\\ Points": {
                        "type": "string",
                        "description": f"A list of maximum of {num_highlights} succinct bullet points, each prefixed by a proper emoji.",
                    },
                },
                "required": ["Title", "Short\\ Synopsis", "Key\\ Points"],
            },
        }
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages, 
        temperature=temperature,
        functions=functions,
        function_call="auto",  # auto is default, but we'll be explicit
    )

    # Record the token usage of this request in the notebook-wide usage log
    usage = response.get('usage')
    if usage:
        global openai_usages
        if 'openai_usages' not in globals():
            openai_usages = []
        usage = dict(usage)
        usage["model"] = response.get('model')
        usage["object"] = response.get('object')
        openai_usages.append(usage)
    response_message = response["choices"][0]["message"]

    # With function_call="auto" the model may answer in plain text instead of calling the function
    function_call = response_message.get("function_call")
    if not function_call:
        print(response_message)
        raise ValueError("The model replied without calling `output_key_information`; no highlight could be extracted.")
    try:
        function_args = json.loads(function_call.get("arguments"))
    except (TypeError, ValueError) as e:  # json.JSONDecodeError is a ValueError; TypeError covers missing arguments
        print(response_message)
        print(f"An error occurs: {e}")
        raise ValueError("The arguments of the `output_key_information` call are not valid JSON.") from e

    toc = time.time()
    print(f"Time consumed for final highlight: {toc-tic:.1f}s")
    return function_args, response

In [22]:
def get_final_highlight(final_summary:str, num_highlights:int=7, temperature:float=0.):
    """Turn a summary into a markdown highlight section (one `####` heading per extracted field).

    Args:
        final_summary: The text handed to `highlight_with_openai_function`.
        num_highlights: Maximum number of key points to request.
        temperature: Sampling temperature for the request.

    Returns:
        The extracted fields as markdown: for each field a heading with its word
        count followed by its text, fields separated by a blank line.
    """
    final_result, response = highlight_with_openai_function(final_summary, num_highlights=num_highlights, temperature=temperature)

    # Size of every extracted field
    print("Final highlight:")
    for field, text in final_result.items():
        print(f"\t{field}: {strlen(text):,d} words ({get_num_tokens(text):,d} tokens)")

    # Token usage reported by the API for this request
    print("OpenAI usage for final highlight:")
    usage = dict(response.usage)
    for counter_name in usage:
        print(f"\t{counter_name}: {usage[counter_name]:,d}")

    sections = []
    for field, text in final_result.items():
        heading = "#### " + field + f" ({strlen(text):,d} words)"
        sections.append(heading + "\n" + text)
    return '\n\n'.join(sections)

In [23]:
# final_highlights = get_final_highlight(to_summary, num_highlights=5, temperature=.2)

# print(final_highlights)

## Normal way to do final summary

In [24]:
def get_final_result(topic_groups_summarizations:List[str], num_highlights:int=7, temperature:float=0.):
    """Produce the final report: highlights followed by a detailed summary.

    The given texts are joined and summarized once more; the highlights are then
    extracted from that summary.

    Args:
        topic_groups_summarizations: The texts to condense into the final summary.
        num_highlights: Maximum number of key points in the highlight section.
        temperature: Sampling temperature for both requests.

    Returns:
        The highlight section followed by a "Detailed Summary" section, as one string.
    """
    joined_text = join_texts(topic_groups_summarizations)
    num_words = strlen(joined_text)
    num_tokens = get_num_tokens(joined_text)
    print(f"To summary text length: {num_words:,d} (num of tokens: {num_tokens:,d})")

    # The short-context model is chosen only when at least 1,000 tokens of its context stay free
    fits_short_context = num_tokens < Parameters.PHASE_1_CONTEXT_LENGTH-1000
    summary_model = Parameters.PHASE_1_MODEL if fits_short_context else Parameters.COMPLETION_MODEL
    summary_context_length = Parameters.PHASE_1_CONTEXT_LENGTH if fits_short_context else Parameters.CONTEXT_LENGTH

    started = time.time()
    summaries = batch_summary([joined_text], temperature=temperature, batch_size=1, summary_length=Parameters.FINAL_SUMMARY_LENGTH, model=summary_model, context_length=summary_context_length)
    final_summary = summaries[0]
    finished = time.time()
    print(f"Time consumed for final summarization: {finished-started:,.1f}s")

    summary_words = strlen(final_summary)
    summary_tokens = get_num_tokens(final_summary)
    print(f"To highlight text length: {summary_words:,d} (num of tokens: {summary_tokens:,d})")

    final_highlight = get_final_highlight(final_summary, temperature=temperature, num_highlights=num_highlights)
    final_result = final_highlight + f"\n\n#### Detailed Summary ({summary_words:,d} words)\n{final_summary}"

    print(f"Length of final highlight, in words: {strlen(final_highlight):,d}, in tokens: {get_num_tokens(final_highlight):,d}")
    print(f"Length of final summary, in words: {summary_words:,d}, in tokens: {summary_tokens:,d}")
    print(f"Length of final result, in words: {strlen(final_result):,d}, in tokens: {get_num_tokens(final_result):,d}")
    return final_result

# Main

## do_summary method

In [25]:
def do_summary(txt:str, output_file:str, num_highlights:int=7):
    """Summarize a text, print the result, save it to `output_file` and return it.

    Texts shorter than Parameters.SUMMARIZE_TOPIC_TOKENS tokens are summarized
    directly. Longer texts are chunked and cleaned up, then repeatedly clustered
    into topics, grouped and summarized until the joined summaries fall below that
    limit; the final result is produced from those summaries.

    Args:
        txt: The text to summarize.
        output_file: Path of the file the final result is written to (UTF-8).
            Missing parent directories are created.
        num_highlights: Maximum number of key points in the highlight section.

    Returns:
        The final result text (highlights plus detailed summary).
    """
    # Collapse runs of spaces into a single space
    txt = re.sub(r'[ ]+', ' ', txt)
    num_txt_words = strlen(txt)
    print(f"Num of words: {num_txt_words:,d}")
    num_txt_tokens = get_num_tokens(txt)
    print(f"Num of tokens: {num_txt_tokens:,d}")
    # NOTE: this rebinds a class attribute, so an enlarged chunk length also applies to later calls
    Parameters.PHASE_1_CHUNK_LEN = max(Parameters.PHASE_1_CHUNK_LEN, min(2000, num_txt_tokens // (Parameters.batch_size+1)))

    if num_txt_tokens < Parameters.SUMMARIZE_TOPIC_TOKENS:
        final_result = get_final_result([txt], num_highlights=num_highlights, temperature=0.)
    else:
        # Get initial chunks
        phase_1_chunk_list = get_phase_1_chunks(txt)

        # Clean up text via OpenAI API
        tic = time.time()
        cleaned_chunk_list0 = batch_cleanup(chunks=phase_1_chunk_list, temperature=0., batch_size=Parameters.batch_size)
        toc = time.time()
        print("Length of cleaned chunks, in words:", [strlen(c) for c in cleaned_chunk_list0], "in tokens:", [get_num_tokens(c) for c in cleaned_chunk_list0])
        print(f"Time consumed for cleanup by OpenAI: {toc-tic:,.1f}s")

        # The overlap is applied on the first pass only; later passes work on summaries
        overlapped = Parameters.PHASE_1_CHUNK_OVERLAY

        while (get_num_tokens(join_texts(cleaned_chunk_list0)) > Parameters.SUMMARIZE_TOPIC_TOKENS): # Keep the process until the to_summary_text is short enough
            # Get chunks
            chunks, cleaned_chunk_list = get_chunks(cleaned_chunk_list0, overlapped=overlapped)

            # Get embedding for the chunks
            chunks_embedding = embedding(chunks)

            # The desired topic number is derived from the ORIGINAL text, but on later passes the
            # chunks come from much shorter summaries. A partition cannot have more communities
            # than chunks, so cap the target to keep the search in get_topics_title satisfiable.
            desired_num_topics = min(strlen(txt) // Parameters.AVERAGE_TOPIC_WORDS, len(chunks))

            # Get topic titles (something like: [[0, 1, 2, 3,], [4, 5], [6, 7, 8], [9, 10, 11]])
            topics_title = get_topics_title(text_embeds=chunks_embedding, num_topics=desired_num_topics, chunks=chunks, )

            # Get topics and topic_groups
            topics = form_topics_via_topics_title(chunks, topics_title)
            topic_groups = get_topic_groups(topics)

            # Get summarizations of each group via OpenAI API
            phase_2_summaries = summarize_topic_groups(topic_groups, summary_length=Parameters.TOPIC_SUMMARY_LENGTH, temperature=0.2, batch_size=Parameters.batch_size)
            cleaned_chunk_list0 = phase_2_summaries
            overlapped=0

        # Final result
        final_result = get_final_result(cleaned_chunk_list0, num_highlights=num_highlights, temperature=0.2)

    # Output final result
    print('\n\n')
    print(final_result)

    # The default output paths live under "results/", which may not exist yet
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Explicit UTF-8: the result can contain emoji and non-ASCII text, which the
    # platform default encoding is not guaranteed to be able to write
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_result)

    usages = openai_usage_stat()
    print("\n#### OpenAI Usage:")
    print("Token Usage:", usages.get('TokenUsage'))
    print(f"Cost: {usages.get('Dollar_Usage')*100:,.2f} US cents")
    return final_result

## Different data sources

### Youtube

In [26]:
def do_youtube_summary(video_id:str, num_highlights:int=7, output_file:str=None, start_pos:int=None, end_pos:int=None):
    """Summarize the transcript of a YouTube video.

    Args:
        video_id: The id handed to `get_transcript`.
        num_highlights: Maximum number of key points in the highlight section.
        output_file: Where to save the result; defaults to
            "results/youtube_<video_id>.txt".
        start_pos: Start of the transcript slice to summarize (None = beginning).
        end_pos: End of the transcript slice to summarize (None = end).

    Returns:
        The final result returned by `do_summary`.
    """
    target_file = output_file if output_file else f"results/youtube_{video_id}.txt"
    transcript = get_transcript(video_id)
    selected_text = transcript[start_pos:end_pos]
    return do_summary(selected_text, output_file=target_file, num_highlights=num_highlights)

### Bilibili

[https://github.com/huilongyeo/bilibiliGetSrt](https://github.com/huilongyeo/bilibiliGetSrt)

In [27]:
# class Bilibili:
#     def __init__(self, video_url:str):
#         self.video_url = video_url
#         pass
    
#     def get_subtitles(self):
#     """
#     根据字幕的json 获取CC字幕

#     """        
#     json_url=self.get_json_url()
#     if len(json_url)>0:
#         with open('{}.srt'.format(self.title),'w',encoding='utf-8') as f:
#             r=requests.get(json_url)
#             info=json.loads(r.text)['body']
#             for i in range(len(info)):
#                 subtitle_from=info[i]['from']
#                 subtitle_to=info[i]['to']
#                 content=info[i]['content']
#                 data=self.format_subtitle(subtitle_from,subtitle_to,content,i)
#                 f.write(data)

#     def format_subtitle(self,subtitle_from,subtitle_to,content,i):        
#         """
#         格式化成srt文件，形如：

#         1
#         00:00:01,035 --> 00:00:04,525
#         远离了平行线 看吧天气预报也不怎么准

#         """
#         subtitle_from=round(subtitle_from,3)                                              #四舍五入为三位小数
#         subtitle_to=round(subtitle_to,3)
#         begin=time.strftime("%H:%M:%S",time.gmtime(subtitle_from))+','+self.rectify(subtitle_from)        
#         end=time.strftime("%H:%M:%S",time.gmtime(subtitle_to))+','+self.rectify(subtitle_to)
#         data=str(i+1)+'\n'+begin+' --> '+end+'\n'+content+'\n\n'                          #格式化成srt字幕
#         return data

### Text file

In [28]:
def do_txtfile_summary(txt_path:str, num_highlights:int=7, output_file:str=None, start_pos:int=None, end_pos:int=None):
    """Summarize the content of a text file.

    Args:
        txt_path: Path handed to `get_txt_from_file`.
        num_highlights: Maximum number of key points in the highlight section.
        output_file: Where to save the result; defaults to
            "results/text_<file name without directory and last extension>.txt".
        start_pos: Start of the text slice to summarize (None = beginning).
        end_pos: End of the text slice to summarize (None = end).

    Returns:
        The final result returned by `do_summary`.
    """
    if not output_file:
        base_name = re.sub(r".*/", "", txt_path)      # drop everything up to the last "/"
        stem = re.sub(r"\.[^.]*$", "", base_name)     # drop the last extension
        output_file = f"results/text_{stem}.txt"

    full_text = get_txt_from_file(txt_path)
    selected_text = full_text[start_pos:end_pos]
    return do_summary(selected_text, output_file=output_file, num_highlights=num_highlights)

### PDF

In [29]:
%%capture
import sys
!{sys.executable} -m pip install PyPDF2 requests # aspose-words

import requests
from PyPDF2 import PdfReader
from urllib.parse import urlparse

def open_pdf(pdf_resource:str, startPage:int=1, endPage:int=None):
    """Open a PDF from a URL or a local path and extract the text of a page range.

    Args:
        pdf_resource: A URL (anything with both a scheme and a host) or a local file name.
        startPage: First page to extract, 1-based. Values below 1 are treated as 1.
        endPage: Last page to extract, 1-based and inclusive; None means up to the last page.

    Returns:
        A tuple (pdf, txt): the `PdfReader` object and the concatenated text of
        the selected pages.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    import io

    # Check if the input name is a URL or a local filename
    parsed_url = urlparse(pdf_resource)
    is_url = parsed_url.scheme != '' and parsed_url.netloc != ''

    if is_url:
        # Download into memory instead of going through a fixed 'temp.pdf' in the working
        # directory, which could clobber an existing file and was left behind on errors.
        response = requests.get(pdf_resource, timeout=60)
        # Fail here on 4xx/5xx instead of feeding an error page to the PDF parser
        response.raise_for_status()
        pdf = PdfReader(io.BytesIO(response.content))
    else:
        # The input name is a local filename
        pdf = PdfReader(pdf_resource)

    pages = pdf.pages
    print(f"The PDF has {len(pages):,d} pages.")

    # Convert the 1-based start page to a slice index; clamping avoids startPage=0
    # turning into index -1, which would silently select the last page only.
    first_index = max(startPage, 1) - 1
    txt = ''.join([page.extract_text() or '' for page in pages[first_index:endPage]])
    return pdf, txt

In [30]:
def do_pdf_summary(pdf_source:str, pdf_name:str, startPage:int=1, endPage:int=None, num_highlights:int=7, output_file:str=None, start_pos:int=None, end_pos:int=None):
    """Summarize a page range of a PDF given by URL or local path.

    Args:
        pdf_source: URL or local path handed to `open_pdf`.
        pdf_name: Human readable name of the document; used to build the default
            output file name. May be empty/None.
        startPage: First page to read (1-based), handed to `open_pdf`.
        endPage: Last page to read, handed to `open_pdf`.
        num_highlights: Maximum number of key points in the highlight section.
        output_file: Where to save the result. When omitted it defaults to
            "results/pdf_<pdf_name with '/' replaced by '_'>.txt", or, without a
            `pdf_name`, to "results/pdf_<file name of pdf_source without extension>.txt".
        start_pos: Start of the extracted-text slice to summarize (None = beginning).
        end_pos: End of the extracted-text slice to summarize (None = end).

    Returns:
        The final result returned by `do_summary`.
    """
    # Only derive a name when the caller did not choose one; previously a truthy
    # `pdf_name` overwrote an explicitly passed `output_file`.
    if not output_file:
        if pdf_name:
            safe_name = re.sub("/", "_", pdf_name)
            output_file = f"results/pdf_{safe_name}.txt"
        else:
            base_name = re.sub(r".*/", "", pdf_source)
            output_file = "results/pdf_" + re.sub(r"\.[^.]*$", "", base_name) + ".txt"
    # Get text
    pdf,  txt = open_pdf(pdf_source, startPage, endPage)
    print(f"We have {strlen(txt):,d} words ({get_num_tokens(txt):,d}) tokens in the PDF file.")
    # Do summary
    final_result = do_summary(txt[start_pos:end_pos], output_file=output_file, num_highlights=num_highlights)

    return final_result

# Testing

## Text file

In [43]:
# Drop the usage log left by earlier runs (presumably so the usage statistics cover this run only)
globals().pop('openai_usages', None)

tic = time.time()
# Candidate inputs; only the last assignment takes effect
txt_path = 'data/mls.txt'
txt_path = 'data/stateoftheunion.txt'
txt_path = 'data/deeplearning.ai/diffusion/01-intuition.txt'

final_result = do_txtfile_summary(txt_path, num_highlights=4, start_pos=None, end_pos=None)
toc = time.time()
print(f"\nAltogether time consumed: {toc-tic:,.1f} seconds")

Num of words: 784
Num of tokens: 1,001
To summary text length: 784 (num of tokens: 1,001)
batch_summary model = gpt-3.5-turbo-0613
Inside _get_completion_func: num_prompt_tokens = 1,124, max_tokens = 2,972, model = gpt-3.5-turbo-0613, context_length = 4,096
Time consumed for final summarization: 11.6s
To highlight text length: 415 (num of tokens: 454)
Number of tokens for final highlight: 462 (words: 415). Highlight model: gpt-3.5-turbo-0613
Time consumed for final highlight: 4.4s
Final highlight:
	Title: 5 words (6 tokens)
	Short Synopsis: 37 words (40 tokens)
	Key Points: 85 words (93 tokens)
OpenAI usage for final highlight:
	prompt_tokens: 564
	completion_tokens: 169
	total_tokens: 733
Length of final highlight, in words: 141, in tokens: 160
Length of final summary, in words: 415, in tokens: 454
Length of final result, in words: 561, in tokens: 621



#### Title (5 words)
Diffusion Modeling in Neural Networks

#### Short Synopsis (37 words)
Diffusion modeling is a technique used in

## Youtube

In [40]:
# if 'openai_usages' in globals():
#     del openai_usages

# tic = time.time()    
# video_id = 'UVn2NroKQCw' # Using LangChain Output Parsers to get what you want out of LLMs
# video_id = 'KUDn7bVyIfc' # Converting a LangChain App from OpenAI to OpenSource
# video_id = 'uzJX9Wkp0Qc' # The Secrets to Become a Better Software Engineer 6:46
# video_id = '6UFtRwWnHws' # Building a LangChain Custom Medical Agent with Memory 17:46
# video_id = 'uzJX9Wkp0Qc' # The Secrets to Become a Better Software Engineer 6:46
# video_id = 'qaPMdcCqtWk' # 5 Levels Of LLM Summarizing: Novice to Expert 19:18
# video_id = 'MPrJF3F4mHc' # Attack and Detect: VulnDC:2 vs Splunk & Security Onion 1:27:45
# video_id = 'EmNQuK-E0kI' # What is The Quantum Wave Function, Exactly? 13:04
# video_id = 'to7vCdkLi4s' # Reinforcement Learning with Augmented Data (Paper Explained) 22:14
# video_id = 'x8pW19wKfXQ' # RWKV: Reinventing RNNs for the Transformer Era (Paper Explained) 1:02:16
# video_id = 'ddG2fM9i4Kk' # OpenAssistant RELEASED! The world's best open-source Chat AI! 21:05
# video_id = '4Cclp6yPDuw' # Scaling Transformer to 1M tokens and beyond with RMT (Paper Explained) 24:33
# video_id = 'Eug2clsLtFs' # Understanding ReACT with LangChain 21:09
# video_id = 'ghzsBm8vOms' # What is Platform Engineering and how it fits into DevOps and Cloud world 42:41
# video_id = '87ZFvJ7_-n0' # How to build an OpenAPI Specification using YAML? 15:30
# video_id = 'mViFmjcDOoA' # OpenAPI 3.0 Tutorial| Swagger Tutorial For Beginners | Design REST API Using Swagger Editor 24:57
# video_id = 'z2aCZBAtWXs' # What can you do with 16K tokens in LangChain? 16:54
# video_id = 'DiLPn_ldAAQ' # Lecture 21 (update): SHA-3 Hash Function by Christof Paar 1:15:06
# video_id = 'aMckXIqqzeI' # How Diffie-Hellman Fails in Practice 1:13:19
# video_id = 'HoKDTa5jHvg' # Diffusion Models | Paper Explanation | Math Explained 33:26
# video_id = 'yfgfJAkzliw' # Jared Zoneraich - Future Of Prompt Engineering, Management, and Collaboration 24:50
# video_id = '9zEXov_L0os' # Hands-on Ransomware: Exploring Cybercrime 43:27
# video_id = 'dXxQ0LR-3Hg' # Chat with Multiple PDFs | Langchain app tutorial 1:07:29
# video_id = '4KXK6c6TVXQ' # OpenAI Functions + LangChain : Building a Multi Tool Agent 18:51
# video_id = 'I4n0Wj2PHQA' # OpenAI's Game Changing Updates. New Features! Bigger Savings! 12:16
# video_id = '0lOSvOoF2to' # OpenAI GPT-4 Function Calling: Unlimited Potential 23:48
# video_id = 'OdIHUdQ1-eQ' # PyPDF2 Crash Course - Working with PDFs in Python 52:19
# video_id = 'a8hMgIcUEnE' # Tagging and Extraction - Classification using OpenAI Functions 16:13
# video_id = 'Tkijsu129M0' # GPT-4 solves MIT Exam with 100% ACCURACY 31:04
# video_id = 'ut5kp56wW_4' # Tree of Thoughts: Deliberate Problem Solving with Large Language Models (Full Paper Review) 29:28
# video_id = 'FcUAbIQH_XY' # 数字秩序 14:58
# video_id = 'pEkxRQFNAs4' # Extract Topics From Video/Audio With LLMs 17:33
# video_id = '2lnW1PSB2_g' # How to write Tree of Thoughts Prompts 11:35
# video_id = '4P-hPldEUiE' # Stable Diffusion Automation: Turbocharge Your AI Image Generation 10:27
# video_id = 'g2BRIuln4uc' # Intuition Behind Self-Attention Mechanism in Transformer Networks 39:24
# video_id = 'ANszao6YQuM' # Stanford CS230: Deep Learning | Autumn 2018 | Lecture 4 - Adversarial Attacks / GANs 1:22:59
# video_id = 'cQYmePtLAT0' # Adversarial Attack and Defense on Deep Learning 3:17
# video_id = 'zk-E2NKFjk4' # Introduction to Adversarial Attack on Machine learning model 1:36:55

# video_id = '' # 
# video_id = '' # 
# video_id = '' # 
# video_id = '' # 
# video_id = '3iAaySrjZ4w' # The future of Bitcoin 33:10

# final_result = do_youtube_summary(video_id, num_highlights=7)
# toc = time.time()
# print(f"\nAltogether time consumed: {toc-tic:,.1f} seconds")

Num of words: 4,644
Num of tokens: 5,094
To summary text length: 4,644 (num of tokens: 5,094)
batch_summary model = gpt-3.5-turbo-16k-0613
Inside _get_completion_func: num_prompt_tokens = 5,219, max_tokens = 10,781, model = gpt-3.5-turbo-16k-0613, context_length = 16,000
Time consumed for final summarization: 14.4s
To highlight text length: 672 (num of tokens: 778)
Number of tokens for final highlight: 786 (words: 672). Highlight model: gpt-3.5-turbo-0613
Time consumed for final highlight: 7.3s
Final highlight:
	Title: 11 words (11 tokens)
	Short Synopsis: 115 words (127 tokens)
	Key Points: 103 words (130 tokens)
OpenAI usage for final highlight:
	prompt_tokens: 888
	completion_tokens: 296
	total_tokens: 1,184
Length of final highlight, in words: 243, in tokens: 289
Length of final summary, in words: 672, in tokens: 778
Length of final result, in words: 920, in tokens: 1,075



#### Title (11 words)
The Future of Bitcoin and the Economic War for Wealth Redistribution

#### Short Synop

## PDF

In [33]:
# if 'openai_usages' in globals():
#     del openai_usages

# tic = time.time()    
# pdf_resource, pdf_name = "https://arxiv.org/pdf/2210.03629.pdf", "REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS" # Page 33
# pdf_resource, pdf_name = "https://arxiv.org/pdf/2305.13860.pdf", "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study" # Page 12
# pdf_resource, pdf_name, startPage, endPage = "https://arxiv.org/pdf/2210.16886.pdf", "DiffusER: Discrete Diffusion via Edit-Based Reconstruction", 1, 10 # Page 13
# pdf_resource, pdf_name, startPage, endPage = "https://arxiv.org/pdf/2201.11903.pdf", "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 1, 19 # Page 43
# pdf_resource, pdf_name, startPage, endPage = "http://hanj.cs.illinois.edu/cs412/bk3/KL-divergence.pdf", "Kullback-Leibler Divergence", 1, 19 # Page 43
# pdf_resource, pdf_name, startPage, endPage = "https://arxiv.org/pdf/1706.03762.pdf", "Attention is all you need", 1, 10 # Page 15

# final_result = do_pdf_summary(pdf_resource, pdf_name=pdf_name, startPage=startPage, endPage=endPage, num_highlights=7)
# toc = time.time()
# print(f"\nAltogether time consumed: {toc-tic:,.1f} seconds")

In [34]:
# Re-print whatever `final_result` currently holds in this session (it is assigned by the test cells above)
print(final_result)

#### Title (7 words)
Adversarial Attacks and Defenses in Deep Learning

#### Short Synopsis (24 words)
This research focuses on the security concerns posed by adversarial attacks in deep learning models and explores various defense technologies to enhance their robustness.

#### Key Points (66 words)
- Adversarial attacks manipulate deep learning models to exhibit incorrect behavior
- Adversarial examples can have serious consequences in security-critical areas
- Research efforts have emerged to understand and mitigate adversarial attacks
- Theoretical basis, attack algorithms, and practical application cases are discussed
- Defense technologies include heuristic defenses, adversarial training, denoising, randomization, and certified defenses
- Current status, development trends, and unresolved challenges are analyzed

#### Detailed Summary (344 words)
The research on adversarial attacks and defenses in deep learning has been conducted by the Rehnquist team from Jaejoong University in 