In [1]:
import torch
import os
import time
import json
## Load Config
with open('config/videos.json') as config_file:
    videos = json.load(config_file)
with open('config/name_to_url.json') as config_file:
    name_to_url = json.load(config_file)


In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.cuda.empty_cache()
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)

# tokenizer = AutoTokenizer.from_pretrained("TabbyML/StarCoder-7B")
# model = AutoModelForCausalLM.from_pretrained("TabbyML/StarCoder-7B")

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Variable Setup
shots = ["mixtral8x7b", "full-stack"]
torch.cuda.empty_cache()

# folder_no = len(os.listdir(f'data/outputs')) + 1
folder_no = 19
print(f"Folder Number: {folder_no}")
os.makedirs(f'data/outputs/{folder_no}/text', exist_ok=True)
os.makedirs(f'data/outputs/{folder_no}/embeddings', exist_ok=True)
os.makedirs(f'data/outputs/{folder_no}/results', exist_ok=True)

transcripts_dir = "data/transcripts/processed"
prompt_path = "data/prompts/prompt.txt"
oneshots_dir = "data/oneshots"
outputs_dir = f'data/outputs/{folder_no}'

prompt = open(prompt_path, "r").read()
# Write some metadata

with open(f'data/outputs/{folder_no}/metadata.txt', 'a') as f:
    if os.stat(f'data/outputs/{folder_no}/metadata.txt').st_size == 0:
        f.write(f"#####################\n###### METADATA ######\n#####################\n\n")
        f.write(f"Model: mistralai/Mistral-7B-Instruct-v0.1\n")
        f.write(f"Time: {time.time()}\n")
        f.write(f"Videos: {videos}\n")
        f.write(f"Shots: {shots}\n")
        f.write(f"Prompt: {prompt}\n")

Folder Number: 19


In [4]:
f_abridged_transcripts = open(f'data/outputs/{folder_no}/abridged_transcripts.txt', 'a')

def abridge_transcript(transcript: str, chars: int) -> str:
    """ Abridge a transcript to a certain number of characters
    Args:
        transcript (str): The transcript to abridge
        chars (int): The number of characters to abridge to
    Returns:
        str: The abridged transcript
    """
    if len(transcript) <= chars:
        return transcript
    
    output_transcript = ""
    ## Use 20% of chars for the start
    output_transcript += transcript[:int(chars*0.2)]

    # evenly divide the characters in the middle of the transcript and extract in 6 10% chunks
    middle = transcript[int(chars*0.2):-int(chars*0.2)]
    transcript_chunk_size = len(middle) // 6
    abridged_chunk_size = int(chars * 0.1)
    for i in range(6):
        target_chunk_index = transcript_chunk_size * i + (transcript_chunk_size // 2)
        output_transcript += middle[target_chunk_index:target_chunk_index+abridged_chunk_size]

    ## Use 20% of chars for the end
    output_transcript += transcript[-int(chars*0.2):]
    f_abridged_transcripts.write(f"{output_transcript}\n\n")
    return output_transcript



def build_message(shots: list, target: str, shots_char_limit: int, input_char_limit: int) -> list:
    """ Build a message for the model to generate from.
    Args:
        shots (list): List of shots to include in the message
        target (str): The target video to generate the message for
        start (int): The start of the transcript to include
        end (int): The end of the transcript to include
    Returns:
        list: A list of messages to send to the model
    """
    
    messages = []

    f_prompt = open(prompt_path, 'r', encoding='utf-8')
    f_target_transcript = open(f'{transcripts_dir}/{target}.txt', 'r', encoding='utf-8')
    
    prompt = f_prompt.read()
    for shot in shots:
        f_oneshot_transcript = open(f'{transcripts_dir}/{shot}.txt', 'r', encoding='utf-8')
        f_oneshot = open(f'{oneshots_dir}/{shot}.txt', 'r', encoding='utf-8')
        
        f_abridged_transcripts.write(f"#####################\nShot: {shot}\n#####################\n")
        messages.append({"role": "user", "content": f'<Transcript>{abridge_transcript(f_oneshot_transcript.read(), shots_char_limit)}<Instruction>{prompt}'})
        messages.append({"role": "assistant", "content": f_oneshot.read()})

        f_oneshot.close()
        f_oneshot_transcript.close()

    ## Append final message
    f_abridged_transcripts.write(f"#####################\nTarget: {target}\n#####################\n")
    messages.append({"role": "user", "content": f'<Transcript>{abridge_transcript(f_target_transcript.read(), input_char_limit)}<Instruction>{prompt}'})

    f_prompt.close()
    f_target_transcript.close()
    return messages


In [5]:
## All Inference
metadata = open(f'{outputs_dir}/metadata.txt', 'a')
metadata.write(f"\n\n###################\n###### INFERENCE ######\n###################\n")
f_message = open(f'{outputs_dir}/messages.txt', 'w', encoding='utf-8')
temperature = 0.5
top_k = 35
top_p = 0.95

metadata.write(f"Temperature: {temperature}\nTop_k: {top_k}\nTop_p: {top_p}\n")
metadata.close()
metadata = open(f'{outputs_dir}/metadata.txt', 'a')

## open all files in data/transcripts/processed. For each file, create a message using as many shots as given above. 
global_start = time.time()
for file in os.listdir(transcripts_dir):
    target = file.split('.')[0]
    if target == "catan": continue
    metadata.write(f"Processing: {target}\n")
    with open(f'{transcripts_dir}/{target}.txt', 'r', encoding='utf-8') as f_target_transcript:
        messages = build_message(shots, target, 8000, 14500)
        f_message.write(f'{target}: {messages}\n\n')
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)    
    metadata.write(f"    Tokens: {len(inputs[0])}\n")
    print(f"    Tokens: {len(inputs[0])}")
    ## An idea for later, do not include the oneshot prompt if we are only directly using transcript embedding
    # start_token = len(inputs[0]) 

    with torch.no_grad():
        start_time = time.time()
        text = model.generate(inputs, max_new_tokens=3500, do_sample=True, pad_token_id=tokenizer.eos_token_id, temperature=temperature, top_k=top_k, top_p=top_p)
        decoded = tokenizer.batch_decode(text)
        print(f'Decoding finished: {target} in {round(time.time() - start_time, 3)} seconds')   
        metadata.write(f"    Decoding took: {round(time.time() - start_time, 3)} seconds\n")
    ## obtain all tokens after the second "[/INST]" and remove the </s> token. Write this as our output.
    with open(f'{outputs_dir}/text/{target}.json', 'w', encoding='utf-8') as f:
        f.write(decoded[0].split('[/INST]')[-1][1:-4])

    ## Free up memory
    del text
    del decoded
    torch.cuda.empty_cache()
    print(f'cuda memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB', f'cuda memory cached: {torch.cuda.memory_reserved()/1024**3:.2f} GB')
    metadata.write(f'    cuda memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB, cuda memory cached: {torch.cuda.memory_reserved()/1024**3:.2f} GB\n')

metadata.write(f'Global runtime: {round(time.time() - global_start, 3)} seconds')
metadata.close()
f_message.close()

f_abridged_transcripts.close()

    Tokens: 10695
Decoding finished: dpo in 42.646 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 10689
Decoding finished: full-stack in 50.917 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 10736
Decoding finished: hashing in 43.608 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 8325
Decoding finished: localized_deployment in 60.502 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 10611
Decoding finished: mixtral8x7b in 50.147 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 10741
Decoding finished: mlops_llm_eval in 53.41 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 8210
Decoding finished: react in 69.219 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
    Tokens: 8112
Decoding finished: rust in 63.083 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.

In [6]:
## Often behavior is seen where extra characters are added in random positions. Use this function to attempt to clean them up. 
def recursive_loads(text, depth):
    if depth == 10:
        return json.loads(text)
    try:
        return json.loads(text)
    except Exception as e:
        # find "line", "column" and "char" in the error message
        error = str(e)
        line = int(error.split("line ")[1].split(" ")[0])
        column = int(error.split("column ")[1].split(" ")[0])
        char = int(error.split("char ")[1].split(" ")[0][:-1])
        print(f"line: {line}, column: {column}, char: {char}")
        
        ## delete the character at the position
        text = text[:char] + text[char+1:]
        return recursive_loads(text, depth+1)

# file_name = "sql_backend.json"
# with open(f'data/outputs/{folder_no}/text/{file_name}', 'r', encoding='utf-8') as f:
#     text = f.read()
#     print(recursive_loads(text, 0))

In [7]:
## Eval outputs as correct json with all fields

f_metadata = open(f'{outputs_dir}/metadata.txt', 'a')
f_metadata.write(f'\n\n#####################\n##### Validating outputs ######\n#####################\n')

for file_name in os.listdir(f'{outputs_dir}/text'):
    file = open(f'{outputs_dir}/text/{file_name}', 'r', encoding='utf-8')
    file_text = file.read()
    file.close()
    try:
        output_json = recursive_loads(file_text, 0)
        errors = []
        if "introduction" not in output_json:
            errors.append("No introduction field")
        if "sections" not in output_json:
            errors.append("No sections field")
        for section in output_json["sections"]:
            if "title" not in section:
                errors.append("No title field in section")
            if "content" not in section:
                errors.append("No content field in section")
            if "topics" not in section:
                errors.append("No topics field in section")
        if "topics" not in output_json:
            errors.append("No final topics field")
        if "general topics" not in output_json:
            errors.append("No general topics field")
        for topic in output_json["general topics"]:
            if "name" not in topic:
                errors.append("No name field in general topic")
            if "complexity" not in topic:
                errors.append("No complexity field in general topic")
            if int(topic["complexity"]) > 1 or int(topic["complexity"]) < 0:
                errors.append("Complexity is not an integer")
        
    except Exception as e:
        print(f'## BAD ERROR. Error in file {file_name}: {e}')

        f_metadata.write(f'Error in file {file_name}: Type: {type(e)} Error: {e}\n')
    
    format_errors = '\n'.join(errors)
    f_metadata.write(f'File {file_name} errors: \n{format_errors}\n')

f_metadata.close()
    
    

## BAD ERROR. Error in file dpo.json: invalid literal for int() with base 10: '0.80'
## BAD ERROR. Error in file mixtral8x7b.json: invalid literal for int() with base 10: '0.70'
## BAD ERROR. Error in file mlops_llm_eval.json: invalid literal for int() with base 10: '1.10'
## BAD ERROR. Error in file sql_backend.json: invalid literal for int() with base 10: '0.35'
## BAD ERROR. Error in file threads_connections.json: invalid literal for int() with base 10: '4.00'
line: 30, column: 15, char: 7974
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 30, column: 16, char: 7975
line: 38, column: 15, char: 11901
## BAD ERROR. Error in file react_svelte.json: Expecting ',' delimiter: line 38 column 16 (char 11902)


In [8]:
# Create embeddings from outputs

def createEmbeddings(outputs_dir: str):
    """ Create embeddings from the outputs of the model
    Args:
        outputs_dir (str): The directory containing the outputs
    """
    
    metadata = open(f'{outputs_dir}/metadata.txt', 'a')

    for file in os.listdir(f'{outputs_dir}/text'):
        print(f'cuda memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB', f'cuda memory cached: {torch.cuda.memory_cached()/1024**3:.2f} GB')

        target = file.split('.')[0]
        start_time = time.time()
        with open(f'{outputs_dir}/text/{target}.json', 'r', encoding='utf-8') as f:
            inputs = tokenizer(f.read(), return_tensors="pt").to(device)     
        with torch.no_grad():
            hidden = model(**inputs, output_hidden_states=True)
        print(f'Hidden states finished: {target} in {round(time.time() - start_time, 3)} seconds')
        metadata.write(f"Target: {target}, Hidden states took: {round(time.time() - start_time, 3)} seconds\n")

        ## Write hidden states
        ## Cut off all tokens before the first start token to remove oneshot and prompt.
        tensor_t = hidden.hidden_states[-1].transpose(1,2)
        # max_pool_t = torch.nn.functional.max_pool1d(tensor_t, tensor_t.shape[2])
        # avg_pool_t = torch.nn.functional.avg_pool1d(tensor_t, tensor_t.shape[2])
        max_pool = torch.nn.functional.max_pool1d(tensor_t, tensor_t.shape[2]).transpose(1, 2).squeeze()
        avg_pool = torch.nn.functional.avg_pool1d(tensor_t, tensor_t.shape[2]).transpose(1, 2).squeeze()
        print(max_pool.shape)
        torch.save(max_pool, f'{outputs_dir}/embeddings/max_{target}.pt')
        torch.save(avg_pool, f'{outputs_dir}/embeddings/avg_{target}.pt')

        print(f'Target: {target} finished. Wrote to file.')
        metadata.write(f'    Target: {target} tensor finished. Wrote to file.\n')

        del inputs
        del hidden
        del tensor_t
        del max_pool
        del avg_pool
        print(f'cuda memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB', f'cuda memory cached: {torch.cuda.memory_cached()/1024**3:.2f} GB')
        torch.cuda.empty_cache()

    metadata.close()

In [9]:
## Run to obtain embeddings
createEmbeddings(f'data/outputs/{folder_no}')


## LETS CREATE A TON
# for str_num in os.listdir('data/outputs'):
#     num = int(str_num)
#     if num < 11: continue # only use recent 
    
#     os.makedirs(f'data/outputs/{num}/embeddings', exist_ok=True)

#     # if embeddings dir is empty
#     if len(os.listdir(f'data/outputs/{num}/embeddings')) == 0:
#         createEmbeddings(f'data/outputs/{num}')

cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB




Hidden states finished: dpo in 0.146 seconds
torch.Size([4096])
Target: dpo finished. Wrote to file.
cuda memory allocated: 14.00 GB cuda memory cached: 14.52 GB
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
Hidden states finished: full-stack in 0.177 seconds
torch.Size([4096])
Target: full-stack finished. Wrote to file.
cuda memory allocated: 14.00 GB cuda memory cached: 14.58 GB
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
Hidden states finished: hashing in 0.147 seconds
torch.Size([4096])
Target: hashing finished. Wrote to file.
cuda memory allocated: 14.00 GB cuda memory cached: 14.52 GB
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
Hidden states finished: localized_deployment in 0.232 seconds
torch.Size([4096])
Target: localized_deployment finished. Wrote to file.
cuda memory allocated: 14.00 GB cuda memory cached: 14.70 GB
cuda memory allocated: 14.00 GB cuda memory cached: 14.40 GB
Hidden states finished: mixtral8x7b in 0.186 seconds

In [16]:
### catan.txt
## Clean transcript, with new lines
## Char: 9388, Word: 1840, Tokens: 2483, Runtime: 3m 4.2s on Pytorch MPS, T/S = 13.5

## Clean transcript, no new lines
## Char: 9388, Word: 1840, Tokens: 2174, Runtime: 2m 43.9s on Pytorch MPS, T/S = 13.25

## Clean transcript, charging
## Char: 9388, Word: 1840, Tokens: 2174, Runtime: 2m 32.8s on Pytorch MPS

#### On A10s
# 8,000-token limit for Mistral-7B

### catan.txt
## Tokens: 2174, Runtime: 48.8s, T/S = 44.55

### mixtral8x7b.txt
## Tokens: 16981, Runtime: 13m 8.4s, T/S = 21.59

### mlops_llm_eval.txt
## Tokens: 10993, Runtime: 7m 5.1s, T/S = 25.87

### typescript_fireship.txt
## Tokens: 1042, Runtime: 27.6s, T/S = 37.75

### localized_deployment.txt | Tokens: 892,
## A10 // Runtime: 21.6s, T/S = 41.3
## M1  // Runtime:
