In [None]:
import torch
import os
import json
## Load Config
# Both config files are JSON; each one is parsed into its own module-level name.
with open('config/videos.json') as videos_fp:
    videos = json.load(videos_fp)

with open('config/name_to_url.json') as name_to_url_fp:
    name_to_url = json.load(name_to_url_fp)


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hand unused cached GPU memory back before the weights are loaded.
torch.cuda.empty_cache()

# First CUDA device when one is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Tokenizer and model come from the same checkpoint.
checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Weights are requested in float16 and then moved onto `device`.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
model = model.to(device)

# Alternative checkpoint (disabled):
# tokenizer = AutoTokenizer.from_pretrained("TabbyML/StarCoder-7B")
# model = AutoModelForCausalLM.from_pretrained("TabbyML/StarCoder-7B")

In [None]:
## Single inference
name = "hashing"
input_name = "mlops_llm_eval"

# One-shot chat prompt:
#   user      -> prompt + first 5000 characters of the `name` transcript
#   assistant -> contents of data/input/{name}.txt
#   user      -> prompt + first 5000 characters of the `input_name` transcript
# All four files are opened together before anything is read.
with open(f'data/transcripts/processed/{name}.txt') as example_fp, \
        open('prompt.txt', 'r') as prompt_fp, \
        open(f'data/input/{name}.txt', 'r') as answer_fp, \
        open(f'data/transcripts/processed/{input_name}.txt', 'r') as target_fp:
    prompt = prompt_fp.read()
    example_transcript = example_fp.read()[:5000]
    example_answer = answer_fp.read()
    target_transcript = target_fp.read()[:5000]

messages = [
    {"role": "user", "content": f'{prompt}{example_transcript}'},
    {"role": "assistant", "content": example_answer},
    {"role": "user", "content": f'{prompt}{target_transcript}'},
]

# Tokenize the conversation with the model's chat template and move it to `device`.
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)



In [None]:
## All Inference
# Build one tokenized chat prompt per processed transcript and collect them in
# `inputs_dict`, keyed by transcript name (the file name up to its first '.').
with open('prompt.txt', 'r') as f_prompt:
    prompt = f_prompt.read()
with open('data/input/oneshot.txt') as f_os:
    oneshot = f_os.read()

# transcript name -> tokenized chat inputs (a mapping holding "input_ids")
inputs_dict = {}

transcripts_dir = 'data/transcripts/processed'

# Tokenize every transcript in data/transcripts/processed.
# sorted() gives a stable order; os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(transcripts_dir)):
    # Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories
    # are not transcripts and would fail to open or decode as text.
    if file.startswith('.') or not os.path.isfile(f'{transcripts_dir}/{file}'):
        continue

    with open(f'{transcripts_dir}/{file}', 'r') as f:
        transcript = f.read()

    name = file.split('.')[0]
    messages = [
        # Only the first 5000 characters of the transcript are used.
        {"role": "user", "content": f'{prompt}{transcript[:5000]}'},
        {"role": "assistant", "content": oneshot},
    ]
    # return_dict=True asks for a mapping (with "input_ids") instead of a bare
    # id tensor, which is what the ["input_ids"] lookup below and a later
    # model(**inputs) call need.
    # A separate name (`encoded`) is used so this loop does not overwrite the
    # `inputs` built for the single-inference prompt.
    encoded = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True
    ).to(device)
    inputs_dict[name] = encoded

# Report the token tensor shape for each prompt.
for name, encoded in inputs_dict.items():
    print(f'Name: {name}, Tokens: {encoded["input_ids"].shape}')

In [None]:
# Sample up to 1000 new tokens for the prompt currently held in `inputs`;
# gradients are not needed for generation.
with torch.no_grad():
    generated_ids = model.generate(
        inputs,
        do_sample=True,
        max_new_tokens=1000,
    )

# Convert the generated token ids back into text, one string per sequence.
decoded = tokenizer.batch_decode(generated_ids)


In [None]:
# Write the first decoded sequence to data/output, named after `input_name`.
output_path = f'data/output/{input_name}1.txt'
with open(output_path, 'w') as out_fp:
    out_fp.write(decoded[0])

In [None]:
### catan.txt
## Clean transcript, with new lines
## Char: 9388, Word: 1840, Tokens: 2483, Runtime: 3m 4.2s on Pytorch MPS, T/S (tokens per second) = 13.5

## Clean transcript, no new lines
## Char: 9388, Word: 1840, Tokens: 2174, Runtime: 2m 43.9s on Pytorch MPS, T/S = 13.25

## Clean transcript, charging
## Char: 9388, Word: 1840, Tokens: 2174, Runtime: 2m 32.8s on Pytorch MPS

#### On A10s
# 8,000-token limit for Mistral-7B

### catan.txt
## Tokens: 2174, Runtime: 48.8s, T/S = 44.55

### mixtral8x7b.txt
## Tokens: 16981, Runtime: 13m 8.4s, T/S = 21.59

### mlops_llm_eval.txt
## Tokens: 10993, Runtime: 7m 5.1s, T/S = 25.87

### typescript_fireship.txt
## Tokens: 1042, Runtime: 27.6s, T/S = 37.75

### localized_deployment.txt | Tokens: 892,
## A10 // Runtime: 21.6s, T/S = 41.3
## M1  // Runtime: TODO (not yet measured)


In [None]:
# Model outputs collected per transcript name; starts out empty.
output_dict = dict()

In [None]:
# Generate the output
# Run one forward pass per entry of `inputs_dict` and save the last hidden-state
# tensor to data/embeddings/<name>.pt. Names whose file already exists are skipped.
# name = "localized_deployment.txt"

# torch.save does not create missing directories; make sure the target exists
# before spending time on a forward pass.
os.makedirs('data/embeddings', exist_ok=True)

for name, inputs in inputs_dict.items():
    embedding_path = f'data/embeddings/{name}.pt'
    if os.path.exists(embedding_path):
        continue

    # Accept both forms the tokenizer can hand back: a bare id tensor (which
    # cannot be unpacked with **) or a mapping that already names its tensors.
    model_inputs = {"input_ids": inputs} if torch.is_tensor(inputs) else inputs

    with torch.no_grad():
        outputs = model(**model_inputs, output_hidden_states=True)

    # NOTE(review): the full output object (every layer's hidden states) stays
    # referenced here for each processed transcript, so memory grows with the
    # number of transcripts - confirm this is intended.
    # NOTE(review): names skipped above never get an entry in output_dict.
    output_dict[name] = outputs
    torch.save(outputs.hidden_states[-1], embedding_path)

In [None]:
# Pick the stored model output for one transcript; raises KeyError when no
# entry with that name has been added to output_dict.
transcript_key = "catan"
output = output_dict[transcript_key]