### install dependencies

In terminal run:
- python -m venv venv
-  .\venv\Scripts\activate 
- pip install -r requirements.txt
- pip install -U openai-whisper
- pip install llama-index-embeddings-huggingface
-  pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
- pip install llama-index-llms-llama-cpp
- add ffmpeg to scripts (the ffmpeg executable must be reachable on PATH, e.g. by placing it in venv\Scripts)

In [2]:
import ffmpeg
import os
import whisper
from docx import Document
import llama_index
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import docx
from docx import Document
import json


  from .autonotebook import tqdm as notebook_tqdm


## Audio Extraction and Transcription

In [6]:
# Load the Whisper "small" checkpoint once for the whole notebook;
# transcribe_audio() below reads this module-level `model`.
model = whisper.load_model("small")

def extract_audio(video_path, output_folder):
    """
    Extracts audio from a video file and saves it as an MP3 file in the specified output folder.

    The MP3 keeps the base name of the video (``clip.mp4`` -> ``clip.mp3``). The output
    folder is created if it does not exist yet, and an existing MP3 with the same name
    is overwritten so that the cell can be re-run.

    Args:
    video_path (str): Path to the input video file.
    output_folder (str): Path to the folder where the output audio file will be saved.

    Returns:
    str: Path of the MP3 file that was written.
    """
    # exist_ok=True replaces the separate exists() check and is safe when the folder is already there.
    os.makedirs(output_folder, exist_ok=True)

    video_stem = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = os.path.join(output_folder, video_stem + '.mp3')

    # overwrite_output=True adds ffmpeg's -y flag; without it a second run
    # stops on the already existing output file.
    (
        ffmpeg
        .input(video_path)
        .output(audio_path)
        .run(overwrite_output=True)
    )
    print(f"Audio extraction complete. The file has been saved to: {audio_path}")
    return audio_path

def transcribe_audio(audio_path, output_folder):
    """
    Run Whisper on an audio file and store the recognised text in a Word document.

    The document is named after the audio file (with a ``.docx`` extension) and is
    written into ``output_folder``; the folder is created when it is missing.

    Args:
    audio_path (str): Path to the audio file to transcribe.
    output_folder (str): Path to the folder where the output docx file will be saved.

    Returns:
    str: File name of the saved document (name only, without ``output_folder``).
    """
    # Uses the module-level Whisper `model`; transcription happens before any folder is created.
    transcription = model.transcribe(audio_path)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    stem, _ = os.path.splitext(os.path.basename(audio_path))
    docx_filename = f"{stem}.docx"
    target_path = os.path.join(output_folder, docx_filename)

    # The whole transcript goes into a single paragraph.
    transcript_doc = Document()
    transcript_doc.add_paragraph(transcription["text"])
    transcript_doc.save(target_path)

    print(f"Transcription saved to {target_path}")

    return docx_filename



In [7]:
# Generate an audio transcription from an input video.
# Path of the video to process
video_path = r"Videos/A7_Continual_Improvement_Video_Tast_Cakir_Bellmann.mp4"
# Folder in which the extracted audio is saved
output_path = r"audio"
audio_path = extract_audio(video_path, output_path)
# Folder in which the transcripts are saved
transcripts_folder = "transcripts"
# transcribe_audio() returns only the .docx file name, not the full path
transcript_filename = transcribe_audio(audio_path, transcripts_folder)

Audio extraction complete. The file has been saved to: audio\A7_Continual_Improvement_Video_Tast_Cakir_Bellmann.mp3




Transcription saved to transcripts\A7_Continual_Improvement_Video_Tast_Cakir_Bellmann.docx


## Video Content Analysis using a RAG System with LLMs

In [9]:
# convert prompt format for german_leo_mistral model
def messages_to_prompt(messages):
    """
    Convert a sequence of chat messages into a single prompt string of the form
    ``<system text> USER: <text> ASSISTANT: <text> ... ASSISTANT:``.

    The original version returned a plain (non-f) string, so the literal text
    ``{message.content}`` was sent to the model and ``messages`` was ignored.

    Args:
    messages: Iterable of objects with ``role`` and ``content`` attributes. ``role``
        may be a plain string or an enum whose ``value`` is the role name.

    Returns:
    str: The prompt, ending with ``ASSISTANT:`` so the model continues as the assistant.
    """
    system_text = "Du bist ein hilfreicher Assistent."
    turns = []
    for message in messages:
        role = getattr(message, "role", "user")
        # Accept both enum roles (use .value) and plain strings.
        role_name = str(getattr(role, "value", role)).lower()
        content = message.content or ""
        if role_name == "system":
            # A non-empty system message replaces the default system text.
            if content:
                system_text = content
        elif role_name == "assistant":
            turns.append(f"ASSISTANT: {content}")
        else:
            turns.append(f"USER: {content}")
    prompt = " ".join([system_text, *turns, "ASSISTANT:"])
    return prompt

def completion_to_prompt(completion):
    """
    Wrap a plain completion request in the same prompt format used for chat messages.

    The original version returned ``" ASSISTANT: {completion}"``, which presents the
    caller's text as the assistant's turn and drops the system/USER parts of the format.

    Args:
    completion (str): The raw prompt text to send to the model.

    Returns:
    str: ``Du bist ein hilfreicher Assistent. USER: <completion> ASSISTANT:``
    """
    return f"Du bist ein hilfreicher Assistent. USER: {completion} ASSISTANT:"

# Download the model from Hugging Face into the folder .\models using this link: https://huggingface.co/TheBloke/em_german_leo_mistral-GGUF/blob/main/em_german_leo_mistral.Q4_K_M.gguf
model_path = "models/em_german_leo_mistral.Q4_K_M.gguf"
# Embedding model passed to the vector store index further down.
# NOTE(review): the model name suggests an English embedding model ("-en-") while the
# LLM system prompt is German — confirm it suits the language of the transcripts.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Initialize the LLM from the local GGUF file.
llm = LlamaCPP(
    # No download URL is given; the model is loaded from model_path instead.
    model_url=None,
    # Path to the pre-downloaded model file
    model_path=model_path,
    temperature=0.1,
    max_new_tokens=1000,
    # NOTE(review): 3900 was chosen as headroom below a 4096-token window; the loader
    # output for this model reports llama.context_length = 32768 — confirm whether
    # this limit is still intended.
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    #model_kwargs={"n_gpu_layers": 1},
    # Convert chat messages into the prompt format defined by messages_to_prompt().
    messages_to_prompt=messages_to_prompt,
    # completion_to_prompt is currently not passed to the LLM:
    #completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Initialize the global tokenizer with the `encode` function of a Hugging Face tokenizer.
# NOTE(review): the tokenizer is loaded from a Llama-2 chat repository while the LLM is
# em_german_leo_mistral — confirm the two tokenizers agree closely enough.
set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

# Define the path to the video transcript.
# transcribe_audio() returns only the file name, so it is joined with the folder that
# was passed to it (transcripts_folder) instead of repeating the folder name as a
# literal, which would silently point to the wrong place if the folder is changed.
document_path = os.path.join(transcripts_folder, transcript_filename)

# Load the transcript as llama_index documents.
documents = SimpleDirectoryReader(
    input_files=[document_path]
).load_data()

# Create the vector store index, embedding the documents with embed_model.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Set up the query engine that answers prompts with the local LLM.
query_engine = index.as_query_engine(llm=llm)


# Read the prompts. The loops below expect the shape {rubric: {criterion: prompt}}.
# The encoding is set explicitly so non-ASCII characters (e.g. umlauts) are read
# correctly regardless of the platform's default encoding.
with open('prompts.json', 'r', encoding='utf-8') as file:
    prompts = json.load(file)

# Create a new document that collects the generated feedback.
doc = Document()

# Iterate through rubrics and prompts and generate a response for each criterion.
# The loop variable has its own name so that `prompts` is not rebound while it is
# being iterated and still holds the full mapping afterwards.
for rubric_key, rubric_prompts in prompts.items():
    responses = {}
    for criterion, prompt in rubric_prompts.items():
        responses[criterion] = [query_engine.query(prompt)]

    doc.add_heading(rubric_key, level=1)

    # Write the responses of this rubric into the docx document.
    for category, response_list in responses.items():
        doc.add_heading(category, level=2)
        for response in response_list:
            doc.add_paragraph(str(response))
            doc.add_paragraph()

# Save the document. The folder is created first so that saving cannot fail on a
# missing directory after all queries have already been run.
feedback_folder = "output_feedback"
os.makedirs(feedback_folder, exist_ok=True)
document_path = os.path.join(feedback_folder, transcript_filename)
doc.save(document_path)
print("feedback generation is done")

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/em_german_leo_mistral.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = jphme_em_german_leo_mistral
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attenti

feedback generation is done
