### Resources used

- [Reference Video](https://www.youtube.com/watch?v=u2diEa4VT4M&t=83s&ab_channel=AllAboutAI)
- [Run Llama 2 Locally with Python](https://swharden.com/blog/2023-07-29-ai-chat-locally-with-python/)
- [llama-cpp-python](https://pypi.org/project/llama-cpp-python/)
    - Tutorial 
        - https://www.datacamp.com/tutorial/llama-cpp-tutorial
- [Mistral-7B-Instruct-v0.1-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF)


## Model Installing 

In [None]:
# pip install llama-cpp-python

# version check 
# pip show llama-cpp-python

### Test Load Model

In [None]:
# # load the large language model file
# from llama_cpp import Llama
# LLM = Llama(model_path="model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")

# # create a text prompt
# prompt = "Q: What are the names of the days of the week?"

# # generate a response (takes several seconds)
# output = LLM(prompt)

# # display the response
# print(prompt)
# print(output["choices"][0]["text"])

## Train Model - Fine Tuning

[ShortCut Key](https://digitalhumanities.hkust.edu.hk/tutorials/jupyter-notebook-tips-and-shortcuts/)

<hr>

### Reference 
- [Guide to Fine-Tuning LLMs](https://www.datacamp.com/tutorial/fine-tuning-large-language-models)

### Read data 

In [None]:
# pip install trl 
# pip install peft
# pip install torch
# pip install datasets

In [None]:
# pip show torch

Error encountered when installing trl, peft & torch:

ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus
    we cannot accurately determine which files belong to it which would lead to
    only a partial uninstall.

[Pandas Official DataFrame Tutorial](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)
<br><br>
[W3School Pandas DataFrame](https://www.w3schools.com/python/pandas/pandas_dataframes.asp)

In [None]:
pip install pandas

In [None]:
from datasets import load_dataset
import pandas as pd

# dataset = load_dataset("mteb/tweet_sentiment_extraction")

# print(dataset)

In [None]:
# show data in table form
# NOTE: `dataset` must already exist here; the load_dataset(...) call in the
# cell above is commented out, so this raises NameError unless it was run.
df = pd.DataFrame(dataset['train'])
display(df)

In [None]:
# convert directly to a pandas DataFrame without building one by hand
pandas_format = dataset["train"].to_pandas()
display(pandas_format.size) # check size

<code>display()</code> renders a DataFrame directly as a formatted table (preferred) <br>
- <code>DataFrame.head()</code> returns the first 5 rows <br>
- <code>DataFrame.tail()</code> returns the last 5 rows <br>

<code>print()</code> only shows the data as plain text, without table formatting <br>

# Speech To Text 

In [None]:
# package installation
# pip install SpeechRecognition 

In [None]:
import speech_recognition as speech

from llama_cpp import Llama

In [None]:
# global variable - recognizer used both to listen on the microphone and to
# transcribe the captured audio
recognizer = speech.Recognizer()

# global variable - conversation log; each entry is a list that starts with
# the end-user question and then has the machine response appended to it
convo = []

# global variable - canned replies stored as the response when voice input
# could not be captured or transcribed
NOT_UNDERSTAND = "Sorry, I didn't understand that. I only can understanding english"
ERROR = "Unexpected Error Occurs. Message to Developer > "
NOT_CLEAR = "I'm sorry I didn't catch what you said. Could you repeat it, please"

In [None]:
# function to store conversation 
def inRange(index):
    """Return True when ``index`` is a valid non-negative position in ``convo``."""
    return 0 <= index < len(convo)

# setter 
def setQuestion(ques):
    """Start a new conversation entry that holds the question as a string."""
    entry = [str(ques)]
    convo.append(entry)
    
# function to store conversation 
def setResponse(res):
    """Append the response (as a string) to the most recent conversation entry."""
    latest = convo[-1]
    latest.append(str(res))

## ( int - index , string )
def setResponseWithNum(index, res):
    """Append ``res`` (as a string) to the conversation entry at ``index``.

    Args:
        index: position of the entry in ``convo``.
        res: response to store; converted with ``str``.

    Returns:
        True when the entry exists and was updated, False when ``index`` is
        out of range.
    """
    if not inRange(index):
        return False
    convo[index].append(str(res))
    return True

# getter 
def getLatestQuestion():
    """Return the question text of the most recent conversation entry."""
    latest = convo[-1]
    return latest[0]

def getLatestResponse():
    """Return the response text of the most recent conversation entry."""
    latest = convo[-1]
    return latest[1]

def getQuestionWithNum(index):
    """Return the question stored at ``index``, or None when out of range."""
    return convo[index][0] if inRange(index) else None

def getResponseWithNum(index):
    """Return the response stored at ``index``, or None when out of range."""
    return convo[index][1] if inRange(index) else None

# Example of using
# setQuestion("hello")
# setResponse("Hello how can i help u")
# [['hello', 'Hello how can i help u']]

In [None]:
# function to capture speech as input 
def capature_speech():
    """Listen on ``speech.Microphone()`` and return the captured audio.

    Returns:
        The audio object produced by ``recognizer.listen``.

    Raises:
        speech.WaitTimeoutError: propagated unchanged from
            ``recognizer.listen`` when listening times out (``timeout=3``).
            It is deliberately not caught and re-wrapped here, so the caller
            sees the original exception and traceback.
    """
    with speech.Microphone() as mic:
        print("listening")
        return recognizer.listen(mic, timeout=3)

# ******** Error to be handled ********
# WaitTimeoutError: listening timed out while waiting for phrase to start
# Meaning: nobody spoke while the recognizer was listening

## ⭐Error Catching List 

- [x] speech.WaitTimeoutError
- [x] speech.RequestError
- [x] speech.UnknownValueError

https://rollbar.com/blog/throwing-exceptions-in-python/

In [None]:
# converting speech to text
def convert_speechToText(audio):
    """Transcribe captured audio with ``recognizer.recognize_google``.

    Args:
        audio: audio object as returned by ``capature_speech``.

    Returns:
        The recognised text.

    Raises:
        speech.UnknownValueError: propagated unchanged when the audio cannot
            be understood (unknown language, no speech, or just sound).
        speech.RequestError: propagated unchanged when the recognition
            request itself fails.
    """
    # Exceptions are left to propagate as-is: re-raising a fresh instance
    # would only replace the original traceback with a nested one.
    return recognizer.recognize_google(audio)

In [None]:
# load the large language model file
def load_model(model_path="model/mistral-7b-instruct-v0.1.Q5_K_M.gguf"):
    """Create a ``Llama`` instance from a local model file.

    Args:
        model_path: path of the model file to load; defaults to the
            Mistral-7B-Instruct Q5_K_M file under ``model/``.

    Returns:
        The constructed ``Llama`` object.
    """
    return Llama(model_path=model_path)


# shared model instance used by generate_output()
LLM = load_model()

In [None]:
# only when the audio convert to text successful 
def generate_output(input):
    """Send ``input`` to the loaded model and return the first choice's text.

    The returned text has leading and trailing whitespace stripped.
    """
    # generating a response takes several seconds
    completion = LLM(input)
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()
    

In [None]:
def receive_inputVoice():
    """Capture a spoken question and store it in the conversation log.

    Returns:
        True when speech was captured, transcribed and stored as a new
        question. False when anything failed; in that case a placeholder
        question plus an explanatory response is stored instead, and the
        caller should not continue processing.
    """
    try:
        audio = capature_speech()
        spoken = convert_speechToText(audio)

        # store question
        setQuestion(spoken)
        return True
    except speech.WaitTimeoutError:
        # no sound while listening, so the capture timed out
        reply = NOT_CLEAR
    except speech.UnknownValueError:
        # the audio could not be converted to text
        reply = NOT_UNDERSTAND
    except speech.RequestError as err:
        reply = ERROR + format(err)
    except Exception as err:
        reply = format(err)

    # every failure path records the same placeholder question
    setQuestion("Problem occurs when receiving input")
    setResponse(reply)
    return False


# Business suggestions

- [The best post that I have ever seen](https://stackoverflow.com/questions/65199011/is-there-a-way-to-check-similarity-between-two-full-sentences-in-python)

In [None]:
# # Setup
# pip install spacy

# python -m spacy download en_core_web_md

## Check Similarity 

In [None]:
# global variable - check similarity
# sample restaurant-related keywords and sentences
# NOTE(review): neither list is referenced elsewhere in the visible code, and
# "lauch" looks like a typo (lunch? launch?) - confirm before relying on them
Res_keyword = ["restaurant", "suggest", "top restaurant", "few", "some", "lauch"]
Res_sentence = ["can you suggest a few restaurant", "which are the top restaurant in", "please help me to found a restaurant in Malaysia, that have high rating"]  


In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

def get_similarity(sent1, sent2):
    """Return the spaCy similarity score between two sentences.

    Both sentences are parsed with the module-level ``nlp`` pipeline. The
    score is computed once, printed, and then returned.
    """
    score = nlp(sent1).similarity(nlp(sent2))
    print(score)
    return score

def isSimilar(sent1, sent2):
    """Return True when the lower-cased sentences score above 0.70."""
    score = get_similarity(sent1.lower(), sent2.lower())
    return bool(score > 0.70)



# print(get_similarity('can you suggest a few restaurant in', 'please help me to found a restaurant in Malaysia, that have high rating'))

In [None]:
# rule-based replies for greetings and identity questions;
# each entry is [trigger phrase, canned response]
rule = [["Hello", "Hello ~ how can I help you"], 
       ["Who you are", "I am a Large Language Model - LLM create by Mistral 7b"]]

def checkRegard(input):
    """Answer ``input`` from the rule table when it resembles a trigger phrase.

    Every entry of ``rule`` is tried in order. The first trigger that
    ``isSimilar`` accepts has its canned response stored with
    ``setResponse``.

    Returns:
        True when a rule matched and a response was stored, False otherwise.
    """
    for trigger, reply in rule:
        if isSimilar(trigger, input):
            setResponse(reply)
            return True
    return False

## Convert convo into PDF

In [None]:
# pip install ironpdf

# pip show ironpdf

# !python -m pip uninstall ironpdf --yes

In [None]:
pip install fpdf

In [None]:
from fpdf import FPDF
import datetime as date

# save FPDF() class into a
# variable pdf
pdf = FPDF()

# Add a page
pdf.add_page()

# set style and size of font
# that you want in the pdf
pdf.set_font("Arial", size=15)

# Each convo entry is a list [question, response]. Index the entry itself:
# qna[0][0] / qna[0][1] would pick single characters of the question string.
for qna in convo:
    question = qna[0]
    # an entry may not have received a response yet
    response = qna[1] if len(qna) > 1 else ""
    pdf.cell(200, 10, txt="You : " + question, ln=1, align='L')
    pdf.cell(200, 10, txt="Robot : " + response, ln=1, align='L')

# Save the pdf under a timestamped name. The timestamp goes into its own
# variable so the `date` module alias is not overwritten (re-running the cell
# would otherwise fail), and the zero-padded format includes the minute so
# that different times cannot produce the same file name.
now = date.datetime.now()
pdf.output(f"convo_{now:%Y%m%d%H%M%S%f}.pdf")

# Text to Speech

In [None]:
pip install pyttsx3 # sound output 

https://hackernoon.com/an-essential-python-text-to-speech-tutorial-using-the-pyttsx3-library

In [None]:
# text-to-speech setting 
import pyttsx3 as tts

def machineInitSetting():
    """Configure the global ``machine`` engine: raise the volume, pick a voice.

    The volume is increased by 1.00 but capped at 1.0, the maximum of the
    0.0-1.0 range pyttsx3 documents for the ``volume`` property. The first
    entry of the engine's ``voices`` list is selected as the voice.
    """
    volume = machine.getProperty('volume')
    machine.setProperty('volume', min(volume + 1.00, 1.0))
    voices = machine.getProperty('voices')
    machine.setProperty('voice', voices[0].id)

def generate_sound(res):
    """Queue ``res`` on the global text-to-speech engine and run it."""
    engine = machine
    engine.say(res)
    engine.runAndWait()

# create the shared engine read by machineInitSetting() and generate_sound(),
# then apply the volume/voice settings to it
machine = tts.init()
machineInitSetting()

## OpenVoice Problem
- cannot install torch
- https://www.youtube.com/watch?v=1ec-jOlxt_E&ab_channel=WingnutLabs
- https://www.youtube.com/watch?v=dLNN36hU06M&ab_channel=MG
- https://blog.unrealspeech.com/openvoice-completed-guide/
- https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb
- https://github.com/myshell-ai/OpenVoice/issues/98

# Data Cleaning

In [None]:
from datasets import load_dataset
import pandas as pd
import csv



# Main

In [None]:
import json
from difflib import get_close_matches

def load_knowledge_base(file_path: str) -> dict:
    """Read the JSON knowledge base stored at ``file_path``.

    The file is decoded as UTF-8 explicitly, so non-ASCII questions and
    answers load the same way regardless of the platform's default encoding.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def save_knowledge_base(file_path: str, data: dict):
    """Write ``data`` to ``file_path`` as JSON indented by two spaces."""
    serialized = json.dumps(data, indent=2)
    with open(file_path, 'w') as out:
        out.write(serialized)


def find_best_match(user_question: str, questions: list[str]) -> str | None:
    # 60 % similiar res
    matches: list = get_close_matches(user_question, questions, n=1, cutoff=0.6)
    return matches[0] if matches else None



def get_answer_for_question(question: str, knowledge_base: dict) -> str | None:
    for q in knowledge_base["questions"]:
        if q["question"] == question:
            return q["answer"]

def chat_bot():
    """Run the interactive console chat loop until the user types 'quit'.

    Each input is first matched against the questions in
    ``knowledge_base.json``. A close match is answered from the knowledge
    base; anything else is answered by the language model. The answer is
    printed and spoken aloud either way.
    """
    knowledge_base = load_knowledge_base('knowledge_base.json')

    while True:
        user_input = input('You : ')
        if user_input.lower() == 'quit':
            break

        # search best match inside json file
        known_questions = [q['question'] for q in knowledge_base['questions']]
        best_match = find_best_match(user_input, known_questions)

        if best_match:
            answer = get_answer_for_question(best_match, knowledge_base)
        else:
            # no stored answer: let the model make a response
            answer = generate_output(user_input)

        print(f'Bot : {answer}')
        generate_sound(answer)

        # Disabled: teach the bot a new answer instead of asking the model.
        #     print('Bot : I dont know the answer')
        #     new_answer: str = input('Type the answer or "skip" to skip: ')
        #
        #     if new_answer.lower() != 'skip':
        #         knowledge_base['questions'].append({"question": user_input, "answer": new_answer})
        #         save_knowledge_base('knowledge_base.json', knowledge_base)
        #         print('Bot : Thank you! I learned a new response!')


if __name__ == '__main__':
    chat_bot()
        

In [None]:
import torch
