### Resources used

- [Reference Video](https://www.youtube.com/watch?v=u2diEa4VT4M&t=83s&ab_channel=AllAboutAI)
- [Run Llama 2 Locally with Python](https://swharden.com/blog/2023-07-29-ai-chat-locally-with-python/)
- [llama-cpp-python](https://pypi.org/project/llama-cpp-python/)
    - Tutorial 
        - https://www.datacamp.com/tutorial/llama-cpp-tutorial
- [Mistral-7B-Instruct-v0.1-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF)


## Model Installing 

In [None]:
# pip install llama-cpp-python

# version check 
# pip show llama-cpp-python

### Test Load Model

In [None]:
# Smoke test: load the local GGUF model once and ask it a single question.
# load the large language model file
from llama_cpp import Llama
LLM = Llama(model_path="model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")

# create a text prompt
prompt = "Q: What are the names of the days of the week?"

# generate a response (takes several seconds)
# Calling the Llama object with the prompt returns a dict-like completion result.
output = LLM(prompt)

# display the response
# The generated text is read from the first entry under "choices".
print(prompt)
print(output["choices"][0]["text"])

## Train Model - Fine Tuning

[ShortCut Key](https://digitalhumanities.hkust.edu.hk/tutorials/jupyter-notebook-tips-and-shortcuts/)

<br>
https://www.activestate.com/resources/quick-reads/how-to-access-a-row-in-a-dataframe/

<hr>

### Reference 
- [Guide to Fine-Tuning LLMs](https://www.datacamp.com/tutorial/fine-tuning-large-language-models)

# Clean Dataset

In [None]:
import pandas as pd
import numpy as np

# load model
from llama_cpp import Llama
LLM = Llama(model_path="model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")

# Open the raw Yelp business dump. lines=True makes pandas parse the file as
# one JSON record per line.
business_json_path = 'dataset/yelp_academic_dataset_business.json'
df_b = pd.read_json(business_json_path, lines=True)

# Convert JSON to CSV. index=False keeps the positional index out of the file,
# so the later read can use business_id as the index column.
csv_name = "dataset/business.csv"
df_b.to_csv(csv_name, index=False)

In [None]:
# load dataset
# Re-read the CSV written to csv_name, indexing the rows by business_id.
business = pd.read_csv(csv_name, index_col='business_id')

# Show the table and its (rows, columns) shape in the notebook output.
display(business)
display(business.shape)

In [None]:
# filter dataset - keep only businesses whose categories include restaurants or food and whose is_open == 1
# categories = pd.read_csv('business.csv', index_col='business_id')

# check this business categories have food / restaurant 
def isRestaurant(categories: str):
    """Return True when the category text mentions restaurants or food.

    The match is a case-insensitive substring test for 'restaurants' or
    'food'.

    Args:
        categories: Category text for one business. Any non-string value
            (for example None or NaN) is treated as "no categories".

    Returns:
        True if either keyword occurs in the text, otherwise False.
    """
    # isinstance also accepts str subclasses, unlike an exact type() comparison.
    if not isinstance(categories, str):
        return False
    lowered = categories.lower()
    return 'restaurants' in lowered or 'food' in lowered

def isOpen(open: int):
    """Report whether the open flag equals 1.

    Args:
        open: Flag value for one business.

    Returns:
        True when the flag compares equal to 1, otherwise False.
    """
    return True if open == 1 else False

row, col = business.shape

# Positional indices of the rows to drop: every business that is not a
# restaurant/food business or is not open. The two columns are walked in
# lockstep instead of looking each row up with iloc.
index: list[int] = [
    i
    for i, (categories, is_open) in enumerate(
        zip(business['categories'], business['is_open'])
    )
    if not (isRestaurant(categories) and isOpen(is_open))
]

# Translate the positions into index labels and drop those rows in place.
business.drop(business.index[index], axis=0, inplace=True)

In [None]:
display(business)

In [None]:
# export clean data
# This writes to the same path the unfiltered CSV was written to earlier in the
# notebook, so the filtered table replaces that file.
business.to_csv("dataset/business.csv")
print(business.shape)

# Prepare Training Dataset 

https://discuss.huggingface.co/t/from-pandas-dataframe-to-huggingface-dataset/9322/4

In [None]:
from transformers import GPT2Tokenizer
from datasets import load_dataset

# Reference example: tokenize a public dataset with the GPT-2 tokenizer.
# Loading the dataset to train our model
dataset = load_dataset("mteb/tweet_sentiment_extraction")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so padding="max_length" can be applied.
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the "text" field of a batch, padding to max length and truncating.
def tokenize_function(examples):
   return tokenizer(examples["text"], padding="max_length", truncation=True)

print(dataset)

# batched=True hands tokenize_function a batch of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

print(tokenized_datasets)
print(tokenized_datasets['train'])
display(tokenized_datasets['train'])

# View the tokenized training split as a DataFrame.
display(pd.DataFrame.from_dict(tokenized_datasets['train']))

In [None]:
## -- above is reference

In [None]:
# split dataset into train and test
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
import math

dataframe = pd.read_csv("dataset/business.csv", index_col='business_id')

# The first 70% of the rows (rounded up) become training data; the rest is test data.
row, col = dataframe.shape
train_range = math.ceil(row * 0.7)

tdf, vdf = dataframe.iloc[:train_range], dataframe.iloc[train_range:]
tds = Dataset.from_pandas(tdf)
vds = Dataset.from_pandas(vdf)

# Bundle both splits under the 'train' / 'test' keys.
dataset = DatasetDict()
dataset.update({'train': tds, 'test': vds})

# print(ds)

In [None]:
# torch could not be used to train our own model here, so GPT-2 is used as a stand-in for the training steps
from transformers import GPT2Tokenizer
from datasets import load_dataset

# Tokenize the business train/test splits with the GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so padding="max_length" can be applied.
tokenizer.pad_token = tokenizer.eos_token
# Redefines tokenize_function from the reference cell; this version reads the
# "business_id" field of each batch.
# NOTE(review): tokenizing the id column rather than a text column looks
# unintended for fine-tuning — confirm which column should be used.
def tokenize_function(examples):
   return tokenizer(examples["business_id"], padding="max_length", truncation=True)

# batched=True hands tokenize_function a batch of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

print(tokenized_datasets)
print(tokenized_datasets['train'])
display(tokenized_datasets['train'])

# View the tokenized training split as a DataFrame.
display(pd.DataFrame.from_dict(tokenized_datasets['train']))

In [None]:
# Take a shuffled sample of at most 1000 rows from each split. The size is
# capped at the split length because selecting indices beyond the end of a
# split fails when the split holds fewer than 1000 rows.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(
    range(min(1000, len(tokenized_datasets["train"])))
)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(
    range(min(1000, len(tokenized_datasets["test"])))
)

In [None]:
from transformers import GPT2ForSequenceClassification

# GPT-2 with a sequence-classification head producing 3 labels.
# NOTE(review): num_labels=3 matches the reference tweet-sentiment example —
# confirm it is the right label count for the business dataset.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)

# GPT-2 ships without a padding token id. The classification head needs one to
# find the last real token of each padded row when the batch size is above 1,
# so reuse the end-of-sequence id (the tokenizer cells do the same for pad_token).
model.config.pad_token_id = model.config.eos_token_id

In [None]:
pip install evaluate

In [None]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level `metric`.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of metric.compute for the arg-max class of each row of
        logits against the reference labels.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: one epoch, batches of 16 for training and evaluation,
# and a checkpoint saved at the end of each epoch under "test_trainer".
# NOTE(review): push_to_hub=True presumably requires being logged in to the
# Hugging Face Hub — confirm before running.
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True
)

# Wire the model, the sampled train/eval subsets, the tokenizer and the
# accuracy callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start fine-tuning.
trainer.train()

In [None]:
pip install transformers[torch]

In [None]:
import evaluate

trainer.evaluate()

In [None]:
pip install ctransformers

In [None]:
## Do not run this cell

from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# Note: this cell requests the Q4_K_M model file, while the other cells in this
# notebook load the local Q5_K_M file.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GGUF", model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf", model_type="mistral", gpu_layers=50)

# Quick generation check with a short prompt.
print(llm("AI is going to"))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# The device to load the model onto: prefer the GPU, but fall back to the CPU
# so the cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): both loaders are pointed directly at a .gguf file path —
# confirm that the installed transformers version can load a model and
# tokenizer this way.
model = AutoModelForCausalLM.from_pretrained("model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")
tokenizer = AutoTokenizer.from_pretrained("model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")

# Conversation written with explicit <s> / [INST] ... [/INST] markers; the last
# turn is left open for the model to answer.
text = """<s>[INST] What is your favourite condiment? [/INST]
Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s>
[INST] Do you have mayonnaise recipes? [/INST]"""

# add_special_tokens=False: the markers are already written out in the text.
encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)

# Move both the inputs and the model to the chosen device.
model_inputs = encodeds.to(device)
model.to(device)

# Sample up to 1000 new tokens and print the first decoded sequence.
generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [None]:
pip show torch

### Read data 

Error encountered when installing trl, peft & torch:

ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus
    we cannot accurately determine which files belong to it which would lead to
    only a partial uninstall.

[Pandas Official DataFrame Tutorial](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)
<br><br>
[W3School Pandas DataFrame](https://www.w3schools.com/python/pandas/pandas_dataframes.asp)

# Speech To Text 

In [None]:
import speech_recognition as speech

from llama_cpp import Llama

In [None]:
# global variable - capture speech
# Shared recognizer used both to record audio and to transcribe it.
recognizer = speech.Recognizer()

# global variable - conversation [end user question][machine response]
# Each entry is a list that starts with the question; responses are appended to it.
convo = []

# global variable - common sentences
# Canned replies stored as the response when capturing or transcribing speech fails.
NOT_UNDERSTAND = "Sorry, I didn't understand that. I only can understanding english"
ERROR = "Unexpected Error Occurs. Message to Developer > "
NOT_CLEAR = "I'm sorry I didn't catch what you said. Could you repeat it, please"

In [None]:
# bounds check for positions in the conversation list
def inRange(index):
    """Return True when `index` is a valid non-negative position in `convo`."""
    return 0 <= index < len(convo)

# setter 
def setQuestion(ques):
    """Start a new conversation entry holding `ques` converted to a string."""
    entry = [str(ques)]
    convo.append(entry)
    
# store the response for the most recent question
def setResponse(res):
    """Append `res` (as a string) to the latest conversation entry.

    Raises IndexError when no question has been stored yet.
    """
    latest = convo[-1]
    latest.append(str(res))

## ( int - index , string )
def setResponseWithNum(index, res):
    """Append a response to the conversation entry at `index`.

    Args:
        index: Position of the entry in `convo`.
        res: Response to store; it is converted with str().

    Returns:
        True when `index` is in range and the response was appended,
        False otherwise.
    """
    if inRange(index):
        convo[index].append(str(res))
        return True
    return False

# getter 
def getLatestQuestion():
    """Return the question of the most recent conversation entry.

    Raises IndexError when the conversation is empty.
    """
    latest = convo[-1]
    return latest[0]

def getLatestResponse():
    """Return the first response of the most recent conversation entry.

    Raises IndexError when the conversation is empty or the latest entry has
    no response yet.
    """
    latest = convo[-1]
    return latest[1]

def getQuestionWithNum(index):
    """Return the question stored at `index`, or None when `index` is out of range."""
    if not inRange(index):
        # out of index
        return None
    return convo[index][0]

def getResponseWithNum(index):
    """Return the first response stored at `index`, or None when `index` is out of range.

    Raises IndexError when the entry exists but has no response yet.
    """
    if not inRange(index):
        # out of index
        return None
    return convo[index][1]

# Example of using
# setQuestion("hello")
# setResponse("Hello how can i help u")
# [['hello', 'Hello how can i help u']]

In [None]:
# function to capture speech as input 
def capature_speech():
    """Listen on a `speech.Microphone()` source and return the recorded audio.

    Returns:
        The audio object produced by `recognizer.listen`.

    Raises:
        speech.WaitTimeoutError: Propagated unchanged from `recognizer.listen`
            (called with timeout=3). The caller is expected to handle it.
    """
    # No try/except here: catching the timeout only to raise a new exception of
    # the same type added nothing except a noisier traceback.
    with speech.Microphone() as mic:
        print("listening")
        return recognizer.listen(mic, timeout=3)

# ******** Error still to be handled ********
# WaitTimeoutError: listening timed out while waiting for phrase to start
# Meaning: nobody spoke while the microphone was listening

In [None]:
# converting speech to text
def convert_speechToText(audio):
    """Transcribe recorded audio with `recognizer.recognize_google`.

    Args:
        audio: Audio object to transcribe (as returned by `capature_speech`).

    Returns:
        The transcription returned by `recognizer.recognize_google`.

    Raises:
        speech.UnknownValueError: Propagated unchanged from the recognizer
            (unknown language / no speech / sound).
        speech.RequestError: Propagated unchanged from the recognizer.
    """
    # Both exceptions were previously caught and re-raised as new instances of
    # the same type; letting them propagate keeps the original traceback.
    return recognizer.recognize_google(audio)

In [None]:
# load the large language model file
def load_model():
    """Create and return a Llama instance for the local Mistral GGUF file."""
    model_file = "model/mistral-7b-instruct-v0.1.Q5_K_M.gguf"
    return Llama(model_path=model_file)

LLM = load_model()

In [None]:
# only when the audio convert to text successful 
def generate_output(input):
    """Ask the notebook-level `LLM` for a completion of `input`.

    Generation takes several seconds.

    Args:
        input: Prompt text passed to the model.

    Returns:
        The text of the first choice, with surrounding whitespace stripped.
    """
    first_choice = LLM(input)["choices"][0]
    return first_choice["text"].strip()
    

In [None]:
def receive_inputVoice():
    """Capture one spoken question and store it in the conversation.

    On success the transcribed text is stored with setQuestion. On any failure
    a placeholder question is stored together with a response that explains
    the problem.

    Returns:
        True when a question was captured and stored; False when capture or
        transcription failed, in which case no further processing should happen.
    """
    try:
        # record, transcribe, then store the question
        setQuestion(convert_speechToText(capature_speech()))
    except speech.WaitTimeoutError:
        # no sound while listening, so the capture timed out
        failure_reply = NOT_CLEAR
    except speech.UnknownValueError:
        # the audio could not be converted to text
        failure_reply = NOT_UNDERSTAND
    except speech.RequestError as e:
        failure_reply = ERROR + format(e)
    except Exception as e:
        failure_reply = format(e)
    else:
        return True

    setQuestion("Problem occurs when receiving input")
    setResponse(failure_reply)
    return False


# Business suggest

- [The best post that I have ever seen](https://stackoverflow.com/questions/65199011/is-there-a-way-to-check-similarity-between-two-full-sentences-in-python)

https://www.geeksforgeeks.org/python-word-similarity-using-spacy/

https://huggingface.co/docs/transformers/model_doc/bert

In [None]:
# # Setup
# pip install spacy

# python -m spacy download en_core_web_md

## Check Similarity 

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

def get_similarity(sent1, sent2):
    """Return the similarity score of two sentences.

    Both sentences are processed with the notebook-level `nlp` pipeline and
    compared with `Doc.similarity`. The score is also printed.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        The similarity score of the two processed sentences.
    """
    # Compute the score once and reuse it for both the print and the return value.
    score = nlp(sent1).similarity(nlp(sent2))
    print(score)
    return score

def isSimilar(sent1, sent2):
    """Return True when the lower-cased sentences score above 0.70 in get_similarity."""
    score = get_similarity(sent1.lower(), sent2.lower())
    return bool(score > 0.70)

# print(get_similarity('can you suggest a few restaurant in', 'please help me to found a restaurant in Malaysia, that have high rating'))

In [None]:
# rule based for Hi and Hello

def checkRegard(input):
    """Answer a greeting from the first two entries of `rule`.

    Each of `rule[0]` and `rule[1]` is read as a (pattern, reply) pair. The
    reply of the first pattern that isSimilar() accepts is stored with
    setResponse.

    NOTE(review): `rule` is not defined in the visible part of this notebook —
    confirm where it is created.

    Args:
        input: Text to compare against the patterns.

    Returns:
        True when a reply was stored, False when neither pattern matched
        (previously this case returned None implicitly).
    """
    for position in (0, 1):
        if isSimilar(rule[position][0], input):
            setResponse(rule[position][1])
            return True
    return False

## Convert convo into PDF

In [None]:
# pip install fpdf

In [None]:
from fpdf import FPDF
import datetime as date

# Build a PDF transcript of the conversation stored in `convo`.
pdf = FPDF()

# Add a page
pdf.add_page()

# set style and size of font
# that you want in the pdf
pdf.set_font("Arial", size = 15)

for qna in convo:
    # Each entry is [question, response, ...]. Index the entry itself: indexing
    # qna[0] again would pick single characters out of the question.
    question = qna[0]
    # An entry may not have a response yet; print an empty reply in that case.
    response = qna[1] if len(qna) > 1 else ""
    pdf.cell(200, 10, txt = "You : " + question,
         ln = 1, align = 'L')
    pdf.cell(200, 10, txt = "Robot : " + response,
         ln = 1, align = 'L')

# Save the pdf under a timestamped name. The timestamp goes in its own variable
# so the `date` module alias is not overwritten (re-running the cell keeps
# working), and strftime zero-pads every field and includes the minutes.
timestamp = date.datetime.now().strftime("%Y%m%d%H%M%S%f")
pdf.output(f"convo_{timestamp}.pdf")

# Text to Speech

In [None]:
pip install pyttsx3 # sound output 

https://hackernoon.com/an-essential-python-text-to-speech-tutorial-using-the-pyttsx3-library

In [None]:
# text-to-speech setting 
import pyttsx3 as tts

def machineInitSetting():
    """Configure the notebook-level text-to-speech engine `machine`.

    Sets the volume to its maximum and selects the first voice in the
    engine's voice list.
    """
    # pyttsx3 documents volume as a float from 0.0 to 1.0, so "current + 1.00"
    # was always above the valid range; set the maximum explicitly instead.
    machine.setProperty('volume', 1.0)
    voices = machine.getProperty('voices')
    machine.setProperty('voice', voices[0].id)

def generate_sound(res):
    """Speak `res` through the notebook-level text-to-speech engine `machine`."""
    # Queue the text, then run the engine until it has finished.
    machine.say(res)
    machine.runAndWait()

machine = tts.init()
machineInitSetting()

## OpenVoice Problem
- cannot install torch
- https://www.youtube.com/watch?v=1ec-jOlxt_E&ab_channel=WingnutLabs
- https://www.youtube.com/watch?v=dLNN36hU06M&ab_channel=MG
- https://blog.unrealspeech.com/openvoice-completed-guide/
- https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb
- https://github.com/myshell-ai/OpenVoice/issues/98

# Data Cleaning

https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88 

### Convert json to csv
https://www.squash.io/how-to-convert-json-to-csv-in-python/

https://www.w3schools.com/python/pandas/pandas_json.asp

https://www.analyticsvidhya.com/blog/2021/06/data-cleaning-using-pandas/

https://towardsdatascience.com/yelp-restaurant-recommendation-system-capstone-project-264fe7a7dea1

# Main

https://youtu.be/CkkjXTER2KE

In [None]:
import json
from difflib import get_close_matches

def load_knowledge_base(file_path: str) -> dict:
    """Load the knowledge base from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so non-ASCII questions and answers load the same
    # way on every platform, regardless of the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data: dict = json.load(file)
    return data


def save_knowledge_base(file_path: str, data: dict):
    """Write `data` to `file_path` as JSON indented by two spaces.

    Any existing file at that path is overwritten.
    """
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=2)


def find_best_match(user_question: str, questions: list[str]) -> str | None:
    # 60 % similiar res
    matches: list = get_close_matches(user_question, questions, n=1, cutoff=0.6)
    return matches[0] if matches else None



def get_answer_for_question(question: str, knowledge_base: dict) -> str | None:
    for q in knowledge_base["questions"]:
        if q["question"] == question:
            return q["answer"]

def chat_bot():
    """Run the interactive chat loop until the user types 'quit'.

    Each input is first matched against the questions in
    'knowledge_base.json'. A sufficiently close match is answered from the
    knowledge base; otherwise the language model generates the answer.
    Every answer is printed and spoken aloud.
    """
    knowledge_base: dict = load_knowledge_base('knowledge_base.json')

    while True:
        user_input: str = input('You : ')
        if user_input.lower() == 'quit':
            return

        # search best match inside json file
        known_questions = [entry['question'] for entry in knowledge_base['questions']]
        best_match: str | None = find_best_match(user_input, known_questions)

        if best_match:
            answer = get_answer_for_question(best_match, knowledge_base)
        else:
            # let machine to make a response
            answer = generate_output(user_input)

        print(f'Bot : {answer}')
        generate_sound(answer)
                
if __name__ == '__main__':
    chat_bot()
        

## PyTorch

https://www.youtube.com/watch?v=wCuJncQsXxI&ab_channel=Anotherbarefooteel

In [None]:
import torch

## Keyword extract

https://stackoverflow.com/questions/27405942/best-way-to-extract-keywords-from-input-nlp-sentence

https://stackoverflow.com/questions/3788870/how-to-check-if-a-word-is-an-english-word-with-python

### get Location
https://stackoverflow.com/questions/56655312/retrieving-full-address-and-geocoding-based-on-place-store-name-and-city-stored

https://stackoverflow.com/questions/49518172/how-to-find-place-name-inside-a-sentence-using-nlp-and-python

## Fine Tuning

https://www.datacamp.com/tutorial/fine-tuning-large-language-models

https://huggingface.co/docs/datasets/en/loading

https://www.datacamp.com/tutorial/mistral-7b-tutorial

#### PyAudio
https://xn--llions-yua.jutge.org/upc-python-cookbook/signal-processing/audio-image.html

# Algorithm global setting

In [None]:
ques = "Happy Birthday"

## Algorithm 1 - GPT2

https://huggingface.co/docs/transformers/en/model_doc/gpt2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_res_from_gpt2(ques: str):
    """Generate a sampled GPT-2 response for `ques`.

    The "gpt2" model and tokenizer are loaded on every call.

    Args:
        ques: Prompt text.

    Returns:
        The decoded text of the first generated sequence.
    """
    model_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")

    # Use the tokenizer loaded above. The notebook-level `tokenizer` may be
    # undefined or belong to a different model, depending on which cells ran.
    input_ids = tokenizer_gpt2(ques, return_tensors="pt").input_ids

    # Sample with temperature 0.9, with a maximum length of 100.
    gpt2_tokens = model_gpt2.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=100,
    )

    gpt2_res = tokenizer_gpt2.batch_decode(gpt2_tokens)[0]

    return gpt2_res

# print(type(gpt2_res))
# print(ques)
# print(gpt2_res)

## Algorithm 2 - Mistral 7b

In [None]:
# load the large language model file
from llama_cpp import Llama

def get_res_from_mistral(ques: str):
    """Generate a response for `ques` with the local Mistral GGUF model.

    The model file is loaded on every call, and generation takes several
    seconds.

    Args:
        ques: Prompt text.

    Returns:
        The text of the first choice in the model output.
    """
    mistral = Llama(model_path="model/mistral-7b-instruct-v0.1.Q5_K_M.gguf")
    first_choice = mistral(ques)["choices"][0]
    return first_choice["text"]

# display the response
# print(ques)
# print(output["choices"][0]["text"])

## Algorithm 3 - DialoGPT

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def get_res_from_dialo(ques: str):
    """Generate a single-turn DialoGPT reply to `ques`.

    The "microsoft/DialoGPT-medium" tokenizer and model are loaded on every
    call. No chat history is kept between calls.

    Args:
        ques: The user's message.

    Returns:
        The decoded reply, without the prompt tokens or special tokens.
    """
    dialo_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    dialo_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # the user message followed by the end-of-sequence token, as a PyTorch tensor
    prompt_ids = dialo_tokenizer.encode(ques + dialo_tokenizer.eos_token, return_tensors='pt')

    # generate with the total length limited to 1000 tokens
    generated_ids = dialo_model.generate(
        prompt_ids,
        max_length=1000,
        pad_token_id=dialo_tokenizer.eos_token_id,
    )

    # keep only the tokens that come after the prompt
    prompt_length = prompt_ids.shape[-1]
    reply_ids = generated_ids[:, prompt_length:][0]
    reply = dialo_tokenizer.decode(reply_ids, skip_special_tokens=True)
    return format(reply)

# print(chat_history_ids)
# print(chat_history_ids[:, bot_input_ids.shape[-1]:][0])
# print(bot_input_ids.shape[-1])
# print(bot_input_ids.shape)


# print(get_res_from_dialo(ques))

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
# model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# # Let's chat for 5 lines
# for step in range(5):
#     # encode the new user input, add the eos_token and return a tensor in Pytorch
#     new_user_input_ids = tokenizer.encode(input(">> User : ") + tokenizer.eos_token, return_tensors='pt')

#     # append the new user input tokens to the chat history
#     bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

#     # generated a response while limiting the total chat history to 1000 tokens, 
#     chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

#     # pretty print last ouput tokens from bot
#     print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))