## Notebook for inference of user queries 

### Run all the cells (Requires atleast 18 GB of GPU Memory), paste your question in the query variable and select the kind of question you are interested to ask. (Generic OR Code based)   

In [2]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login


from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)

from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

from transformers import AutoTokenizer, AutoModelForCausalLM

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Loading the generic model (finetuned on Nvidia tech blogs)

In [3]:
# PEFT_MODEL_gen = "llama2_qat_r32_a32_gen3"
PEFT_MODEL_gen = "FALCON7B_r32_a64_gen_tot"

config_gen = PeftConfig.from_pretrained(PEFT_MODEL_gen)

bnb_config = BitsAndBytesConfig(
    load_in_8bit = True
)


model_gen = AutoModelForCausalLM.from_pretrained(
    config_gen.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    # device_map=device_map,
    device_map="auto",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    # use_auth_token = True
)

tokenizer_gen = AutoTokenizer.from_pretrained(config_gen.base_model_name_or_path)
tokenizer_gen.pad_token = tokenizer_gen.eos_token

model_gen = PeftModel.from_pretrained(model_gen, PEFT_MODEL_gen)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Setting up the generation parameters 

In [4]:
generation_config_gen = model_gen.generation_config
generation_config_gen.max_new_tokens = 100
generation_config_gen.temperature = 0.005
generation_config_gen.top_p = 1
generation_config_gen.num_return_sequences = 1
generation_config_gen.pad_token_id = tokenizer_gen.eos_token_id
generation_config_gen.eos_token_id = tokenizer_gen.eos_token_id
generation_config_gen.repetition_penalty = 2.5

## Loading the code model (Fine-tuned on stackover flow's queries about Nvidia SDKs)

In [5]:
PEFT_MODEL_code = "pup_gorilla_model"

config_code = PeftConfig.from_pretrained(PEFT_MODEL_code)


bnb_config = BitsAndBytesConfig(
    load_in_8bit = True
)

model_code = AutoModelForCausalLM.from_pretrained(
    config_code.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    # device_map=device_map,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    # use_auth_token = True
)

tokenizer_code = AutoTokenizer.from_pretrained(config_code.base_model_name_or_path)
tokenizer_code.pad_token = tokenizer_code.eos_token

model_code = PeftModel.from_pretrained(model_code, PEFT_MODEL_code)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Setting up the generation parameters 

In [6]:
generation_config_code = model_code.generation_config
generation_config_code.max_new_tokens = 100
generation_config_code.temperature = 0.005
generation_config_code.top_p = 1
generation_config_code.num_return_sequences = 1
generation_config_code.pad_token_id = tokenizer_code.eos_token_id
generation_config_code.eos_token_id = tokenizer_code.eos_token_id
generation_config_code.repetition_penalty = 2.5

In [7]:
%%time
device = "cuda:0"


# question = '''How can I install the
# NVIDIA CUDA Toolkit on
# Windows?'''

question = '''What is the NVIDIA CUDA Toolkit?'''

ans_true = '''The CUDA Toolkit from NVIDIA provides everything
you need to develop GPU-accelerated applications.
The CUDA Toolkit includes GPU-accelerated libraries,
a
compiler, development tools and the CUDA runtime.'''


# question = '''What are the key features of the NVIDIA TensorRT?'''
# ans_true = '''NVIDIA TensorRT, an SDK for high-performance
# deep learning inference, includes a deep learning
# inference optimizer and runtime that delivers low
# latency and high throughput for inference
# applications.Some key features of TensorRT include:
# - NVIDIA TensorRT-based applications perform up
# to 36X faster than CPU-only platforms during
# inference, enabling you to optimize neural network
# models
# - TensorRT, built on the NVIDIA CUDA®
# parallel programming model, enables you to optimize
# inference using techniques such as quantization, layer
# and tensor fusion, kernel tuning, and others on
# NVIDIA GPUs.'''


question = '''What is the difference between
NVIDIA's BioMegatron and Megatron
530B LLM?'''

ans_true = '''BioMegatron focuses specifically on biomedical NLP
tasks and has been trained on relevant biomedical data,
Megatron 530B LLM is a more general-purpose
language model trained on a wide variety of text from
different domains. The choice between the two models
depends on the specific requirements and domain of your
NLP task.'''


prompt = f"""
Answer the following question about Nvidea's product and services in detail: \n{question}
\nAnswer:-
""".strip()



# prompt = """
# Answer the following question about Nvidea's product and services: What is NVIDEA's Omniverse? What is the purpose of it? Explain breifly \n\n
# \n\nAnswer:-:
# """.strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)

NameError: name 'tokenizer' is not defined

In [8]:
def get_reponse(query, query_type):
    
    device = "cuda:0"
    
    prompt = f"""
        Answer the following question about Nvidia's product and services in detail: \n{query}
        \nAnswer:-
        """.strip()
    
    if query_type == "code":
        
        print("Code")

        
        encoding = tokenizer_code(prompt, return_tensors="pt").to(device)
        
        with torch.inference_mode():
            outputs = model_code.generate(
              input_ids = encoding.input_ids,
              attention_mask = encoding.attention_mask,
              generation_config = generation_config_code
          )
            
        response = tokenizer_code.decode(outputs[0], skip_special_tokens=True)
        
        
        return response
        


    elif query_type == "generic":
        
        print("Generic")
        
        encoding = tokenizer_gen(prompt, return_tensors="pt").to(device)
        
        with torch.inference_mode():
            outputs = model_gen.generate(
              input_ids = encoding.input_ids,
              attention_mask = encoding.attention_mask,
              generation_config = generation_config_gen
          )
            
        response = tokenizer_gen.decode(outputs[0], skip_special_tokens=True)
        
        
        return response
        
        
        
    else:
        print('''Please choose query_type to be one of these :- {"code", "generic"}''')
        return 
            
            
                    

In [12]:
generation_config_gen = model_gen.generation_config
generation_config_gen.max_new_tokens = 100
generation_config_gen.temperature = 0.005
generation_config_gen.top_p = 1
generation_config_gen.num_return_sequences = 1
generation_config_gen.pad_token_id = tokenizer_gen.eos_token_id
generation_config_gen.eos_token_id = tokenizer_gen.eos_token_id
generation_config_gen.repetition_penalty = 2.5


# op = get_reponse(query = ''' What is Nvidia RAPIDS? what is it used for? '''
            
#             , query_type = "generic")


query = "How to install NVIDIA on vmware?"

op = get_reponse(query = query
            
            , query_type = "generic")



Generic


In [10]:
print(op)

Answer the following question about Nvidia's product and services in detail: 
How to install NVIDIA on vmware?
        
Answer:-: To install NVIDIA drivers for Linux guests, you must first enable virtualization support in your VMware Workstation or Fusion settings. Then download and run the installer package from nvidia.com/drivers. After installation, reboot your virtual machine. Once it restarts, log into GNOME Shell as root with sudo su -c sh -e '/usr/bin/nvdia_installer'. This will launch a graphical user interface that guides you through installing CUDA Toolkit components. You can also use this method to update existing installations of CUDA software packages.


In [11]:
%%time

query = "What is the NVIDIA CUDA Toolkit?"

op = get_reponse(query = query
            
            , query_type = "generic")

print(op)

Generic
Answer the following question about Nvidia's product and services in detail: 
What is the NVIDIA CUDA Toolkit?
        
Answer:-: The NVIDIA CUDA Toolkit provides a unified, efficient programming environment for developing GPU-accelerated applications. It includes libraries, compilers, sample code, documentation, tools, drivers, performance tuning guides, profiling utilities, tutorials, support forums, and more.
CPU times: user 5.48 s, sys: 2.93 ms, total: 5.48 s
Wall time: 5.48 s


In [13]:
%%time

query = "What are the key features of the NVIDIA TensorRT?"

op = get_reponse(query = query
            
            , query_type = "generic")

print(op)

Generic
Answer the following question about Nvidia's product and services in detail: 
What are the key features of the NVIDIA TensorRT?
        
Answer:-: The NVIDIA® TensorRT™ inference engine is a high-performance, production ready library that accelerates deep learning inferencing on GPUs. It provides optimized implementations for popular neural networks such as ResNet50V2, MobileNetsv2/3, Inception v4, XGBoost, DensePose, etc., with support for multiple backends including CUDA, OpenCL, DirectX Compute Shaders, Intel SGDR, Microsoft CUDNN, Arm NNAPI,
CPU times: user 10.3 s, sys: 2.66 ms, total: 10.3 s
Wall time: 10.3 s


In [14]:
%%time

query = "What is the nvidia-smi command used for?"

op = get_reponse(query = query
            , query_type = "generic")

print(op)

Generic
Answer the following question about Nvidia's product and services in detail: 
What is the nvidia-smi command used for?
        
Answer:-: The nvidia-smi command displays information on GPUs, memory usage, fan speeds, power consumption, temperatures, clocks, errors, events, etc. It can be run from a terminal or as an interactive session via SSH to view real-time statistics.
CPU times: user 5.52 s, sys: 1.19 ms, total: 5.53 s
Wall time: 5.53 s


In [15]:
%%time

query = "What is the nvidia-smi command used for?"

op = get_reponse(query = query
            , query_type = "code")

print(op)

Code
Answer the following question about Nvidia's product and services in detail: 
What is the nvidia-smi command used for?
        
Answer:-: The NVIDIA System Management Interface, or nvidia-smi command, provides information on GPU utilization, temperature, power consumption, memory usage, fan speed, clock rate, core voltage, PCIe lane allocation, CUDA device availability, installed driver version, display output configuration, supported OpenCL platforms, and more. It can also be used to monitor real-time performance metrics of a running application with an interactive interface that displays graphs based on collected data.
CPU times: user 9.49 s, sys: 9.78 ms, total: 9.5 s
Wall time: 9.5 s
