In [None]:
pip install gradio torch jupyter ipywidgets transformers ollama

In [None]:
import ollama
# Pull the 'mistral' model through the ollama client so the chat cells below,
# which all request model='mistral', have it available.
ollama.pull('mistral')

In [None]:
from ollama import Client
# Send one user question through an explicit client bound to the local
# Ollama endpoint and print the text of the reply.
client = Client(host='http://localhost:11434')
question = {'role': 'user', 'content': 'Why is the sky blue?'}
response = client.chat(model='mistral', messages=[question])
answer = response['message']['content']
print(answer)

In [None]:
import ollama

# Ask the same question with stream=True so the reply arrives in pieces.
messages = [{'role': 'user', 'content': 'Why is the sky blue?'}]
stream = ollama.chat(model='mistral', messages=messages, stream=True)

# Print each piece as it arrives: no newline between pieces, and flush so the
# text shows up immediately instead of waiting in the output buffer.
for piece in stream:
    fragment = piece['message']['content']
    print(fragment, end='', flush=True)

In [None]:
# Ask the ollama client for its model listing; as the last expression of the
# cell, the result is rendered as the cell output.
ollama.list()

In [None]:
# Inspect the same model that was pulled and used above. The notebook pulls
# 'mistral' (no explicit tag), so asking for the separate 'mistral:7b' tag
# would fail unless that tag had been pulled as well.
ollama.show('mistral')

In [None]:
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread

def mistralChat(text):
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", torch_dtype=torch.bfloat16)
    model = model.to('cuda')
    inputs = tokenizer(text, return_tensors='pt').to(model.device)
    
    input_length = inputs.input_ids.shape[1]
    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7, 
                            return_dict_in_generate=True,do_sample=True)
    
    tokens = outputs.sequences[0, input_length:]
    return tokenizer.decode(tokens)


In [None]:
# Build a one-tab UI: a question box, an "Ask" button and a read-only answer box.
with gr.Blocks() as server:
    with gr.Tab("LLM Inferencing"):
        model_input = gr.Textbox(
            label="Your Question:",
            value="What’s your question?",
            interactive=True,
        )
        ask_button = gr.Button("Ask")
        model_output = gr.Textbox(
            label="The Answer:",
            value="Answer goes here...",
            interactive=False,
        )

    # On click, pass the question text to mistralChat and show its return
    # value in the answer box.
    ask_button.click(fn=mistralChat, inputs=[model_input], outputs=[model_output])

# Start serving the app.
server.launch(share=True)