In [3]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

## 👷 The LLM Engineer

This section of the course focuses on learning how to build LLM-powered applications that can be used in production, with a focus on augmenting models and deploying them.

![](img/roadmap_engineer_1.png)


### 1. Running LLMs

Running LLMs can be difficult due to high hardware requirements. Depending on your use case, you might want to simply consume a model through an API (like GPT-4) or run it locally. In any case, additional prompting and guidance techniques can improve and constrain the output for your applications.

* **LLM APIs**: APIs are a convenient way to deploy LLMs. This space is divided between private LLMs ([OpenAI](https://platform.openai.com/), [Google](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview), [Anthropic](https://docs.anthropic.com/claude/reference/getting-started-with-the-api), [Cohere](https://docs.cohere.com/docs), etc.) and open-source LLMs ([OpenRouter](https://openrouter.ai/), [Hugging Face](https://huggingface.co/inference-api), [Together AI](https://www.together.ai/), etc.).
* **Open-source LLMs**: The [Hugging Face Hub](https://huggingface.co/models) is a great place to find LLMs. You can directly run some of them in [Hugging Face Spaces](https://huggingface.co/spaces), or download and run them locally in apps like [LM Studio](https://lmstudio.ai/) or through the CLI with [llama.cpp](https://github.com/ggerganov/llama.cpp) or [Ollama](https://ollama.ai/).
* **Prompt engineering**: Common techniques include zero-shot prompting, few-shot prompting, chain of thought, and ReAct. They work better with bigger models, but can be adapted to smaller ones.
* **Structuring outputs**: Many tasks require a structured output, like a strict template or a JSON format. Libraries like [LMQL](https://lmql.ai/), [Outlines](https://github.com/outlines-dev/outlines), [Guidance](https://github.com/guidance-ai/guidance), etc. can be used to guide the generation and respect a given structure.

📚 **References**:
* [Run an LLM locally with LM Studio](https://www.kdnuggets.com/run-an-llm-locally-with-lm-studio) by Nisha Arya: Short guide on how to use LM Studio.
* [Prompt engineering guide](https://www.promptingguide.ai/) by DAIR.AI: Exhaustive list of prompt techniques with examples
* [Outlines - Quickstart](https://outlines-dev.github.io/outlines/quickstart/): List of guided generation techniques enabled by Outlines. 
* [LMQL - Overview](https://lmql.ai/docs/language/overview.html): Introduction to the LMQL language.

In [1]:
! pip install --upgrade openai



### 1.a Closed source LLMs

In [None]:
from openai import OpenAI
client = OpenAI()

In [None]:
## Chat completion
completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": "You are a poetic assistant."},
    {"role": "user", "content": "Hi"}
  ]
)

In [9]:
print(completion)

ChatCompletion(id='chatcmpl-9obgTySJPQGoJAElPYXJDgLGwaztN', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello! How can I assist you today? Are you in the mood for a poem, or perhaps you'd like to discuss something poetic?", role='assistant', function_call=None, tool_calls=None))], created=1721848285, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_661538dc1f', usage=CompletionUsage(completion_tokens=27, prompt_tokens=18, total_tokens=45))


In [6]:

print(completion.choices[0].message.content)

Hello! How can I assist you today? Are you in the mood for a poem, or perhaps you'd like to discuss something poetic?


In [15]:
## Image generation
## Sizes: ['256x256', '512x512', '1024x1024', '1024x1792', '1792x1024']
response = client.images.generate(
  model="dall-e-2",
  prompt="a white siamese cat",
  size="256x256", 
  quality="standard",
  n=1,
)

In [18]:
response

ImagesResponse(created=1721848691, data=[Image(b64_json=None, revised_prompt=None, url='https://oaidalleapiprodscus.blob.core.windows.net/private/org-0G6iQop9n38SsnzhljNmVVAQ/user-ttKaozNNfzCNxRyYKmUKEBGx/img-rpO8GsvUd58LbFbePoZBiDZ0.png?st=2024-07-24T18%3A18%3A11Z&se=2024-07-24T20%3A18%3A11Z&sp=r&sv=2023-11-03&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-07-23T23%3A09%3A10Z&ske=2024-07-24T23%3A09%3A10Z&sks=b&skv=2023-11-03&sig=wlNATw8STzCZuDDw4Nz3zLgf5ZIlykntZdHJzBoOI98%3D')])

In [19]:
image_url = response.data[0].url

In [20]:
from IPython.display import Image
Image(url=image_url)

In [21]:
## Embeddings
response = client.embeddings.create(
    input="Random text",
    model="text-embedding-3-small"
)

In [22]:
print(response.data[0].embedding)

[-0.01595834456384182, -0.003978000022470951, 0.008427180349826813, 0.003636200912296772, -0.04282336309552193, -0.06281377375125885, -0.007874894887208939, 0.012065311893820763, -0.013038570061326027, -0.0061562443152070045, -0.00676646176725626, 0.011818135157227516, -0.020762842148542404, 0.03781803324818611, 0.028394421562552452, -0.018553700298070908, -0.03200938180088997, -0.006654459983110428, -0.05051673576235771, 0.03367782384157181, -0.01550261303782463, 0.024393249303102493, -0.02016034908592701, 0.00010180348181165755, 0.01914074458181858, -0.0015168538084253669, 0.015834756195545197, -0.00872070249170065, 0.03639676794409752, -0.03633497282862663, 0.024485940113663673, -0.042977847158908844, 0.022446732968091965, -0.025459198281168938, 0.013988655991852283, 0.00910691637545824, 0.05039314553141594, 6.698391371173784e-05, 0.02249307930469513, 0.0022361765149980783, -0.003180468687787652, -0.04742702841758728, 0.03156137466430664, 0.02754475176334381, 0.030031967908143997, 0

In [None]:
## Text to Speech
response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Hello world! This is a streaming test.",
)

response.stream_to_file("output.mp3")

In [None]:
## Speech to Text
audio_file= open("/path/to/file/audio.mp3", "rb")
transcription = client.audio.transcriptions.create(
  model="whisper-1", 
  file=audio_file
)
print(transcription.text)

OpenAI- Other Useful Links:
- <a href="https://platform.openai.com/docs/overview">Docs</a>
- <a href="https://cookbook.openai.com/">Cookbook</a>

In [None]:
## Google : https://ai.google.dev/gemini-api/docs/quickstart?lang=python

import google.generativeai as genai
import os

genai.configure(api_key=os.environ["API_KEY"])

model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content("Write a story about an AI and magic")
print(response.text)

In [None]:
## Anthropic : https://docs.anthropic.com/en/docs/welcome

import anthropic

client = anthropic.Anthropic()

with client.messages.stream(
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello"}],
    model="claude-3-5-sonnet-20240620",
) as stream:
  for text in stream.text_stream:
      print(text, end="", flush=True)


### 1.b Open source LLMs

In [25]:
## Use LM Studio/Ollama to download open source LLMs. 

# Example: reuse your existing OpenAI setup
from openai import OpenAI

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

completion = client.chat.completions.create(
  model="model-identifier",
  messages=[
    {"role": "system", "content": "Always answer in rhymes."},
    {"role": "user", "content": "Introduce yourself."}
  ],
  temperature=0.7,
)

print(completion.choices[0].message.content)

I'm a system so fine and so bright,
A language model with AI in sight.
I can chat and I can play,
And respond in rhymes every day!
My name is LLaMA, it's nice to meet you,
I'm here to assist and help you get through!
