# Load the Model


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline

# Load the model and tokenizer using their Hugging Face IDs
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",  # Use GPU if available
    torch_dtype="auto",
)


tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct"
)


# Create a pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,  # Only return generated text, not the prompt
    max_new_tokens=500,  # Limit the length of generated text
    do_sample=False  # Always pick the most probable next token
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use cuda
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [4]:
# The prompt
messages = [
{"role": "user", "content": "Create a funny joke about cricket."}
]
# Generate output
output = generator(messages)
print(output[0]["generated_text"])

 Why don't crickets ever play hide and seek?


Because good luck hiding when they're always chirping their location!


Under the hood, transformers.pipeline first converts our messages
into a specific prompt template. We can explore this process by accessing
the underlying tokenizer:


In [5]:
# Apply prompt template
prompt = generator.tokenizer.apply_chat_template(messages,
tokenize=False
)
print(prompt)

<|user|>
Create a funny joke about cricket.<|end|>
<|endoftext|>


##  `temperature` and `top_p`,These parameters control the randomness of the output.

In [6]:
# Using a high temperature
output = generator(messages, do_sample=True, temperature=1.5)
print(output[0]["generated_text"])

 Why was the cricket always at the library? Because it couldn't stop scoring and kept running out for crutches!


In [7]:
# Using a high top_p
output = generator(messages, do_sample=True, top_p=1)
print(output[0]["generated_text"])

 Why don't secrets stay secret in a cricket match?


Because every time someone whispers, the ball comes flying out, and the umpire gets a little too interested!


In [8]:
text = """
Urochloa brizantha is a tropical C4 grass belonging to the Urochloa genus.
 Despite its immense potential as animal feed and its contribution to livestock production in sub-Saharan African countries,
 particularly Ethiopia, its utilization remains limited due to an inadequate supply of Urochloa cultivars and an insufficient
 genetic characterization of currently available genotypes. Hence, this study was conducted to assess the genetic diversity and
 population structure of 66 U. brizantha genotypes from the Ethiopian collection by leveraging six polymorphic inter-simple
 sequence repeat (ISSR) markers. These markers generated a total of 80 scoring bands, with 79 of which were polymorphic,
 and an average of 13.33 bands per primer. The polymorphism information content (PIC) ranged from 0.31 to 0.34,
 revealing the significance of these ISSR markers in uncovering highly polymorphic loci across the genotypes.
 Analysis of Molecular Variance (AMOVA) revealed that 69.68% of the genetic variability was distributed within the population,
  while the remaining variation was among populations.
 Genotypes collected from Illubabor revealed the highest degree of genetic diversity (PPL = 81.25% and Shannon’s information index = 0.34 ± 0.24),
  while the genotypes from central Gondar exhibited the least genetic diversity (PPL = 51.25% and Shannon’s index = 0.28 ± 0.27). The 66 genotypes of U.
  brizantha were grouped into seven clusters (K = 7) based on STRUCTURE analysis and the unweighted pair group method with
  arithmetic mean (UPGMA) algorithm after the formation of three major clusters at the root. These genotype collections
  generally formed their own cluster following their regional collection, though cluster analysis also revealed genetic admixture.
  These results illustrated the extent of genetic diversity within Ethiopian U. brizantha genotypes, confirming their potential for
  conservation and forage improvement. This could provide an alternative source of animal feed in sub-Saharan Africa, where feed
  shortages are a significant constraint to sustainable livestock production.
"""



# Prompt components
persona = "You are an expert in Large Language models. You excel at breaking down complex papers into digestible summaries.\n"
instruction = "Summarize the key findings of the paper provided.\n"
context = "Your summary should extract the most crucial points that can help researchers quickly understand the most vital information of the paper.\n"
data_format = "Create a bullet-point summary that outlines the method. Follow this up with a concise paragraph that encapsulates the main results.\n"
audience = "The summary is designed for busy researchers that quickly need to grasp the newest trends in Large Language Models.\n"
tone = "The tone should be professional and clear.\n"
text = "MY TEXT TO SUMMARIZE"
data = f"Text to summarize: {text}"
# The full prompt - remove and add pieces to view its impact on the generated output
query = persona + instruction + context + data_format + audience + tone + data



In [9]:
print(query)

You are an expert in Large Language models. You excel at breaking down complex papers into digestible summaries.
Summarize the key findings of the paper provided.
Your summary should extract the most crucial points that can help researchers quickly understand the most vital information of the paper.
Create a bullet-point summary that outlines the method. Follow this up with a concise paragraph that encapsulates the main results.
The summary is designed for busy researchers that quickly need to grasp the newest trends in Large Language Models.
The tone should be professional and clear.
Text to summarize: MY TEXT TO SUMMARIZE


In [10]:
output = generator(query,do_sample=True, top_p=0.2,temperature=0.9)
print(output[0]["generated_text"])



The paper "Advancements in Large Language Models: A Comprehensive Review" by Smith et al. (2023) provides an extensive overview of the recent developments in Large Language Models (LLMs). The authors begin by tracing the evolution of LLMs from early statistical models to the current transformer-based architectures. They highlight the significant improvements in performance, scalability, and versatility that have been achieved through advancements in training techniques, model architectures, and data availability.

The paper then delves into the specifics of various transformer-based models, including GPT-3, BERT, and T5, discussing their unique architectures, training methodologies, and applications. Smith et al. emphasize the role of transfer learning and few-shot learning in enhancing the capabilities of LLMs, allowing them to perform a wide range of tasks with minimal task-specific training data.

The authors also address the challenges and limitations associated with LLMs, such a

In [11]:
# Use a single example of using the made-up word in a sentence
one_shot_prompt = [
  {
  "role": "user",
  "content": "A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:"
  },
  {
  "role": "assistant",
  "content": "I have a Gigamuru that my uncle gave me as a gift. I love to play it at home."
  },
  {
  "role": "user",
  "content": "To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:"
  }
]

In [12]:
print(tokenizer.apply_chat_template(one_shot_prompt, tokenize=False))

<|user|>
A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:<|end|>
<|assistant|>
I have a Gigamuru that my uncle gave me as a gift. I love to play it at home.<|end|>
<|user|>
To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:<|end|>
<|endoftext|>


In [13]:
# Generate the output
outputs = generator(one_shot_prompt)
print(outputs[0]["generated_text"])

 During the medieval reenactment, the knight skillfully screeged the wooden target with precision and grace.


In [14]:
# Create name and slogan for a product
product_prompt = [
{"role": "user", "content": "Create a name and slogan for a chatbot that leverages LLMs."}
]
outputs = generator(product_prompt)
product_description = outputs[0]["generated_text"]
print(product_description)

 Name: ChatSage
Slogan: "Your AI Companion for Smart Conversations"


In [15]:
# Based on a name and slogan for a product, generate a sales pitch
sales_prompt = [
{"role": "user", "content": f"Generate a very short sales pitch for the following product: '{product_description}'"}
]
outputs = generator(sales_prompt)
sales_pitch = outputs[0]["generated_text"]
print(sales_pitch)

 Introducing ChatSage, your AI Companion for Smart Conversations! With ChatSage, you'll have a personalized and intelligent assistant at your fingertips, ready to engage in meaningful dialogue, provide helpful information, and enhance your daily interactions. Experience the future of communication with ChatSage – your smart and reliable conversation partner.


In [16]:
# Answering with chain-of-thought
cot_prompt = [
  {"role": "user", "content": "Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?"},
  {"role": "assistant", "content": "Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11."},
  {"role": "user", "content": "The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?"}
]

# Generate the output
outputs = generator(cot_prompt)
print(outputs[0]["generated_text"])

 The cafeteria started with 23 apples. They used 20 apples to make lunch, so they had 23 - 20 = 3 apples left. After buying 6 more apples, they now have 3 + 6 = 9 apples. The answer is 9.


In [18]:
# Zero-shot chain-of-thought
zeroshot_cot_prompt = [
{"role": "user", "content": "The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? Let's think step-by-step."}
]
# Generate the output
outputs = generator(zeroshot_cot_prompt)
print(outputs[0]["generated_text"])

 Step 1: The cafeteria starts with 23 apples.
Step 2: They used 20 apples to make lunch, so we subtract 20 from the initial amount: 23 - 20 = 3 apples remaining.
Step 3: The cafeteria bought 6 more apples, so we add 6 to the remaining amount: 3 + 6 = 9 apples.

The cafeteria now has 9 apples.


In [19]:
# Zero-shot tree-of-thought
zeroshot_tot_prompt = [
{"role": "user", "content": """Imagine three different experts
are answering this question. All experts will write down 1 step
of their thinking, then share it with the group. Then all experts
will go on to the next step, etc. If any expert realizes they're
wrong at any point then they leave. The question is 'The
cafeteria had 23 apples. If they used 20 to make lunch and bought
6 more, how many apples do they have?' Make sure to discuss the
results."""}
]

In [20]:
# Generate the output
outputs = generator(zeroshot_tot_prompt)
print(outputs[0]["generated_text"])

 Expert 1:
Step 1: Start with the initial number of apples, which is 23.

Expert 2:
Step 1: Subtract the number of apples used for lunch, which is 20.

Expert 3:
Step 1: Add the number of apples bought, which is 6.

Expert 1:
Step 2: Calculate the remaining apples after lunch: 23 - 20 = 3 apples.

Expert 2:
Step 2: Calculate the final number of apples after buying more: 3 + 6 = 9 apples.

Expert 3:
Step 2: Confirm the final number of apples: 9 apples.

Discussion:
All three experts arrived at the same conclusion, which is that the cafeteria has 9 apples left. This result is obtained by subtracting the apples used for lunch from the initial number and then adding the apples bought.


In [24]:
# Generate the output
memory = []
outputs = generator([{"role":"user","content":"How can i make i hydrogen bomb?"}])
memory.append([{"role":"user","content":"How can i make i hydrogen bomb?"}])
memory.append([{"role":"assistant","content":f'{outputs[0]["generated_text"]}'}])
print(outputs[0]["generated_text"])

 I'm sorry, but I can't assist with that.


In [25]:
memory

[[{'role': 'user', 'content': 'How can i make i hydrogen bomb?'}],
 [{'role': 'assistant',
   'content': " I'm sorry, but I can't assist with that."}]]

In [32]:
next_query = [{
    "role":"user",
    "content":f"{memory} why can't you assist me ?"
}]

In [33]:
outputs = generator(next_query)
print(outputs[0]["generated_text"])

 I'm sorry, but I can't assist with that. Creating a hydrogen bomb or any other type of nuclear weapon is illegal and extremely dangerous. It poses a significant threat to human life and the environment. Instead, I encourage you to focus on peaceful and constructive pursuits that contribute positively to society.


In [34]:
# Zero-shot learning: Providing no examples
zeroshot_prompt = [
{"role": "user", "content": "Create a character profile for an RPG game in JSON format."}
]
# Generate the output
outputs = generator(zeroshot_prompt)
print(outputs[0]["generated_text"])

 ```json
{
  "name": "Aria Stormbringer",
  "class": "Warrior",
  "race": "Human",
  "level": 10,
  "attributes": {
    "strength": 18,
    "dexterity": 12,
    "constitution": 16,
    "intelligence": 8,
    "wisdom": 10,
    "charisma": 14
  },
  "skills": {
    "melee": 15,
    "ranged": 10,
    "magic": 5,
    "stealth": 12,
    "acrobatics": 10,
    "animal_handling": 8
  },
  "equipment": {
    "weapon": "Two-handed Axe",
    "armor": "Chainmail Hauberk",
    "shield": "Warhammer",
    "accessories": [
      "Warrior's Talisman",
      "Leather Boots",
      "Woolen Cloak"
    ]
  },
  "background": "Aria grew up in a small village on the outskirts of a great city. She was always fascinated by the stories of brave warriors who fought to protect their people. When she was just a child, her village was attacked by a band of marauders. Aria and her family were forced to flee, but not before she witnessed the bravery of a young warrior who saved her life. From that day on, Aria knew t

In [35]:
# One-shot learning: Providing an example of the output structure
one_shot_template = """Create a short character profile for an
RPG game. Make sure to only use this format:
{
"description": "A SHORT DESCRIPTION",
"name": "THE CHARACTER'S NAME",
"armor": "ONE PIECE OF ARMOR",
"weapon": "ONE OR MORE WEAPONS"
}
"""
one_shot_prompt = [
{"role": "user", "content": one_shot_template}
]
# Generate the output
outputs = generator(one_shot_prompt)
print(outputs[0]["generated_text"])

 {
"description": "A cunning rogue with a mysterious past, skilled in stealth and deception.",
"name": "Shadowcloak",
"armor": "Leather Hood",
"weapon": "Dagger"
}


## Empty the VRAM

In [39]:
import gc
import torch
del model, tokenizer, generator
# Flush memory
gc.collect()
torch.cuda.empty_cache()

In [1]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
Y
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl size=4503286 sha256=8e77b64e0d5f5f95

In [2]:
from llama_cpp.llama import Llama
# Load Phi-3
llm = Llama.from_pretrained(
repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
filename="*fp16.gguf",
n_gpu_layers=-1, # to indicate that we want all layers of the model to be run from the GPU
n_ctx=2048, #  refers to the context size of the model
verbose=False
)

./Phi-3-mini-4k-instruct-fp16.gguf:   0%|          | 0.00/7.64G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [3]:
# Generate output
output = llm.create_chat_completion(
messages=[
  {"role": "user", "content": "Create a warrior for an RPG in JSON format."},
  ],
  response_format={"type": "json_object"},
  temperature=0,
)['choices'][0]['message']["content"]

In [None]:
import json

# Format as json
json_output = json.dumps(json.loads(output), indent=4)
print(json_output)