# Setup Environment
The following code loads the environment variables required to run this notebook.


In [None]:
FILE="GenAI Lab 3"

! pip install -qqq git+https://github.com/elastic/notebook-workshop-loader.git@main
from notebookworkshoploader import loader
import os
from dotenv import load_dotenv

if os.path.isfile("../env"):
    load_dotenv("../env", override=True)
    print('Successfully loaded environment variables from local env file')
else:
    loader.load_remote_env(file=FILE, env_url="https://notebook-workshop-api-voldmqr2bq-uc.a.run.app")

In [None]:
!pip install -qqq langchain==0.1.3 sentence-transformers==2.2.2 beautifulsoup4==4.11.2
!pip install -qqq tiktoken==0.5.2 cohere==4.38 openai==1.3.9
!pip install -qqq matplotlib==3.7.1 scikit-learn==1.2.2 scipy==1.11.4
!pip install -qqq elasticsearch==8.12.0 inquirer==3.2.1

from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
import os
from elasticsearch import Elasticsearch
if 'ELASTIC_CLOUD_ID' in os.environ:
  es = Elasticsearch(
    cloud_id=os.environ['ELASTIC_CLOUD_ID'],
    api_key=(os.environ['ELASTIC_APIKEY_ID'], os.environ['ELASTIC_APIKEY_SECRET']),
    request_timeout=30
  )
elif 'ELASTIC_URL' in os.environ:
  es = Elasticsearch(
    os.environ['ELASTIC_URL'],
    api_key=(os.environ['ELASTIC_APIKEY_ID'], os.environ['ELASTIC_APIKEY_SECRET']),
    request_timeout=30
  )
else:
  print("env needs to set either ELASTIC_CLOUD_ID or ELASTIC_URL")

In [None]:
import os, secrets, requests
import openai
from openai import OpenAI
from requests.auth import HTTPBasicAuth

#if using the Elastic AI proxy, then generate the correct API key
if os.environ['ELASTIC_PROXY'] == "True":

    if "OPENAI_API_TYPE" in os.environ: del os.environ["OPENAI_API_TYPE"]

    #generate and share "your" unique hash
    os.environ['USER_HASH'] = secrets.token_hex(nbytes=6)
    print(f"Your unique user hash is: {os.environ['USER_HASH']}")

    #get the current API key and combine with your hash
    os.environ['OPENAI_API_KEY'] = f"{os.environ['OPENAI_API_KEY']} {os.environ['USER_HASH']}"
else:
    openai.api_type = os.environ['OPENAI_API_TYPE']
    openai.api_version = os.environ['OPENAI_API_VERSION']

openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.default_model = os.environ['OPENAI_API_ENGINE']

## Step 1 : Simple vectorization using a Vector Embedding model installed to Elasticsearch
[Model Card - msmarco-MiniLM-L-12-v3](https://huggingface.co/sentence-transformers/msmarco-MiniLM-L-12-v3). - note this model has a 512 token limit

In [None]:
es_model_id = 'sentence-transformers__msmarco-minilm-l-12-v3'

## use REST call to Elastic to generate Vector Embedding, assumes model is already installed
def sentence_to_vector_es(chunk, es_model_id=es_model_id):
  docs =  [{"text_field": chunk}]
  chunk_vector = es.ml.infer_trained_model(model_id=es_model_id, docs=docs, )
  return chunk_vector['inference_results'][0]['predicted_value']


chunk = "The quick brown fox jumped over the lazy dog"
es_generated_vector = sentence_to_vector_es(chunk)
print(f"Dimensions: {len(es_generated_vector)}, \nVector preview: {es_generated_vector[:5]+ ['...']}")

## Step 2: Vectoring Data using a local E5 model and Sentence Transformer

[Model card E5-large-v2](https://huggingface.co/intfloat/e5-large-v2)

In [None]:
# from sentence_transformers import SentenceTransformer
e5_model = SentenceTransformer('intfloat/e5-large-v2')
input_texts = [
    'query: how much protein should a female human eat',
    'query: summit define',
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
]
embeddings = e5_model.encode(input_texts, normalize_embeddings=True)
close=" ...]"
print(f"Dimensions: {len(embeddings[0])}, \nVector preview: {str(embeddings[0][:5])[:-1]+close}")

## Step 3: Doing the same thing but with the LangChain Utility libraries

In [None]:
# from langchain_community.embeddings import HuggingFaceEmbeddings
langchain_e5_embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
input_texts = [
    'query: how much protein should a female human eat',
    'query: summit define',
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
]
embeddings = langchain_e5_embeddings.embed_documents(input_texts)
close=", ...]"
print(f"Dimensions: {len(embeddings[0])}, \nVector preview: {str(embeddings[0][:5])[:-1]+close}")

## Step 4: Let's create a simplified graph of generated Embeddings

Principal Component analysis can be used to simplify higher dimesions into a 2d plot.


In [None]:
# fetch the model and load it
word_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model dimensions:", word_model.get_sentence_embedding_dimension())

# generate embeddings
embeddings_for_cat = word_model.encode("cat")
embeddings_for_kitten = word_model.encode("kitten")
embeddings_for_dog = word_model.encode("dog")
embeddings_for_puppy = word_model.encode("puppy")
embeddings_for_lawnmower = word_model.encode("lawnmower")

# let's see what we got, though truncate the embeddings to just the first 5 dimensions
print(f"embedding dimensions: {embeddings_for_cat.size}")
print(f"cat: {list(embeddings_for_cat)[:5] + ['...']}")
print(f"dog: {list(embeddings_for_dog)[:5] + ['...']}")

In [None]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.decomposition import PCA

# wrap embeddings with a DataFrame
df = pd.DataFrame(
    [
      [embeddings_for_cat],
      [embeddings_for_kitten],
      [embeddings_for_dog],
      [embeddings_for_puppy],
      [embeddings_for_lawnmower],
    ],
    index=["cat", "kitten", "dog", "puppy", "lawnmower"], columns=["embeddings"]
)

# Initialize the PCA reducer to convert embeddings into arrays of length of 2
reducer = PCA(n_components=2)

# Reduce the embeddings, store them in a new dataframe column and display their shape
df["reduced"] = reducer.fit_transform(np.stack(df["embeddings"])).tolist()


def scatterplot(
    data: pd.DataFrame,
    tooltips=False,
    labels=False,
    width=800,
    height=600,
) -> alt.Chart:
    base_chart = (
        alt.Chart(data)
        .encode(
            alt.X("x", scale=alt.Scale(zero=False)),
            alt.Y("y", scale=alt.Scale(zero=False)),
        )
        .properties(width=width, height=height)
    )

    if tooltips:
        base_chart = base_chart.encode(alt.Tooltip(["text"]))

    circles = base_chart.mark_circle(
        size=200, color="crimson", stroke="white", strokeWidth=1
    )

    if labels:
        labels = base_chart.mark_text(
            fontSize=13,
            align="left",
            baseline="bottom",
            dx=5,
        ).encode(text="text")
        chart = circles + labels
    else:
        chart = circles

    return chart

source = pd.DataFrame(
    {
        "text": df.index,
        "x": df["reduced"].apply(lambda x: x[0]).to_list(),
        "y": df["reduced"].apply(lambda x: x[1]).to_list(),
    }
)

scatterplot(source, labels=True,  width=400, height=300)

## Step 5 - using the more advanced e5 model, see that questions can be matched with answers

In [None]:
## using e5_model previously loaded
input_texts = [
    'query: how much protein should a female human eat',
    'query: summit define',
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
    "passage: I am the very model of a modern Major-General / I've information vegetable, animal, and mineral / I know the kings of England, and I quote the fights historical / From Marathon to Waterloo, in order categorical / I'm very well acquainted, too, with matters mathematical",
    "passage: When, in the course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume, among the powers of the earth",
    "passage: It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
]
embeddings = e5_model.encode(input_texts, normalize_embeddings=True)


# let's see what we got, though truncate the embeddings to just the first 5 dimensions
print(f"embedding dimensions: {embeddings[0].size}")
print(f"first query: {list(embeddings[0])[:3] + ['...']}")


# wrap embeddings with a DataFrame
df = pd.DataFrame(
    [
      [embeddings[0]],
      [embeddings[1]],
      [embeddings[2]],
      [embeddings[3]],
      [embeddings[4]],
      [embeddings[5]],
      [embeddings[6]],
    ],
    index=[
        "q: protein",
        "q: summit",
        "p: protein guide",
        "p: summit def",
        "p: penzanse",
        "p: dec of ind",
        "p: austen"
        ], columns=["embeddings"]
)

# Initialize the PCA reducer to convert embeddings into arrays of length of 2
reducer = PCA(n_components=2)

# Reduce the embeddings, store them in a new dataframe column and display their shape
df["reduced"] = reducer.fit_transform(np.stack(df["embeddings"])).tolist()

source = pd.DataFrame(
    {
        "text": df.index,
        "x": df["reduced"].apply(lambda x: x[0]).to_list(),
        "y": df["reduced"].apply(lambda x: x[1]).to_list(),
    }
)

scatterplot(source, labels=True,  width=400, height=300)

## Step 6 : calculate the actual distance in 1024 dimensional space

In [None]:
from scipy.spatial import distance

passages = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
    "passage: I am the very model of a modern Major-General / I've information vegetable, animal, and mineral / I know the kings of England, and I quote the fights historical / From Marathon to Waterloo, in order categorical / I'm very well acquainted, too, with matters mathematical",
    "passage: When, in the course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume, among the powers of the earth",
    "passage: It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
]

def chunks_by_distance(passages, query_text, model):
  embeddings = model.encode(passages, normalize_embeddings=True)
  query_embedding = model.encode(query_text, normalize_embeddings=True)
  distances = []
  for index, passage in enumerate(passages):
    cos_distance = distance.cosine(embeddings[index], query_embedding)
    distances.append((passage, cos_distance))
  sorted_passages = sorted(distances, key=lambda x: x[1])

  return sorted_passages

protein_query = 'query: how much protein should a female human eat'
sorted_passages = chunks_by_distance(passages, protein_query, e5_model)

for passage, dist in sorted_passages:
      print(f"{passage[:40]} - Cosine distance {dist:.12f}")


## OKAY let's work with an actual large document

In [None]:
wikipedia_spacecraft = [
{
  "id": "37910",
  "title": "Spacecraft",
  "text": "A spacecraft is a vehicle that is designed to fly in outer space. A type of artificial satellite, spacecraft are used for a variety of purposes, including communications, Earth observation, meteorology, navigation, space colonization, planetary exploration, and transportation of humans and cargo. All spacecraft except single-stage-to-orbit vehicles cannot get into space on their own, and require a launch vehicle (carrier rocket). On a sub-orbital spaceflight, a space vehicle enters space and then returns to the surface without having gained sufficient energy or velocity to make a full Earth orbit. For orbital spaceflights, spacecraft enter closed orbits around the Earth or around other celestial bodies. Spacecraft used for human spaceflight carry people on board as crew or passengers from start or on orbit (space stations) only, whereas those used for robotic space missions operate either autonomously or telerobotically. Robotic spacecraft used to support scientific research are space probes. Robotic spacecraft that remain in orbit around a planetary body are artificial satellites. To date, only a handful of interstellar probes, such as Pioneer 10 and 11, Voyager 1 and 2, and New Horizons, are on trajectories that leave the Solar System. Orbital spacecraft may be recoverable or not. Most are not. Recoverable spacecraft may be subdivided by a method of reentry to Earth into non-winged space capsules and winged spaceplanes. Recoverable spacecraft may be reusable (can be launched again or several times, like the SpaceX Dragon and the Space Shuttle orbiters) or expendable (like the Soyuz). In recent years, more space agencies are tending towards reusable spacecraft. Humanity has achieved space flight, but only a few nations have the technology for orbital launches: Russia (RSA or \"Roscosmos\"), the United States (NASA), the member states of the European Space Agency (ESA), Japan (JAXA), China (CNSA), India (ISRO), Taiwan National Chung-Shan Institute of Science and Technology, Taiwan National Space Organization (NSPO), Israel (ISA), Iran (ISA), and North Korea (NADA). In addition, several private companies have developed or are developing the technology for orbital launches independently from government agencies. The most prominent examples of such companies are SpaceX and Blue Origin. ==History== A German V-2 became the first spacecraft when it reached an altitude of 189 km in June 1944 in Peenemünde, Germany.Peenemünde (Dokumentation) Berlin: Moewig, 1984.. Sputnik 1 was the first artificial satellite. It was launched into an elliptical low Earth orbit (LEO) by the Soviet Union on 4 October 1957. The launch ushered in new political, military, technological, and scientific developments; while the Sputnik launch was a single event, it marked the start of the Space Age.Dougall, Walter A. (Winter 2010) \"Shooting the duck\", American Heritage Apart from its value as a technological first, Sputnik 1 also helped to identify the upper atmospheric layer's density, by measuring the satellite's orbital changes. It also provided data on radio-signal distribution in the ionosphere. Pressurized nitrogen in the satellite's false body provided the first opportunity for meteoroid detection. Sputnik 1 was launched during the International Geophysical Year from Site No.1/5, at the 5th Tyuratam range, in Kazakh SSR (now at the Baikonur Cosmodrome). The satellite travelled at , taking 96.2 minutes to complete an orbit, and emitted radio signals at 20.005 and 40.002 MHz While Sputnik 1 was the first spacecraft to orbit the Earth, other human- made objects had previously reached an altitude of 100 km, which is the height required by the international organization Fédération Aéronautique Internationale to count as a spaceflight. This altitude is called the Kármán line. In particular, in the 1940s there were several test launches of the V-2 rocket, some of which reached altitudes well over 100 km. ==Spacecraft types== ===Crewed spacecraft=== thumb|Apollo 17 command module in Lunar orbit As of 2016, only three nations have flown crewed spacecraft: USSR/Russia, USA, and China. The first crewed spacecraft was Vostok 1, which carried Soviet cosmonaut Yuri Gagarin into space in 1961, and completed a full Earth orbit. There were five other crewed missions which used a Vostok spacecraft. The second crewed spacecraft was named Freedom 7, and it performed a sub-orbital spaceflight in 1961 carrying American astronaut Alan Shepard to an altitude of just over . There were five other crewed missions using Mercury spacecraft. Other Soviet crewed spacecraft include the Voskhod, Soyuz, flown uncrewed as Zond/L1, L3, TKS, and the Salyut and Mir crewed space stations. Other American crewed spacecraft include the Gemini spacecraft, the Apollo spacecraft including the Apollo Lunar Module, the Skylab space station, the Space Shuttle with undetached European Spacelab and private US Spacehab space stations- modules, and the SpaceX Crew Dragon configuration of their Dragon 2. US company Boeing also developed and flown a spacecraft of their own, the CST-100, commonly referred to as Starliner, but a crewed flight is yet to occur. China developed, but did not fly Shuguang, and is currently using Shenzhou (its first crewed mission was in 2003). Except for the Space Shuttle, all of the recoverable crewed orbital spacecraft were space capsules. File:NASA spacecraft comparison.jpg|alt=Drawings of Mercury, Gemini capsules and Apollo spacecraft, with their launch vehicles|American Mercury, Gemini, and Apollo spacecraft File:Vostok Spacecraft Diagram.svg|Soviet Vostok capsule File:Voskhod 1 and 2.svg|alt=Line drawing of Voskhod capsules|Soviet Voskhod (variant of Vostok) File:Soyuz 7K-OK(A) drawing.svg|alt=Soyuz 7K-OK(A) drawing|1967 Soviet/Russian Soyuz spacecraft File:Post S-7 Shenzhou spacecraft.png|alt=Drawing of Shenzhou spacecraft|Chinese Shenzhou spacecraft The International Space Station, crewed since November 2000, is a joint venture between Russia, the United States, Canada and several other countries. ====Spaceplanes==== thumb|Columbia orbiter landing Spaceplanes are spacecraft that are built in the shape of, and function as, airplanes. The first example of such was the North American X-15 spaceplane, which conducted two crewed flights which reached an altitude of over 100 km in the 1960s. This first reusable spacecraft was air-launched on a suborbital trajectory on July 19, 1963. The first partially reusable orbital spacecraft, a winged non-capsule, the Space Shuttle, was launched by the USA on the 20th anniversary of Yuri Gagarin's flight, on April 12, 1981. During the Shuttle era, six orbiters were built, all of which have flown in the atmosphere and five of which have flown in space. Enterprise was used only for approach and landing tests, launching from the back of a Boeing 747 SCA and gliding to deadstick landings at Edwards AFB, California. The first Space Shuttle to fly into space was Columbia, followed by Challenger, Discovery, Atlantis, and Endeavour. Endeavour was built to replace Challenger when it was lost in January 1986. Columbia broke up during reentry in February 2003. The first automatic partially reusable spacecraft was the Buran-class shuttle, launched by the USSR on November 15, 1988, although it made only one flight and this was uncrewed. This spaceplane was designed for a crew and strongly resembled the U.S. Space Shuttle, although its drop-off boosters used liquid propellants and its main engines were located at the base of what would be the external tank in the American Shuttle. Lack of funding, complicated by the dissolution of the USSR, prevented any further flights of Buran. The Space Shuttle was subsequently modified to allow for autonomous re-entry in case of necessity. Per the Vision for Space Exploration, the Space Shuttle was retired in 2011 mainly due to its old age and high cost of program reaching over a billion dollars per flight. The Shuttle's human transport role is to be replaced by SpaceX's SpaceX Dragon 2 and Boeing's CST-100 Starliner. Dragon 2's first crewed flight occurred on May 30, 2020. The Shuttle's heavy cargo transport role is to be replaced by expendable rockets such as the Space Launch System and ULA's Vulcan rocket, as well as the commercial launch vehicles. Scaled Composites' SpaceShipOne was a reusable suborbital spaceplane that carried pilots Mike Melvill and Brian Binnie on consecutive flights in 2004 to win the Ansari X Prize. The Spaceship Company will build its successor SpaceShipTwo. A fleet of SpaceShipTwos operated by Virgin Galactic was planned to begin reusable private spaceflight carrying paying passengers in 2014, but was delayed after the crash of VSS Enterprise. ===Uncrewed spacecraft=== Uncrewed spacecraft are spacecraft without people on board. Uncrewed spacecraft may have varying levels of autonomy from human input; they may be remote controlled, remote guided or even autonomous, meaning they have a pre-programmed list of operations, which they will execute unless otherwise instructed. Many space missions are more suited to telerobotic rather than crewed operation, due to lower cost and lower risk factors. In addition, some planetary destinations such as Venus or the vicinity of Jupiter are too hostile for human survival. Outer planets such as Saturn, Uranus, and Neptune are too distant to reach with current crewed spaceflight technology, so telerobotic probes are the only way to explore them. Telerobotics also allows exploration of regions that are vulnerable to contamination by Earth micro-organisms since spacecraft can be sterilized. Humans can not be sterilized in the same way as a spaceship, as they coexist with numerous micro-organisms, and these micro-organisms are also hard to contain within a spaceship or spacesuit. Multiple space probes were sent to study Moon, the planets, the Sun, multiple small Solar System bodies (comets and asteroids). Special class of uncrewed spacecraft is space telescopes, a telescope in outer space used to observe astronomical objects. The first operational telescopes were the American Orbiting Astronomical Observatory, OAO-2 launched in 1968, and the Soviet Orion 1 ultraviolet telescope aboard space station Salyut 1 in 1971. Space telescopes avoid the filtering and distortion (scintillation) of electromagnetic radiation which they observe, and avoid light pollution which ground-based observatories encounter. The best-known examples are Hubble Space Telescope and James Webb Space Telescope. Cargo spacecraft are designed to carry cargo, possibly to support space stations' operation by transporting food, propellant and other supplies. Automated cargo spacecraft have been used since 1978 and have serviced Salyut 6, Salyut 7, Mir, the International Space Station and Tiangong space station. ====Fastest spacecraft==== *Parker Solar Probe (estimated at first sun close pass, will reach at final perihelion) *Helios I and II Solar Probes () ==== Furthest spacecraft from the Sun ==== * Voyager 1 at 156.13 AU as of April 2022, traveling outward at about * Pioneer 10 at 122.48 AU as of December 2018, traveling outward at about *Voyager 2 at 122.82 AU as of January 2020, traveling outward at about *Pioneer 11 at 101.17 AU as of December 2018, traveling outward at about ==Subsystems== A spacecraft astrionics system comprises different subsystems, depending on the mission profile. Spacecraft subsystems comprise the spacecraft's bus and may include attitude determination and control (variously called ADAC, ADC, or ACS), guidance, navigation and control (GNC or GN&C;), communications (comms), command and data handling (CDH or C&DH;), power (EPS), thermal control (TCS), propulsion, and structures. Attached to the bus are typically payloads. ; Life support : Spacecraft intended for human spaceflight must also include a life support system for the crew. ; Attitude control : A Spacecraft needs an attitude control subsystem to be correctly oriented in space and respond to external torques and forces properly. The attitude control subsystem consists of sensors and actuators, together with controlling algorithms. The attitude- control subsystem permits proper pointing for the science objective, sun pointing for power to the solar arrays and earth pointing for communications. ; GNC : Guidance refers to the calculation of the commands (usually done by the CDH subsystem) needed to steer the spacecraft where it is desired to be. Navigation means determining a spacecraft's orbital elements or position. Control means adjusting the path of the spacecraft to meet mission requirements. ; Command and data handling : The C&DH; subsystem receives commands from the communications subsystem, performs validation and decoding of the commands, and distributes the commands to the appropriate spacecraft subsystems and components. The CDH also receives housekeeping data and science data from the other spacecraft subsystems and components, and packages the data for storage on a data recorder or transmission to the ground via the communications subsystem. Other functions of the CDH include maintaining the spacecraft clock and state-of-health monitoring. ; Communications : Spacecraft, both robotic and crewed, have various communications systems for communication with terrestrial stations and for inter-satellite service. Technologies include space radio station and optical communication. In addition, some spacecraft payloads are explicitly for the purpose of ground–ground communication using receiver/retransmitter electronic technologies. ; Power : Spacecraft need an electrical power generation and distribution subsystem for powering the various spacecraft subsystems. For spacecraft near the Sun, solar panels are frequently used to generate electrical power. Spacecraft designed to operate in more distant locations, for example Jupiter, might employ a radioisotope thermoelectric generator (RTG) to generate electrical power. Electrical power is sent through power conditioning equipment before it passes through a power distribution unit over an electrical bus to other spacecraft components. Batteries are typically connected to the bus via a battery charge regulator, and the batteries are used to provide electrical power during periods when primary power is not available, for example when a low Earth orbit spacecraft is eclipsed by Earth. ; Thermal control : Spacecraft must be engineered to withstand transit through Earth's atmosphere and the space environment. They must operate in a vacuum with temperatures potentially ranging across hundreds of degrees Celsius as well as (if subject to reentry) in the presence of plasmas. Material requirements are such that either high melting temperature, low density materials such as beryllium and reinforced carbon–carbon or (possibly due to the lower thickness requirements despite its high density) tungsten or ablative carbon–carbon composites are used. Depending on mission profile, spacecraft may also need to operate on the surface of another planetary body. The thermal control subsystem can be passive, dependent on the selection of materials with specific radiative properties. Active thermal control makes use of electrical heaters and certain actuators such as louvers to control temperature ranges of equipments within specific ranges. ; Spacecraft propulsion : Spacecraft may or may not have a propulsion subsystem, depending on whether or not the mission profile calls for propulsion. The Swift spacecraft is an example of a spacecraft that does not have a propulsion subsystem. Typically though, LEO spacecraft include a propulsion subsystem for altitude adjustments (drag make-up maneuvers) and inclination adjustment maneuvers. A propulsion system is also needed for spacecraft that perform momentum management maneuvers. Components of a conventional propulsion subsystem include fuel, tankage, valves, pipes, and thrusters. The thermal control system interfaces with the propulsion subsystem by monitoring the temperature of those components, and by preheating tanks and thrusters in preparation for a spacecraft maneuver. ; Structures : Spacecraft must be engineered to withstand launch loads imparted by the launch vehicle, and must have a point of attachment for all the other subsystems. Depending on mission profile, the structural subsystem might need to withstand loads imparted by entry into the atmosphere of another planetary body, and landing on the surface of another planetary body. ; Payload : The payload depends on the mission of the spacecraft, and is typically regarded as the part of the spacecraft \"that pays the bills\". Typical payloads could include scientific instruments (cameras, telescopes, or particle detectors, for example), cargo, or a human crew. ; Ground segment : The ground segment, though not technically part of the spacecraft, is vital to the operation of the spacecraft. Typical components of a ground segment in use during normal operations include a mission operations facility where the flight operations team conducts the operations of the spacecraft, a data processing and storage facility, ground stations to radiate signals to and receive signals from the spacecraft, and a voice and data communications network to connect all mission elements. ; Launch vehicle : The launch vehicle propels the spacecraft from Earth's surface, through the atmosphere, and into an orbit, the exact orbit being dependent on the mission configuration. The launch vehicle may be expendable or reusable. ==See also== *Astrionics *Commercial astronaut *Flying saucer *List of crewed spacecraft *List of fictional spacecraft *NewSpace *Spacecraft design *Space exploration *Space launch *Spaceships in science fiction *Space suit *Spaceflight records *Starship *Timeline of Solar System exploration *U.S. Space Exploration History on U.S. Stamps == References == === Citations === === Sources === * * ==External links== *NASA: Space Science Spacecraft Missions *NSSDC Master Catalog Spacecraft Query Form *Early History of Spacecraft *Basics of Spaceflight tutorial from JPL/Caltech *International Spaceflight Museum Category:Astronautics Category:Pressure vessels",
  "categories": [
    "Astronautics",
    "Pressure vessels"
  ]
},
 ]

## Step 7: Truncation is a problem for long texts

The semantic relevance will be low because most of the text is ignored in the vector computation.

In [None]:
text =        wikipedia_spacecraft[0]["text"]
embeddings =  e5_model.encode(text, normalize_embeddings=True)

tokenized_text =        e5_model.tokenizer(text)["input_ids"]
model_max_seq_length =  e5_model.get_max_seq_length()
text_token_count =      len(tokenized_text)

print(f"text tokens {text_token_count} | model max sequence length {model_max_seq_length}")

if text_token_count > model_max_seq_length:
    print(f"❗❗ The text will be truncated.❗❗")
else:
    print(f"The text will not be truncated.")

## Step 8: Visualizing Chunking Strategies

First some utility libraries

In [None]:
# Import Libraries
import os
import json
import textwrap
from pprint import pprint
from bs4 import BeautifulSoup
from IPython.display import HTML
#from elasticsearch import Elasticsearch, helpers
from langchain.text_splitter import RecursiveCharacterTextSplitter, \
  SentenceTransformersTokenTextSplitter, \
  CharacterTextSplitter, \
  TextSplitter

## Process splitting and display
def split_and_print(documents, splitter, ret=False):
    es_docs = []
    for doc in documents:
        passages = []

        for chunk in splitter.split_text(doc['text']):
            passages.append({
                "text": chunk,
            })
        es_docs.append(passages)

    print(f'Number of chunks: {len(passages)}' + '\n')
    display(HTML(process_chunks(passages)))
    if ret:
      return passages
    else:
      return False


## Character Splitter
def split_by_recursive_char(documents,
                  chunk_size: int = 200,
                  chunk_overlap: int = 0
                  ):
    '''Chunking by character count'''

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    split_and_print(documents, text_splitter)


def split_by_text(documents,
                  chunk_size: int = 200,
                  chunk_overlap: int = 0
                  ):
    '''Chunking by character count'''

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    r = split_and_print(documents, text_splitter)



## Token Splitter
def split_by_token(documents,
                  tokens_per_chunk: int = 2,
                  chunk_overlap: int = 0,
                  ret=False
                 ):
    '''Chunking by BERT Transformer Tokens'''

    text_splitter = SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=tokens_per_chunk,
        chunk_overlap=chunk_overlap,
        model_name='intfloat/e5-large-v2' # 512 token input limit
    )
    r = split_and_print(documents, text_splitter, ret=ret)
    if ret:
      return r




## Printing and Highlighting functions ##

color_list = [
    "yellow",
    "red",
    "lightgreen",
    "lightblue",
    "lightpink",
    "#F0A3FF",  # Vivid orchid
    "#0075DC",  # Blue ribbon
    "#2BCE48",  # Slimy green
    "#FFCC99",  # Peach-orange
    "#94FFB5",  # Mint green

]

def find_overlap(text1, text2):
    min_len = min(len(text1), len(text2))
    for i in range(min_len, 0, -1):
        if text1[-i:] == text2[:i]:
            return text1[-i:]
    return ''

###################################################################################
# Highted text -> White
# Normal text -> Black

### Uncomment these 3 functions if you are running in light mode

# def highlight_first_occurrence(text, substring, color):
#     index = text.find(substring)
#     if index != -1:
#         return (text[:index] +
#                 f"<span style='background-color: {color};'>{text[index:index+len(substring)]}</span>" +
#                 text[index+len(substring):])
#     return text

# def highlight_last_occurrence(text, substring, color):
#     index = text.rfind(substring)
#     if index != -1:
#         return (text[:index] +
#                 f"<span style='background-color: {color};'>{text[index:index+len(substring)]}</span>" +
#                 text[index+len(substring):])
#     return text

# def process_chunks(chunks, colors=color_list):
#     html_output = ""
#     for i in range(len(chunks) - 1):
#         overlap = find_overlap(chunks[i]["text"], chunks[i + 1]["text"])
#         color = colors[i % len(colors)]  # Cycle through the provided colors
#         if overlap:
#             chunks[i]["text"] = highlight_last_occurrence(chunks[i]["text"], overlap, color)
#             chunks[i + 1]["text"] = highlight_first_occurrence(chunks[i + 1]["text"], overlap, color)
#         html_output += chunks[i]["text"] + "<br><br>"
#     html_output += chunks[-1]["text"]  # Add the last chunk
#     return html_output

###################################################################################
# Highted text -> Black
# Normal text -> White

### Comment out these 3 functions if running in light modes

def highlight_first_occurrence(text, substring, color):
    index = text.find(substring)
    if index != -1:
        return (text[:index] +
                f"<span style='background-color: {color}; color: black;'>{text[index:index+len(substring)]}</span>" +
                text[index+len(substring):])
    return text

def highlight_last_occurrence(text, substring, color):
    index = text.rfind(substring)
    if index != -1:
        return (text[:index] +
                f"<span style='background-color: {color}; color: black;'>{text[index:index+len(substring)]}</span>" +
                text[index+len(substring):])
    return text


chunk_max_display = 10

def process_chunks(chunks, colors=color_list):
    html_output = ""
    for i in range(min(chunk_max_display -1,len(chunks) - 1)):
        overlap = find_overlap(chunks[i]["text"], chunks[i + 1]["text"])
        color = colors[i % len(colors)]  # Cycle through the provided colors
        if overlap:
            chunks[i]["text"] = highlight_last_occurrence(chunks[i]["text"], overlap, color)
            chunks[i + 1]["text"] = highlight_first_occurrence(chunks[i + 1]["text"], overlap, color)
        # Wrap each chunk of text in a span with white text color
        html_output += f"<span style='color: gray;'>{chunks[i]['text']}</span><br><br>"
    # Add the last chunk with white text color
    html_output += f"<span style='color: gray;'>{chunks[-1]['text']}</span>"
    html_output += f"<br/><br/><span style='color: gray;'>... additional chunks omitted</span>"
    return html_output

## Step 9: Three Chunking Strategies

[LangChain recursive character text splitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

[LangChain splitting by tokens](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

In [None]:
split_by_recursive_char(wikipedia_spacecraft, chunk_size=1024, chunk_overlap=0)

In [None]:
split_by_recursive_char(wikipedia_spacecraft, chunk_size=1024, chunk_overlap=50)

In [None]:
token_c500_o0 = split_by_token(wikipedia_spacecraft, tokens_per_chunk=500, chunk_overlap=0, ret=True)

In [None]:
token_c500_o250 = split_by_token(wikipedia_spacecraft, tokens_per_chunk=500, chunk_overlap=100, ret=True)

## Step 10: Let's comapare using the whole passage vs the best chunk with ChatGPT

In [None]:
the_full_text = wikipedia_spacecraft[0]["text"]

question = "What three countries have flown manned spacecraft?"

def gen_system_prompt(context):
  return f"""You are an AI assistant than answers questions based on the provided context.
Use only the provided context.  If the provided context does not have the answer
reply only with 'I do not know'

Context: {context}"""

import textwrap
# wrap text when printing, because colab scrolls output to the right too much
def wrap_text(text, width):
    wrapped_text = textwrap.wrap(text, width)
    return '\n'.join(wrapped_text)

def print_light_blue(text):
    print(f'\033[94m{text}\033[0m')

def chatCompletion(messages):

    client = OpenAI(api_key=openai.api_key, base_url=openai.api_base)
    completion = client.chat.completions.create(
        model=openai.default_model,
        max_tokens=150,
        messages=messages
    )
    print_light_blue(f"\t{completion.usage}")

    return completion

def chatWithSpacePassage(prompt, context):
    messages = [
        {"role": "system", "content": gen_system_prompt(context)},
        {"role": "user", "content": prompt}
      ]
    print_light_blue("Prompt:")
    print_light_blue(wrap_text(messages[1]["content"],70))
    completion = chatCompletion(messages)

    response_text = completion.choices[0].message.content

    return wrap_text(response_text,70)


ai_response = chatWithSpacePassage(question, the_full_text)

print(ai_response)


## Step 11: Reducing LLM inference costs by 91%

We'll deep dive into how to use Elasticearch to speed up the vector search and other kinds of Search Powered AI in the next part of the workshop.



In [None]:
# the_full_text = wikipedia_spacecraft[0]["text"]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=50,
    length_function=len
)

chunks = text_splitter.split_text(the_full_text)

## Vectorizing can take time so I'm only processing the first few chunks
sorted_chunks = chunks_by_distance(chunks[:5], question, e5_model)

## top 3 chunk distances
for passage, dist in sorted_chunks[:3]:
  print(f"{passage[:40]} - Cosine distance {dist:.12f}")
print("")

top_passage = sorted_chunks[0][0]
print(wrap_text(top_passage, 70))
print("")

ai_response = chatWithSpacePassage(question, top_passage)
print(ai_response)