<a href="https://colab.research.google.com/github/ckabelin/daai/blob/main/VTD_DAAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to the Ventum Team Day AI Tutorial

In this tutorial you will learn how to
- include the most common Python libraries for AI and data analytics
- use several AI methods to chat and integrate data
- code your own AI use case based on open source and/or OpenAI

## Useful reads and links:
* https://medium.com/@akriti.upadhyay/implementing-rag-with-langchain-and-hugging-face-28e3ea66c5f7
* https://github.com/ckabelin/daai
* https://huggingface.co/settings/gated-repos
* https://www.langchain.com/
* https://openai.com/
* https://github.com/huggingface
* https://huggingface.co/docs/hub/datasets-downloading

# 1. Install Dependencies

## 1.1 Install OS prerequisites

In [None]:
!apt-get update

In [None]:
!apt-get upgrade -y

In [None]:
!apt-get install -y libarchive-dev python3-doc python3-pil.imagetk python-pil-doc libleptonica-dev tesseract-ocr libtesseract-dev python3-pil tesseract-ocr-eng tesseract-ocr-script-latn

In [None]:
!apt-get install -y libomp-dev

In [None]:
!apt-get --fix-missing install

In [None]:
!apt-get --fix-broken install

In [None]:
!apt-get install -y poppler-utils

In [9]:
!apt-get clean

## 1.2 Install Python Requirements

In [None]:
!rm -f requirements.txt

In [None]:
!curl -fsSL -H "Cache-Control: no-cache, no-store" "https://raw.githubusercontent.com/ckabelin/daai/refs/heads/main/requirements-slim.txt" -o requirements.txt

In [None]:
!pip install --upgrade --quiet pip

In [None]:
!pip install --upgrade --quiet -r requirements.txt

# 2. Library Imports

## 2.1 Base Google Dependencies

In [14]:
import os
import sys
import json

import base64
import requests

from google.colab import userdata
from google.colab import files

import datasets
from pathlib import Path

# 3. Define AI Framework

## 3.1 Define basic env variables

In [15]:
os.environ["OPENAI_API_KEY"] = userdata.get('OAI_TOKEN')

In [16]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')

In [17]:
# One cache directory shared by all Hugging Face libraries used in this notebook.
# Alternative locations: "~/.cache/.huggingface/" or "./drive/MyDrive/huggingface/cache/".
os.environ['HF_HOME'] = "/root/.cache/huggingface/hub/"

# Point the hub, datasets and transformers caches at that same directory.
os.environ.update(dict.fromkeys(
    ('HUGGINGFACE_HUB_CACHE', 'HF_DATASETS_CACHE', 'TRANSFORMERS_CACHE'),
    os.environ['HF_HOME'],
))

In [18]:
# Override the datasets library's cache locations on the already-imported module.
# NOTE(review): presumably needed because `datasets` was imported before the
# environment variables above were set, so it did not pick them up - confirm.
datasets.config.DOWNLOADED_DATASETS_PATH = Path(os.environ['HF_HOME'])
datasets.config.HF_DATASETS_CACHE = os.environ['HF_HOME']

## 3.2 Define Model and Parameters

In [19]:
model_llm: str = "meta-llama/Llama-3.2-3B-Instruct"

In [20]:
model_llm: str = "meta-llama/Llama-3.2-1B-Instruct"

In [21]:
model_embeddings: str = "sentence-transformers/all-MiniLM-L6-v2"

In [22]:
model_embeddings: str = "sentence-transformers/all-mpnet-base-v2"

In [23]:
# Generation parameters shared by the LLM factory functions and the retrieval chain.
max_tokens_limit: int = 20000  # handed to ConversationalRetrievalChain as max_tokens_limit
temperature: float = 1e-10 # near zero; alternative value: 0.1. NOTE(review): presumably not exactly 0 because some backends reject it - confirm
top_p: float = 0.9  # only passed to the OpenAI chat model

## 3.3 Define global ai variables

In [24]:
# Shared notebook state; every handle stays None until the cell that creates it has run.
llm = None  # language model, set by the llm_* factory functions
embeddings = None  # embedding model, set by the embeddings_* factory functions
db = None  # FAISS vector store built from the split documents
retriever = None  # retriever obtained from db

chat_history = None  # message history that backs the conversation memory
memory = None  # ConversationBufferMemory instance
llm_chain = None  # ConversationalRetrievalChain invoked by ask()

# 4. Initialize AI

## 4.1. Common ai & langchain imports

### 4.1.1 Pydantic allows for object-oriented semantic prompting into data structures (objects)

In [24]:
from pydantic import BaseModel, Field, validator

### 4.1.2 Import langchain dependencies

In [28]:
from langchain import PromptTemplate, LLMChain
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.prompts.base import BasePromptTemplate

from langchain.schema import HumanMessage, SystemMessage, AIMessage

from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

from langchain.memory import ConversationBufferMemory
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import PydanticOutputParser

### 4.1.3 Hugging Face imports

In [29]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface import HuggingFaceEndpointEmbeddings

from langchain import HuggingFaceHub
from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings

from langchain.document_loaders import HuggingFaceDatasetLoader

from huggingface_hub import login, whoami
from huggingface_hub import snapshot_download

from langchain_huggingface.llms import HuggingFacePipeline

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering



### 4.1.4 OpenAI Imports

In [30]:
import openai

# from langchain.llms import OpenAI
# from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAI, ChatOpenAI
from langchain_openai import OpenAIEmbeddings

### 4.1.5 Liquid AI Imports

In [31]:
import liquidai

## 4.2 Initialize LLM and Embeddings (Hugging Face)

In [None]:
whoami(token=os.environ['HUGGINGFACEHUB_API_TOKEN'])

In [32]:
login(token=os.environ['HUGGINGFACEHUB_API_TOKEN'])

### 4.2.1 LLM

#### 4.2.1.1 Online LLM

In [17]:
def llm_huggingface_online():
  """Set the global ``llm`` to a chat model served by a Hugging Face endpoint.

  The endpoint is created for ``model_llm`` with the global ``temperature``
  and then wrapped in ``ChatHuggingFace``.
  """
  global llm

  # max_new_tokens is intentionally not set (e.g. 512 could be added here).
  endpoint_options = dict(
      repo_id=model_llm,
      task="text-generation",
      do_sample=False,
      repetition_penalty=1.03,
      temperature=temperature,
  )
  llm = ChatHuggingFace(llm=HuggingFaceEndpoint(**endpoint_options))

#### 4.2.1.2 Offline LLM

In [18]:
def llm_huggingface_offline():
  """Set the global ``llm`` to a locally executed Hugging Face pipeline.

  Downloads the snapshot of ``model_llm``, builds a text-generation
  pipeline from it and reuses the end-of-sequence token as padding token.
  """
  global llm

  # Download the model repository snapshot before building the pipeline.
  snapshot_download(repo_id=model_llm)

  # Further options such as "max_new_tokens" or "top_k" could be added here.
  generation_options = {
      "token": os.environ['HUGGINGFACEHUB_API_TOKEN'],
      "temperature": temperature,
  }
  llm = HuggingFacePipeline.from_model_id(
      model_id=model_llm,
      task="text-generation",
      pipeline_kwargs=generation_options,
  )

  # Use the end-of-sequence token id as the padding token id.
  tokenizer = llm.pipeline.tokenizer
  tokenizer.pad_token_id = tokenizer.eos_token_id

### 4.2.2 Embeddings

#### 4.2.2.1 Online Embeddings

In [19]:
def embeddings_huggingface_online():
  """Set the global ``embeddings`` to a Hugging Face endpoint embedding model.

  Uses ``model_embeddings`` with the "feature-extraction" task.
  """
  global embeddings

  endpoint_options = {
      "model": model_embeddings,
      "task": "feature-extraction",
  }
  embeddings = HuggingFaceEndpointEmbeddings(**endpoint_options)

#### 4.2.2.2 Offline Embeddings

In [20]:
def embeddings_huggingface_offline():
  """Set the global ``embeddings`` to a locally loaded embedding model.

  Downloads the snapshot of ``model_embeddings`` and then instantiates
  ``HuggingFaceEmbeddings`` for that model name.
  """
  global embeddings

  # Download the model repository snapshot first.
  snapshot_download(repo_id=model_embeddings)
  embeddings = HuggingFaceEmbeddings(model_name=model_embeddings)

## 4.3 Initialize LLM and Embeddings (Open AI)

In [33]:
def llm_openai():
  """Set the global ``llm`` to the OpenAI chat model "gpt-4o".

  The API key is read from the ``OPENAI_API_KEY`` environment variable;
  ``temperature`` and ``top_p`` come from the notebook globals.
  """
  global llm

  openai_key = os.environ.get("OPENAI_API_KEY")
  llm = ChatOpenAI(
      model_name="gpt-4o",
      api_key=openai_key,
      top_p=top_p,
      temperature=temperature,
  )

In [34]:
def embeddings_openai():
  """Set the global ``embeddings`` to OpenAI's "text-embedding-3-large".

  The API key is read from the ``OPENAI_API_KEY`` environment variable.
  """
  global embeddings

  openai_key = os.environ.get("OPENAI_API_KEY")
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=openai_key)

## 4.4 Initialize LLM and Embedding Instances

### 4.4.1 Initialize Huggingface offline

In [None]:
llm_huggingface_offline()
embeddings_huggingface_offline()

### 4.4.2 Initialize Huggingface online

In [33]:
llm_huggingface_online()
embeddings_huggingface_online()

### 4.4.3 Initialize OpenAI

In [35]:
llm_openai()
embeddings_openai()

# 5. Initialize Vector DB (in Memory via FAISS)

In [36]:
docs = []
loader = None

## 5.1 Get Data

### 5.1.1 Get data from url
e. g. https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:32024R1689

In [53]:
url = "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=OJ:L_202401689"

In [49]:
url = "https://www.bund.net/fileadmin/user_upload_bund/publikationen/nachhaltigkeit/nachhaltigkeit_gutes_leben_sdgs.pdf"

In [51]:
url = "https://www.bundesregierung.de/resource/blob/975274/1873516/6c607bb5f16993ef18440d9e0dae55cb/2021-03-10-dns-2021-finale-langfassung-barrierefrei-data.pdf?download=1"

In [54]:
loader = OnlinePDFLoader(url)

### 5.1.2 Get data from files

In [37]:
base_path = "./drive/MyDrive/sample_data/"

In [None]:
path = base_path + "nachhaltigkeit"

In [38]:
path = base_path + "euaiact"

In [39]:
loader = PyPDFDirectoryLoader(path)

## 5.2 Prepare text content

In [40]:
chunk_size = 1000
chunk_overlap = 30

In [41]:
# Load all documents from the configured loader ...
documents = loader.load()

# ... and cut them at newlines into chunks for the vector store.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
docs = text_splitter.split_documents(documents)

print(f"Docs loaded: {len(docs)}")

Docs loaded: 792


## 5.3 Create embeddings and build vector db

In [42]:
db = FAISS.from_documents(docs, embeddings)

In [43]:
retriever = db.as_retriever()

# 6. Initialize miscellaneous basics (e.g. the question set)

In [44]:
questions = []

In [45]:
def add(question: str):
  """Append *question* to the global ``questions`` list.

  Args:
    question: The question text to queue for the chain.
  """
  global questions
  questions += [question]

# 7. Define your questions

In [46]:
questions = []

In [47]:
add("Verfasse eine Zusammenfassung in 500 Worten oder weniger.")

In [48]:
add("Fasse die Bedeutsamsten Punkte in maximal 20 Stichpunkten mit kurzer Erklärung zusammen.")

In [49]:
add("Fasse §66 in weniger als 100 Worten zusammen.")

In [50]:
add("Erstelle einen ausführlichen Vorschlag in Stichpunkten, wie sich eine KI nach den Kriterien des EU AI Acts gründlich bewerten und einschätzen lässt.")

# 8. Execute

## 8.1 Initialize Chat History

In [51]:
chat_history = InMemoryChatMessageHistory()

## 8.2 Initialize Chat Memory

In [None]:
# Conversation memory stored in the chat_history object created above.
# The history is exposed to the chain under the key 'chat_history', and
# 'answer' is the chain output recorded as the AI turn.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer',
    chat_memory=chat_history
)

## 8.3 Define Priming Prompt

In [53]:
priming: str = """You are an top-tier lawyer specializing in digital laws and especially ai.
You analyze and interpret legal text with the highest proficiency and explain them in simple and easy to understand terms.
Always respond in german."""

## 8.4 Define Messages (system and human)

In [54]:
# System message: the priming text followed by the {context} placeholder.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    priming + " The context is:\n{context}"
)
# Human message: just the {question} placeholder.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "{question}"
)

## 8.5 Initialize Conversation Chain




In [55]:
# Wire LLM, retriever and memory into one conversational retrieval chain.
# The combine-docs step is prompted with the system + human message templates
# defined above; source documents are returned alongside the answer.
llm_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    verbose=False,
    combine_docs_chain_kwargs={
        "prompt": ChatPromptTemplate.from_messages([
            system_message_prompt,
            human_message_prompt,
        ]),
    },
    return_source_documents=True,
    max_tokens_limit=max_tokens_limit
)

## 8.6 All-in-one to accommodate Hugging Face restrictions

### 8.6.1 Define ask function with reinit for each ask (saves memory)

In [30]:
def ask(question: str) -> str:
  """Answer *question* with a freshly built chain (no memory across calls).

  Every call replaces the global ``chat_history``, ``memory`` and
  ``llm_chain`` with new instances, so earlier questions are not part of
  the conversation.

  Args:
    question: The question to send to the retrieval chain.

  Returns:
    The answer text produced by the chain.
  """
  global llm_chain
  global chat_history
  global memory

  chat_history = InMemoryChatMessageHistory()

  memory = ConversationBufferMemory(
      memory_key='chat_history',
      return_messages=True,
      output_key='answer',
      chat_memory=chat_history
  )

  llm_chain = ConversationalRetrievalChain.from_llm(
      llm=llm,
      retriever=retriever,
      memory=memory,
      verbose=False,
      combine_docs_chain_kwargs={
          "prompt": ChatPromptTemplate.from_messages([
              system_message_prompt,
              human_message_prompt,
          ]),
      },
      return_source_documents=True
  )

  # Read the answer from the chain's own output key. The global chat_history
  # stays bound to the history object instead of being overwritten with a
  # plain message list, so it remains usable as chat_memory.
  response = llm_chain.invoke(question)
  return response["answer"]

### 8.6.2 Define ask function keeping session memory

In [56]:
def ask(question: str) -> str:
  """Answer *question* with the existing global chain, keeping session memory.

  The chain's memory records each exchange, so follow-up questions can
  refer to earlier ones.

  Args:
    question: The question to send to the retrieval chain.

  Returns:
    The answer text produced by the chain.
  """
  # Read the answer from the chain's own output key. The global chat_history
  # is deliberately not reassigned: it must stay the history object that the
  # memory was built with, not a plain list of messages.
  response = llm_chain.invoke(question)
  return response["answer"]

## 8.7 Shoot questions

In [None]:
# Send every prepared question through ask() and print the exchange.
for question in questions:
  answer = ask(question)
  print(f"Question: {question}", "Answer: ", answer, "", sep="\n")

## 8.8 Interactive prompt

In [None]:
# Interactive loop: keep answering questions until the user types "exit".
while True:
  # input() already returns a string; surrounding whitespace is dropped so
  # that " exit " still ends the loop.
  question: str = input("Enter your Question: ").strip()
  if question == "exit":
    break
  if not question:
    # Nothing typed - ask again instead of sending an empty query to the chain.
    continue
  answer = ask(question)
  print("Question: " + question)
  print("Answer: ")
  print(answer)
  print("")

# 9. Advanced Concepts

## 9.1 Working with data structures

### 9.1.1 Define Base classes

In [56]:
from typing import Deque, List, Optional, Tuple

In [57]:
# One article of the document; element type of Abschnitt.artikel.
class Artikel(BaseModel):
  artikel: int = Field(description="Nummer des Artikels")  # article number
  headline: str = Field(description="Titel des Artikels")  # article title
  summary: str = Field(description="Zusammenfassung in weniger als 50 Worten")  # summary in fewer than 50 words

# One section of a chapter, holding its articles; element type of Kapitel.abschnitte.
class Abschnitt(BaseModel):
  abschnitt: int = Field(description="Nummer des Abschnitts")  # section number
  headline: str = Field(description="Titel des Abschnitts")  # section title
  summary: str = Field(description="Zusammenfassung in weniger als 50 Worten")  # summary in fewer than 50 words
  artikel: List[Artikel] = Field(description="Alle Artikel des Abschnitts")  # all articles of the section

# One chapter of the document, holding its sections; element type of AutoSummaryResult.chapters.
class Kapitel(BaseModel):
  kapitel: int = Field(description="Nummer des Kapitels")  # chapter number
  headline: str = Field(description="Titel des Kapitels")  # chapter title
  summary: str = Field(description="Zusammenfassung in weniger als 50 Worten")  # summary in fewer than 50 words
  abschnitte: List[Abschnitt] = Field(description="Alle Abschnitte des Kapitels")  # all sections of the chapter

# Root model of the structured summary; this is the type handed to PydanticOutputParser.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably end up in the schema that is sent to the model - confirm.
class AutoSummaryResult(BaseModel):
  name: str = Field(description="Titel des Dokuments")  # document title
  author: str = Field(description="Author des Dokuments")  # document author
  chapters: List[Kapitel] = Field(description="Liste aller Kapitel")  # list of all chapters


### 9.1.2 Ask the AI

In [58]:
# Build the parser for the data model
parser = PydanticOutputParser(pydantic_object=AutoSummaryResult)

# Define the prompt template: priming text, the parser's format instructions,
# the retrieved document text ({summaries}) and the question.
template_text = priming + """

    {format_instructions}

    SUMMARY: {summaries}

    QUESTION: {question}"""

summary_query = "Gebe ausführliche und korrekte Informationen zu dem Text"

# Every placeholder of the template that is not pre-filled must be declared:
# the "stuff" chain fills {summaries} with the retrieved documents, the caller
# supplies {question}; {format_instructions} is pre-filled from the parser.
prompt = PromptTemplate(
    template=template_text,
    input_variables=["summaries", "question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Build the retrieval chain (shares the conversation memory with the chat chain)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,
    chain_type_kwargs={
        "prompt": prompt,
    },
)

In [None]:
as_result = ""
try:
  # ask the question and process the result
  summary = qa.invoke({"question": summary_query})
  res = summary["answer"]
  if res.startswith("```json"):
      res = res[7:-3]
  # res = json.dumps(res, indent = 4)
  as_result = str(res)
  print(as_result)
except Exception as ex:
  error = str(ex)
  print(error)

## 9.2 Working with data sets

In [None]:
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance (this rebinds the global `loader` used for the PDFs above)
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

print("Datasets loaded: " + str(len(data)))

# Display the first 2 entries
data[:2]