In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls

drive  sample_data  vintage_ai


In [4]:
# !git clone https://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@github.com/e-candeloro/vintage_ai.git

In [3]:
%cd drive/MyDrive/Colab Notebooks/vintage_ai

/content/drive/MyDrive/Colab Notebooks/vintage_ai


In [6]:
# !git checkout -b michelle_branch


fatal: A branch named 'michelle_branch' already exists.


In [None]:
!git status

In [None]:
# !pip install uv

In [5]:
!ls

documents  notebooks  pyproject.toml  README.md  src  uv.lock


In [10]:
# !git add notebooks/RAG_example.ipynb
# !git commit -m "Add RAG"
# !git push origin michelle_branch


In [None]:
# packages for RAG

!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets ragatouille
!pip install langchain_community
!pip install -U bitsandbytes
!pip install pacmap
!pip install langchain_huggingface
!pip install langgraph

In [7]:
import getpass
import os
from langchain.chat_models import init_chat_model

# SET THE ENVIRNMONENTS for TRAKING on LANGSMITH

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "RAG_example"
print(os.environ.get("LANGSMITH_PROJECT"))

if "LANGSMITH_API_KEY" not in os.environ:
    os.environ["LANGSMITH_API_KEY"] = "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"



RAG_example


# INIZIALIZZAZIONE DI TUTTI GLI STEP CHE MI SERVONO:
  - LLM usato
  - Embedding model per i documenti
  - Database vettoriale con ricerca FAISS di similarità


## LLm

In [None]:
# INIT THE LLM MODEL using hugging face
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

model_name = "HuggingFaceH4/zephyr-7b-beta"

# to be more fast
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)



## Template del prompt modificabile


In [14]:
# prompt that can be changed on the basis of the task

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template='\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n{context}\n\n</s>\n<|user|>\n{question}\n</s>\n<|assistant|>\n\n '


## Embeddings

In [None]:
# INIT EMBEDDING MODEL
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

## VECTOR DB

In [33]:
# INIT VECTOR STORE, I chose to use FAISS

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain import hub
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import re
from bs4 import BeautifulSoup
import bs4
from langchain_community.document_loaders import RecursiveUrlLoader

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml",
                        parse_only=bs4.SoupStrainer(class_ = ("discussionListItems")))

    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

loader = RecursiveUrlLoader(
    "https://www.ferrarichat.com/forum/forums/cars.303/",
    max_depth = 1,
    prevent_outside=True,
    extractor=bs4_extractor,
)
print(loader.load()[0].page_content[:200])

# questo sarebbe un post con le sue metriche

For Sale
Attn Collectors!  The finest, most original and most expensive Testarossa anywhere.

AMG USA,
					
Mar 3, 2025 at 1:54 PM

Replies: 16
Views: 4,707

Motob

Jun 4, 2025 at 8:49 PM

For Sale
1


In [21]:
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

vector_store = FAISS.from_documents(all_splits, embeddings)


# LANGCHAIN GRAPH: per tenere traccia di tutta la pipeline
 - Retrieve: Similarity search
 - Generate: Chiamata all'LLM con documenti trovati nel vector db e domanda
 - Effettiva chiamata alla pipeline

In [49]:



# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [50]:
response = graph.invoke({"question": "What are the most popular themes in the classic car comunity in the last days?"})
print(response["answer"])




<|system|>
Answer the question based on your knowledge. Use the following context to help:

AMG USA,
					
Apr 21, 2025 at 3:31 PM

Replies: 6
Views: 1,649

bertrand328

Apr 22, 2025 at 2:14 PM

For Sale
YC SUPERCARS LAUNCHES A NEW CONSULTING SERVICE FOR BUYERS!!

Yellow Compass,
					
Aug 30, 2023 at 10:25 PM

...
2
3
4

Replies: 84
Views: 23,182

Yellow Compass

Apr 22, 2025 at 12:56 PM

Sold
ICE ICE BABY!! LIKE VANILLA-2011 458 COUPE WHITE/TAN 10,000 miles!!

Yellow Compass,
					
Feb 9, 2025 at 2:32 PM

Replies: 12
Views: 4,583

Yellow Compass

Apr 19, 2025 at 8:47 AM

For Sale
1998 Ferrari F355 - 6 speed manual + Canna Di Fucile

SnackGuy,
					
Oct 8, 2024 at 11:41 AM

Replies: 8
Views: 5,664

Joshman0531

Apr 18, 2025 at 7:48 PM

Taken Off Market
2013 458 SPIDER WHITE/BlACK 22,777 mi.

Yellow Compass,
					
Feb 11, 2025 at 11:23 PM

Replies: 10
Views: 4,096

Yellow Compass

Apr 17, 2025 at 5:24 PM

For Sale
1989 Ferrari Testarossa - White/Tan, Cavallino Platinum

GLENN@TEAM AI,

In [28]:
!git config --global user.email "paganimichelle0499@gmail.com"
!git config --global user.name "michelle2399"

In [31]:
!git add notebooks/RAG_example.ipynb
!git commit -m "Add RAG riprova"
!git push origin michelle_branch


On branch michelle_branch
nothing to commit, working tree clean
Everything up-to-date
