In [21]:
import os
import boto3
import logging
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline   
from langchain_community.document_loaders import TextLoader
from dotenv import load_dotenv
from huggingface_hub import HfApi
load_dotenv()

True

In [22]:

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HF_TOKEN = os.getenv("HUGGING_FACE_TOKEN")


DATA_PATH = ("../data")

print("Current Working Directory:", DATA_PATH)

Current Working Directory: ../data


In [23]:
import os
import boto3

def read_file_content(source, path):
    """
    Reads the content of a file based on its source (local or S3).

    Args:
        source (str): "local" or "s3".
        path (str): File path (for local) or bucket/key (for S3).
    
    Returns:
        str: The content of the file.
    """
    if source == "local":
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    elif source == "s3":
        bucket, key = path.split("/", 1)  # Format: "bucket/key"
        s3 = boto3.client("s3")
        response = s3.get_object(Bucket=bucket, Key=key)
        return response["Body"].read().decode("utf-8")
    else:
        raise ValueError("Invalid source. Use 'local' or 's3'.")


def load_files_to_dict(base_path, source="local", prefix=""):
    """
    Loads text files from a source into a dictionary.

    Args:
        source (str): "local" or "s3".
        base_path (str): Local folder path or S3 bucket name.
        prefix (str): Prefix for filtering files in S3 (ignored for local).
    
    Returns:
        dict: A dictionary with filenames as keys and file contents as values.
    """
    files_dict = {}

    if source == "local":
        # Process local files
        for filename in os.listdir(base_path):
            file_path = os.path.join(base_path, filename)
            if os.path.isfile(file_path) and filename.endswith(".txt"):
                files_dict[filename] = read_file_content(source, file_path)
                
    elif source == "s3":
        # Process S3 files
        s3 = boto3.client("s3")
        response = s3.list_objects_v2(Bucket=base_path, Prefix=prefix)
        for obj in response.get("Contents", []):
            key = obj["Key"]
            if key.endswith(".txt"):
                files_dict[key] = read_file_content(source, f"{base_path}/{key}")
    else:
        raise ValueError("Invalid source. Use 'local' or 's3'.")

    return files_dict

In [24]:
"""def load_documents():
    try:
        logger.info(f"Loading documents from {DATA_PATH}")
        print("Current Working Directory:", DATA_PATH)
        if not os.path.exists(DATA_PATH):
            logger.warning(f"Data path {DATA_PATH} does not exist.")
            return []
        documents = []
        for filename in os.listdir(DATA_PATH):
            print("Current Working Directory:", (os.getcwd()+"\data"))
            if filename.endswith(".txt"):
                loader = TextLoader(os.path.join(DATA_PATH, filename))
                documents.extend(loader.load())
        logger.info(f"Loaded {len(documents)} documents.")
        return documents
    except Exception as e:
        logger.error(f"Error loading documents: {e}")
        return []"""



In [25]:
documents = load_files_to_dict(DATA_PATH)

In [26]:
print(documents)

{'data_alex_mozerski.txt': "<|startoftext|>[INST] What is Alex Mozerski's primary focus in his career as a data scientist? [/INST] Alex Mozerski is a data scientist with expertise in machine learning, predictive modeling, and data visualization. He specializes in analyzing complex datasets to extract meaningful insights and supports decision-making through data-driven solutions. <|endoftext|>\n<|startoftext|>[INST] How does Alex Mozerski leverage data science to solve problems? [/INST] Alex Mozerski is a data scientist with expertise in machine learning, predictive modeling, and data visualization. He specializes in analyzing complex datasets to extract meaningful insights and supports decision-making through data-driven solutions. <|endoftext|>\n<|startoftext|>[INST] What industries has Alex Mozerski worked in as a data scientist? [/INST] Alex Mozerski is a data scientist with expertise in machine learning, predictive modeling, and data visualization. He specializes in analyzing compl

In [27]:
# Step 2: Initialize the embedding model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Small and efficient model for embeddings

# Step 3: Create FAISS vector store
vector_store = FAISS.from_texts(documents, embeddings)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [28]:
from transformers import pipeline
import torch
from accelerate import disk_offload


model_id = "meta-llama/Llama-3.2-1B"

disk_offload(model=model_id, offload_dir="offload")


pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto",
    token=HF_TOKEN,
    low_cpu_mem_usage = True
)

pipe("Who is alex ")

AttributeError: 'str' object has no attribute 'state_dict'

In [None]:
from transformers import pipeline
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


# Créer le pipeline sans charger explicitement le tokenizer
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_dir=os.path.join(os.getcwd(), "offload"),
    token=HF_TOKEN
)


# Initialiser le modèle d'embedding
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Créer le vector store FAISS
vector_store = FAISS.from_texts(documents, embeddings)

# Effectuer une recherche de similarité
query = "Who is Alex?"
docs = vector_store.similarity_search(query, k=5)
context = " ".join([doc.page_content for doc in docs])

# Préparer le prompt avec le contexte récupéré
prompt = f"{context}\n\nQuestion: {query}\nAnswer:"

# Générer la réponse en utilisant le pipeline
response = pipe(prompt, max_length=512, do_sample=False)
print(response[0]['generated_text'])


In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer


# Load the model with offloading
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",               # Enable automatic device mapping
    offload_folder="./offload_dir",  # Directory for offloading to disk
    torch_dtype="float16",           # Use mixed precision to save memory
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create the pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype="float16",  # Ensure consistent dtype
)

# Test the pipeline
response = pipe("Hello, world!", max_length=50)
print(response[0]["generated_text"])


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Step 1: Load documents
documents = load_files_to_dict(DATA_PATH) # Dictionary with filenames as keys and file contents as values (3 items)

# Step 2: Initialize embedding model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Create FAISS vector store
vector_store = FAISS.from_texts(documents, embeddings)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    offload_folder="./offload_dir",
    torch_dtype="float16",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype="float16",
)

# Step 5: Query and retrieve context
query = "What is RAG?"
docs = vector_store.similarity_search(query, k=2)  # Retrieve top 2 documents
context = " ".join([doc.page_content for doc in docs])

# Step 6: Generate answer
prompt = f"{context}\n\nQuestion: {query}\nAnswer:"
response = pipe(prompt, max_length=512, do_sample=False)
print(response[0]["generated_text"])

In [31]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Step 1: Load documents
texts = [content for content in documents.values()]  # Extract text content

model_id = "meta-llama/Llama-3.2-1B"


model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id


# Step 2: Initialize embedding model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Create FAISS vector store
vector_store = FAISS.from_texts(texts, embeddings)

# Step 4: Set up MMR-based retriever
retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 2, "lambda_mult": 0.5}  # Adjust `k` and `lambda_mult` as needed
)

# Step 5: Load model and tokenizer for generation
"""model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    offload_folder="./offload_dir",
    torch_dtype="float16",

)"""
model = AutoModelForCausalLM.from_pretrained(model_id)

tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype="float16",
)
print("Model loaded")

# Step 6: Query and retrieve documents
query = "What is Alex Mozerski's hobbies?"
docs = retriever.invoke(query)
context = " ".join([doc.page_content for doc in docs])  # Combine document content
print("retrieved documents")


# Step 7: Generate answer
prompt = f"""
Context:
{context}

Question:
{query}

Answer:
"""

response = pipe(prompt, max_new_tokens=80, do_sample=True, temperature=0.7)

print("Answer:LOADED")

generated_text = response[0]["generated_text"]
answer = generated_text.split("Answer:")[-1].strip()
print(answer)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Model loaded


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


retrieved documents
Answer:LOADED
Alex Mozerski's favorite movie is "The Godfather" because it's a classic and a timeless story about loyalty and family.

Question:
What is Alex Mozers
