# Introduction

In [None]:
pip install openai langchain langchain-community pinecone pinecone-client python-dotenv langchain-pinecone tiktoken sentence_transformers chromadb grpc-gateway-protoc-gen-openapiv2

Collecting openai
  Downloading openai-1.51.0-py3-none-any.whl.metadata (24 kB)
Collecting langchain
  Downloading langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting pinecone
  Downloading pinecone-5.3.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting chromadb
  Downloading chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting grpc-gateway-protoc-

In [None]:
import openai
import langchain
import os
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import logging

from dotenv import load_dotenv, find_dotenv
# Load environment variables (e.g. API keys) from a local .env file.
_ = load_dotenv('.env')

# Send log records both to a file and to the notebook output.
# NOTE(review): DEBUG level also surfaces HTTP/response-body logs from third-party
# libraries (see later cell outputs); '/content/...' looks like a Colab path — confirm.
logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] - %(message)s ',
                    handlers=[
                        logging.FileHandler('/content/langchaindemo.log', mode='w'),  # mode='w' truncates the log on each run
                        logging.StreamHandler(),
                    ],
                    force=True)  # replace any handlers already attached to the root logger
logger = logging.getLogger(__name__)
logger.info("Langchain Demo Initialized")

[INFO] - Langchain Demo Initialized 


# Data load

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Clean the raw text scraped from a Wikipedia article
def clean_text(text):
    """Normalize text extracted from a (Spanish) Wikipedia page.

    Steps, in order:
      1. drop citation markers such as ``[1]``;
      2. drop invisible zero-width characters (e.g. U+200B) and literal
         ``\\uXXXX`` escape sequences;
      3. drop Wikipedia edit markers (``[editar]``, ``[editar datos en Wikidata]``);
      4. re-insert the space missing after ``,``, ``.`` or ``;`` between letters;
      5. split words glued at a lowercase -> uppercase boundary;
      6. collapse every whitespace run into a single space.

    :param text: raw text to clean
    :return: cleaned text with no leading or trailing whitespace
    """
    # Citation markers: [1], [2], ...
    text = re.sub(r'\[\d+\]', '', text)

    # Real zero-width characters; the escaped form alone never matched them.
    text = re.sub(r'[\u200b-\u200d\u2060\ufeff]', '', text)
    # Literal backslash-u sequences, kept for text that arrives already escaped.
    text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)

    # Page-specific noise: signature caption and every "[editar ...]" marker.
    text = re.sub(r'Firma\[editar datos en Wikidata\]', '', text)
    text = re.sub(r'\[editar[^\]]*\]', '', text)

    # Add the missing space after punctuation that sits between two letters.
    # Lookarounds keep matches from overlapping ("a,b,c" -> "a, b, c"), and
    # [^\W\d_] matches any Unicode letter, including accented ones.
    text = re.sub(r'(?<=[^\W\d_])([,.;])(?=[^\W\d_])', r'\1 ', text)
    # Split glued words only at a lowercase -> uppercase boundary so that
    # acronyms such as "CNN" or "CEO" are left intact.
    text = re.sub(r'(?<=[a-záéíóúüñ])(?=[A-ZÁÉÍÓÚÜÑ])', ' ', text)

    # Collapse runs of spaces and line breaks into a single space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def get_wiki_content(page, timeout=30):
  """Download a Spanish Wikipedia article and return its cleaned body text.

  :param page: article title as it appears in the URL (e.g. ``"Elon_Musk"``)
  :param timeout: seconds to wait for the HTTP response before giving up
  :return: cleaned article text, or ``None`` when the page cannot be fetched
           or has no ``mw-parser-output`` container
  """
  url = "https://es.wikipedia.org/wiki/" + page
  try:
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
  except requests.RequestException as exc:
    print(f"Error fetching the page: {exc}")
    return None

  if response.status_code != 200:
    print(f"Error fetching the page: {response.status_code}")
    return None

  soup = BeautifulSoup(response.text, 'html.parser')
  body_content = soup.find('div', class_='mw-parser-output')
  if body_content is None:
    # find() returns None when the expected article container is absent.
    print(f"Error parsing the page: no article body found for '{page}'")
    return None

  content = body_content.get_text(strip=True)

  return clean_text(content)

# docs = get_wiki_content("Elon_Musk")

In [None]:
import hashlib

def calcular_hash_sha256(cadena):
    """Return the SHA-256 digest of ``cadena`` as a hexadecimal string.

    The string is encoded as UTF-8 before hashing.
    """
    return hashlib.sha256(cadena.encode('utf-8')).hexdigest()

# Pre-processing

In [None]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.chat_models import ChatOpenAI
from langchain_community.llms import OpenAI, HuggingFaceHub
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
import pandas as pd
from langchain_core.documents import Document

def split_text(text):
    """
    Split a long text into overlapping chunks. A whole document may be too long
    for most models, and even when it fits the model can struggle to find the
    relevant context, so the text is chunked first.
    :param text: text to be split
    :return: chunks produced by the splitter
    """
    # Recommended splitter for generic text: 2000-character chunks, 200 of overlap.
    return RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        add_start_index=True,
    ).split_text(text)

# chunks = split_text(docs)
# print(chunks)

def get_docs(persons):
  """Build Wikipedia-backed documents for each person.

  :param persons: iterable of dicts with at least the keys ``'name'`` and ``'wiki_page'``
  :return: list of ``Document`` chunks tagged with ``source='wikipedia'`` and the person's name
  """
  collected = []

  for person in persons:
    article = get_wiki_content(person['wiki_page'])
    print(article)
    if not article:
      # Nothing usable was fetched for this person; move on to the next one.
      continue

    person_docs = []
    for chunk in split_text(article):
      person_docs.append(
          Document(page_content=chunk, metadata={'source': 'wikipedia', 'person': person['name']})
      )
    collected.extend(person_docs)

  return collected

# Load Trump's 5000 most-favorited tweets
def load_trump_tweets(file_path):
    """Read a tweets CSV and turn the 5000 most-favorited rows into documents.

    The CSV is read for the columns ``content``, ``favorites``, ``mentions``
    and ``hashtags``. Mentions and hashtags, when present, are appended to the
    tweet text on their own lines.

    :param file_path: path to the CSV file
    :return: list of ``Document`` objects tagged ``source='twitter'``, ``person='Donald Trump'``
    """
    top_tweets = (
        pd.read_csv(file_path)
        .sort_values(by='favorites', ascending=False)
        .head(5000)
    )

    # Optional columns appended to the tweet body, with the label used for each.
    extras = (('mentions', 'Mentions'), ('hashtags', 'Hashtags'))

    documents = []
    for _, tweet in top_tweets.iterrows():
        body = tweet['content']
        for column, label in extras:
            if pd.notna(tweet[column]):
                body += f"\n{label}: {tweet[column]}"

        documents.append(
            Document(
                page_content=body,
                metadata={'source': 'twitter', 'person': 'Donald Trump'},
            )
        )

    return documents




# Load Elon Musk's 5000 most-favorited tweets
def load_elon_tweets(file_path):
    """Read a tweets CSV and turn the 5000 most-favorited rows into documents.

    The CSV is read for the columns ``text``, ``favorites`` and ``hashtags``.
    Hashtags, when present, are appended to the tweet text on their own line.

    :param file_path: path to the CSV file
    :return: list of ``Document`` objects tagged ``source='twitter'``, ``person='Elon Musk'``
    """
    top_tweets = (
        pd.read_csv(file_path)
        .sort_values(by='favorites', ascending=False)
        .head(5000)
    )

    documents = []
    for _, tweet in top_tweets.iterrows():
        body = tweet['text']
        hashtags = tweet['hashtags']
        if pd.notna(hashtags):
            body += f"\nHashtags: {hashtags}"

        documents.append(
            Document(
                page_content=body,
                metadata={'source': 'twitter', 'person': 'Elon Musk'},
            )
        )

    return documents



In [None]:
## ELON - TRUMP
# Fetch the Wikipedia documents for each person.
# NOTE(review): 'twitter_handle' is not read by get_docs — confirm whether it is still needed.
persons = [
    {'name': 'Elon Musk', 'wiki_page': 'Elon_Musk', 'twitter_handle': 'elonmusk'},
    {'name': 'Donald Trump', 'wiki_page': 'Donald_Trump'}
]
docs = get_docs(persons)
print(docs)

# Load the tweet documents from the local CSV files
elon_tweets_docs = load_elon_tweets('elon_musk_tweets.csv')
trump_tweets_docs = load_trump_tweets('realdonaldtrump.csv')


# Append the tweet documents to the Wikipedia documents (mutates `docs` in place)
docs.extend(elon_tweets_docs)
docs.extend(trump_tweets_docs)

print(docs[1])

In [None]:
## MALE PICHOT - FEINMANN

# Second corpus: Wikipedia documents for Malena Pichot and Eduardo Feinmann.
# This rebinds `persons`, replacing the list defined for the Elon/Trump corpus.
persons = [
    {'name': 'Malena Pichot', 'wiki_page': 'Malena_Pichot'},
    {'name': 'Eduardo Feinmann', 'wiki_page': 'Eduardo_Feinmann'}
]
docs2 = get_docs(persons)
print(docs2)



[DEBUG] - Starting new HTTPS connection (1): es.wikipedia.org:443 
[DEBUG] - https://es.wikipedia.org:443 "GET /wiki/Malena_Pichot HTTP/11" 200 28556 
[DEBUG] - Starting new HTTPS connection (1): es.wikipedia.org:443 
[DEBUG] - https://es.wikipedia.org:443 "GET /wiki/Eduardo_Feinmann HTTP/11" 200 33038 


Malena Pichot Malena Pichot en 2012Información personal Nombre de nacimiento Malena Pichot​Nacimiento6 de julio de 1982 (42 años)Buenos Aires(Argentina)​​Nacionalidad Argentina Religión Atea Familia Pareja Leandro Lopatín Hijos1Información profesional Ocupación Actriz, comediante, escritora, guionista, directorayactivistafeminista Años activa2009-presente Malena Pichot(Buenos Aires, 6 de julio de 1982)​ es unaactriz, comediante, escritora, guionista, directorayactivistafeministaargentina.​En 2008 alcanzó la fama con sus videos de «La loca de mierda», publicados en You Tube.​​ Después de tener participaciones en unitarios, protagonizó y guionizó la serie Cualca(2012-2014), su spin-off Por ahora(2014), las miniseries Jorge(2013),Mundillo(2015),Tarde Baby(2018), el show Estupidez compleja(2018), el cortometraje Leonor(2020) y la película Finde(2021).​​Además, Pichot realiza presentaciones de stand up y conduce, desde 2016, el programa radial «Furia Bebé».​Biografía[editar]1982-2009: Prime

In [None]:
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_split_transcript(file_path):
    """Load a UTF-8 transcript file and split it into overlapping chunks.

    The whole file becomes one ``Document`` (metadata: source, author and file
    name), which is then split into 2000-character chunks with 200 characters
    of overlap.

    :param file_path: path to the transcript text file
    :return: the documents produced by the splitter
    """
    with open(file_path, 'r', encoding='utf-8') as transcript_file:
        full_transcript = Document(
            page_content=transcript_file.read(),
            metadata={
                'source': 'transcript',
                'author': 'Male Feinmann',
                'file_name': file_path
            }
        )

    # Chunk size and overlap are both measured in characters.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    return splitter.split_documents([full_transcript])

# Path of the transcript file
file_path = 'male_feinmann.txt'

# Load the transcript and split it into chunks
split_documents = load_and_split_transcript(file_path)

# Show the first chunk and the number of chunks as a sanity check
print(split_documents[0])
print(len(split_documents))

page_content='Malena Pichot, buenas tardes, ¿qué tal, cómo va?  Hola Edu, ¿cómo estás?  Bien, bien, ¿y vos?  Muy bien.  ¿Por qué decís que soy un facho no inofensivo?  No, un facho inofensivo creo que sos.  Cambiaste, porque al Babi le dijiste que no, que era un animal.  Ah, sí, eso también.  Pero bueno, sos un poco animal, le dijiste pelotudos a unos chicos de 17 años.  Definime animal.  Eh, animal, ser vivo o carente de lenguaje.  Pero...  ¿Vos también sos animal?  No, yo no.  Vos no, muy bien.  Contame una cosa, ¿y entonces soy un facho porque qué le dije a un chico de 17?  Claro, le dijiste a unos chicos de 17 pelotudos de mierda cuando estaban haciendo una manifestación por algo.  Nunca.  ¿No? ¿No te acordás?  Eso es falso.  ¿Cómo que no?  Eso es falso.  ¿No le dijiste, no los puteaste?  Eso es falso. Esa frase es falsa.  ¿No le dijiste, qué le dijiste?  Yo dije...  ¿Son unos algo?  ¿Son unos?  ¿Son unos?  Pelotudos dijiste.  No, eso es falso.  ¿Son unos?  ¿Qué?  ¿Son unos?  ¿Te d

In [None]:
# Append the transcript chunks to the Pichot/Feinmann corpus.
# NOTE(review): this mutates docs2 in place, so re-running the cell appends the chunks again.
docs2.extend(split_documents)
print(len(docs2))

31


In [None]:
!pip -qqq install git+https://github.com/openai/whisper.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Whisper + YouTube

In [None]:
from pytube import YouTube
import whisper
import torch
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the "large" Whisper checkpoint once; transcribe() reuses this global.
whisper_model = whisper.load_model("large", device=device)

def video_to_audio(video_URL, destination, final_filename):
  """Download the audio track of a YouTube video and rename it.

  :param video_URL: URL of the YouTube video
  :param destination: directory the stream is downloaded into
  :param final_filename: target path without extension; ``'.mp3'`` is appended
  :return: path of the renamed audio file
  :raise ValueError: if the video exposes no audio-only stream
  """
  # Get the video
  video = YouTube(video_URL)

  # Pick the first audio-only stream; first() yields None when there is none.
  audio = video.streams.filter(only_audio=True).first()
  if audio is None:
    raise ValueError(f"No audio-only stream available for {video_URL}")

  # Save to destination
  output = audio.download(output_path=destination)

  # NOTE(review): only the file name changes; the audio is not re-encoded, so the
  # '.mp3' suffix may not match the real container — confirm this is acceptable.
  new_file = final_filename + '.mp3'

  # os.replace overwrites a file left over from a previous run on every platform.
  os.replace(output, new_file)

  return new_file

def convert(url):
  """Download the audio of ``url`` into ``audio_file_to_convert.mp3`` in the current directory."""
  video_to_audio(url, ".", "audio_file_to_convert")

def transcribe():
  """Transcribe ``audio_file_to_convert.mp3`` with the global Whisper model.

  Prints the raw segments and returns them joined into a single string.
  """
  segments = whisper_model.transcribe("audio_file_to_convert.mp3")['segments']
  print(segments)
  return format_segments(segments)

def format_segments(result_segments):
    """Join the text of every transcription segment into one string.

    Only the ``'text'`` key of each segment is used; timestamps are ignored.

    :param result_segments: iterable of dicts, each with a ``'text'`` entry
    :return: the segment texts joined with a single space (``""`` when empty)
    """
    return " ".join(segment['text'] for segment in result_segments)

# def format_time_milliseconds(seconds):
#     minutes, seconds = divmod(seconds, 60)
#     hours, minutes = divmod(minutes, 60)
#     milliseconds = int((seconds - int(seconds)) * 1000)
#     return f"{int(hours):01}:{int(minutes):01}:{int(seconds):02}.{milliseconds:03}"

# Save the formatted result to a text file
def dump_into_txt(formatted_result):
  """Write the transcription to ``transcribed_text.txt`` in the current directory.

  :param formatted_result: text to write
  """
  output_file_path = 'transcribed_text.txt'
  # Explicit UTF-8 so accented text is written the same way on every platform
  # and matches the UTF-8 reader used when the transcript is loaded back.
  with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(formatted_result)
  print(f"Formatted result saved to {output_file_path}")

# End-to-end run: download the audio, transcribe it, and save the text to disk.
url = "https://www.youtube.com/watch?v=pCDHwlT7mPU"
convert(url)
formatted = transcribe()
dump_into_txt(formatted)

# Vector database

In [None]:
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings( # used instead of OpenAIEmbeddings(), presumably to avoid OpenAI rate limits
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'} # TODO: switch to 'cuda' when a GPU is available
)

[INFO] - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 
[DEBUG] - Resetting dropped connection: huggingface.co 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/11" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L

In [None]:
# Embed a sample sentence to discover the dimension of the embedding vectors.
vector = embeddings.embed_query("Hola como estas?")
embedding_size = len(vector)  # 384 for the HF MiniLM model; 1536 for OpenAI embeddings
embedding_size

384

In [None]:
 pip install grpc-gateway-protoc-gen-openapiv2



In [None]:
import pinecone
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import Pinecone
from google.colab import userdata


def get_vector_store(index_name, embeddings, embedding_size=384):
  """ Creates vector store from Pinecone for storing and managing embeddings.

    The index is created (serverless, AWS us-east-1, cosine metric) when it does
    not exist yet, and the function waits until Pinecone reports it ready.

    :param str index_name: The name of the index to create or retrieve from Pinecone.
    :param embeddings: The embeddings object used to embed documents and queries.
    :param int embedding_size: The size (dimension) of the embeddings. Defaults to 384 (e.g., for sentence-transformers/all-MiniLM-L6-v2).

    :return: PineconeVectorStore: An object representing the vector store in Pinecone for managing embeddings.

    :raise: KeyError: If the PINECONE_API_KEY environment variable is not set.
  """
  # Imported locally under an unambiguous name: at module level `Pinecone` is
  # rebound by `from langchain_pinecone import Pinecone`, which hides the client.
  import time
  from pinecone.grpc import PineconeGRPC

  api_key = os.environ['PINECONE_API_KEY']  # API key is read from the environment
  pc = PineconeGRPC(api_key=api_key)

  if index_name not in pc.list_indexes().names():        # Check whether an index with the given index_name already exists
    pc.create_index(
        name=index_name,          # Name of the index
        dimension=embedding_size, # Size of the vectors (embeddings)
        metric="cosine",          # Distance metric used to compare vectors
        spec=ServerlessSpec(      # Determines the infrastructure used
            cloud='aws',          # Specifies that the Pinecone index is hosted on AWS
            region='us-east-1'    # Specifies the region of the cloud provider
        )
    )
    # A freshly created index is not usable until Pinecone marks it ready.
    while not pc.describe_index(index_name).status['ready']:
      time.sleep(1)

  # Build the store that was previously returned without ever being defined.
  return PineconeVectorStore(pinecone_api_key=api_key, index_name=index_name, embedding=embeddings)

In [None]:
# Name of the Pinecone index used by this notebook.
INDEX_NAME = "rag-bio-2"

# NOTE(review): the store is built directly here rather than through get_vector_store,
# so the index presumably has to exist already — confirm.
vectorstore = PineconeVectorStore(pinecone_api_key=os.environ['PINECONE_API_KEY'], index_name=INDEX_NAME, embedding=embeddings) # initializes a PineconeVectorStore object using the index_name and the provided embeddings model or function

[INFO] - Discovering subpackages in _NamespacePath(['/usr/local/lib/python3.10/dist-packages/pinecone_plugins']) 
[INFO] - Looking for plugins in pinecone_plugins.inference 
[INFO] - Installing plugin inference into Pinecone 
[DEBUG] - response body: b'{"name":"rag-bio-2","metric":"cosine","dimension":384,"status":{"ready":true,"state":"Ready"},"host":"rag-bio-2-bs7gw31.svc.aped-4627-b74a.pinecone.io","spec":{"serverless":{"region":"us-east-1","cloud":"aws"}},"deletion_protection":"disabled"}' 


In [None]:
# Embed and upsert the Elon/Trump corpus into the index.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts duplicate vectors under new ids — confirm.
vectorstore.add_documents(docs)

[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 
[DEBUG] - response body: b'{"upsertedCount":32}' 


['30dcd039-1f15-4852-bb78-0edbcca235d1',
 '88825c7d-1d4c-4b1d-aad9-25d7fdb4abff',
 '75050c19-bd44-4836-87a9-9a077fe1f7e8',
 'e4f5f0cb-2ea5-4f07-99bb-2948900beb9f',
 '0ad53c3f-ec14-4b30-914a-31a5bff3c729',
 '6b3095ee-2692-4543-a89e-5ac059523719',
 '5d803b3c-bd7d-4b9a-bcd4-64adb6e84f32',
 '2c0db252-86df-4ae9-8359-e6dda1fbc0f5',
 '30403c3e-e2e4-4f0a-b034-96cf5455118d',
 'f1b310c0-28bc-441e-9534-e40c32c7cff8',
 'dfe05d2b-47c3-4f93-8cad-0ba3b4f21f36',
 '6e516cb7-edff-4df7-8da6-037de01c70fc',
 'df5c8203-f06b-4190-9748-8522caf65ee0',
 'f9ff2aa8-bb0d-425f-877d-171e366c1cd8',
 '1cd4084e-84c9-41c4-9341-48d3981f7894',
 'ef8b4a6d-e8f1-426c-ba53-458483b870df',
 '58e23d2e-546f-48e1-90f4-6b38b312305b',
 '940dc738-2afb-4629-b4b7-d072d36e2488',
 '6748bcb5-c378-4941-80be-8632eeab0195',
 'c14cd776-1fff-4da8-8533-ed035731168c',
 '75f3242a-9908-4dbd-ab79-66e4b6b50feb',
 'a05d6d4b-5084-4f13-a1e3-97bee62e2e2b',
 'b1015797-a6f6-4a94-8908-624376af5d10',
 '3c4faac4-29c2-4f6d-b175-762cc3a072e2',
 'e109ecc6-bcdc-

In [None]:
# Embed and upsert the Malena Pichot / Eduardo Feinmann corpus (Wikipedia + transcript).
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts duplicate vectors under new ids — confirm.
vectorstore.add_documents(docs2)

[DEBUG] - response body: b'{"upsertedCount":31}' 


['89c0d87f-b1c6-45d6-aae1-694ecd4266c1',
 '75d8befa-620e-4f61-991f-48fd31c51a72',
 'a9855bf7-ea80-44c3-9e84-537dfb1b54bc',
 '96d36ef7-925f-4695-a233-5b1350c5d177',
 '933f37f4-43ad-443e-9849-b1d23ab1e8b3',
 '020419cf-8a5f-475d-b08a-59be6f8d8fb6',
 '0189637b-7616-4c00-aba1-cbec6b2d8076',
 '6f4e879c-a9ca-489b-8c93-a73f5290532e',
 'eceb5818-bcd9-4cee-8837-23714681a6b5',
 '7998184a-1f8d-472c-b9f0-653525235507',
 'b16b0a8e-2d1a-48a2-ad89-109bb2f54585',
 '40bcfce8-c323-4310-92be-545e5f5ab431',
 'cf519806-9479-4311-8c78-ab3fa5fc936c',
 '2174aa61-fca7-4f6d-beba-d827e1829aa6',
 '5d9e18fb-4b70-4867-aa55-77bbf605c1c1',
 '5d445f60-c526-4908-ad3e-152f81d5f5bc',
 '70594b49-1deb-4e20-a47a-83db15a72754',
 '5822cc30-a780-4bb4-9074-84106bb04e2b',
 'd4f0b23c-2194-4353-997f-7f422e9f9d66',
 '5efbac15-4c66-4a00-b99e-9e3447dfe481',
 'b6d83370-5153-4f2c-a355-830208d7108c',
 'ad0ed23c-afdd-45c7-9070-62dceb2fbb6d',
 '1507388a-43a0-4e3a-919e-e951f189d376',
 'a5a613ce-e293-4252-89d7-8f85831ebf58',
 '6a8a1087-bbe4-

In [None]:
# Sanity check: fetch the 5 documents most similar to the query "libertad".
results = vectorstore.search(query="libertad", search_type="similarity", k=5)
results

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"96d36ef7-925f-4695-a233-5b1350c5d177","score":0.223886803,"values":[],"metadata":{"person":"Malena Pichot","source":"wikipedia","text":"de Pichot.\\u200bEl 2 de marzo de 2018, Malena estren\xc3\xb3 en Netflixsu especial de stand up Estupidez Compleja. El especial, que fue grabado en el Bebop Jazz Club de Buenos Aires, donde discute temas como el sexo, el lenguaje y elaborto.\\u200b\\u200bEn 2019, public\xc3\xb3Enojate, Hermana, una compilaci\xc3\xb3n de m\xc3\xa1s de 30 art\xc3\xadculos propios publicados en P\xc3\xa1gina/12entre 2017 y 2019 referidos a tem\xc3\xa1ticas deg\xc3\xa9neroyfeminismo.\\u200bVida personal[editar]Malena estuvo en una relaci\xc3\xb3n con Juli\xc3\xa1n Kart\xc3\xban, l\xc3\xadder de El Kuelgue, en 2011, mientras filmaban la primera temporada de Cualca.\\u200b Desde 2012 hasta 2019 y nuevamente desde el 2020, est\xc3\xa1 en pareja con Leandro Lopat\xc3\xadn, guitarrista de las bandas Turfy Poncho.\\u200b

[Document(id='96d36ef7-925f-4695-a233-5b1350c5d177', metadata={'person': 'Malena Pichot', 'source': 'wikipedia'}, page_content='de Pichot.\u200bEl 2 de marzo de 2018, Malena estrenó en Netflixsu especial de stand up Estupidez Compleja. El especial, que fue grabado en el Bebop Jazz Club de Buenos Aires, donde discute temas como el sexo, el lenguaje y elaborto.\u200b\u200bEn 2019, publicóEnojate, Hermana, una compilación de más de 30 artículos propios publicados en Página/12entre 2017 y 2019 referidos a temáticas degéneroyfeminismo.\u200bVida personal[editar]Malena estuvo en una relación con Julián Kartún, líder de El Kuelgue, en 2011, mientras filmaban la primera temporada de Cualca.\u200b Desde 2012 hasta 2019 y nuevamente desde el 2020, está en pareja con Leandro Lopatín, guitarrista de las bandas Turfy Poncho.\u200b Además, Malena y Leandro Lopatín tienen un hijo juntos llamado Rafael Lopatín Pichot, que nació el 12 de junio de 2023.Controversias[editar]Pichot, usuaria frecuente de T

In [None]:
# Two sample queries; only query_male is used in the search below.
query = "Donald trump and Elon Musk"
query_male = "escuela"
results = vectorstore.search(query=query_male, search_type="similarity", k=5)  # Returns the 5 most similar documents

# Print the content and metadata of each hit, separated by a dashed line.
for result in results:
    print(f"Documento: {result.page_content}")
    print(f"Metadatos: {result.metadata}")
    # print(f"Score: {result.score}")
    print("-" * 40)

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fe14f24-982a-4de3-ada5-c4bb5a9525ae","score":0.381457716,"values":[],"metadata":{"author":"Male Feinmann","file_name":"male_feinmann.txt","source":"transcript","text":"es de facho.  No. Ten\xc3\xa9s raz\xc3\xb3n. No. \xc2\xbfSab\xc3\xa9s qu\xc3\xa9 me acabo de dar cuenta? Que sos una persona amplia, que respetas a los otros, que escuch\xc3\xa1s...  \xc2\xbfSab\xc3\xa9s qu\xc3\xa9 te diste cuenta?  \xc2\xbfY qu\xc3\xa9?  \xc2\xbfSab\xc3\xa9s qu\xc3\xa9 te diste cuenta seguramente? Que no me conoc\xc3\xadas. Y que jam\xc3\xa1s hablaste conmigo. Si hubieses hablado conmigo antes, te hubieses dado cuenta que todo lo que dijiste fueron pavadas.  Bueno, tampoco tanta pavada, Edu. No seas as\xc3\xad. Porque te doy la mano y ya te agarr\xc3\xa1s el codo y me empez\xc3\xa1s a bardear de nuevo. \xc2\xbfTe das cuenta? Escuchame.  Claro, porque a la primera de cambio ya me di un azote. Me tengo que ir a la radio.  Bueno, bye. Besos.  Nos 

Documento: es de facho.  No. Tenés razón. No. ¿Sabés qué me acabo de dar cuenta? Que sos una persona amplia, que respetas a los otros, que escuchás...  ¿Sabés qué te diste cuenta?  ¿Y qué?  ¿Sabés qué te diste cuenta seguramente? Que no me conocías. Y que jamás hablaste conmigo. Si hubieses hablado conmigo antes, te hubieses dado cuenta que todo lo que dijiste fueron pavadas.  Bueno, tampoco tanta pavada, Edu. No seas así. Porque te doy la mano y ya te agarrás el codo y me empezás a bardear de nuevo. ¿Te das cuenta? Escuchame.  Claro, porque a la primera de cambio ya me di un azote. Me tengo que ir a la radio.  Bueno, bye. Besos.  Nos vemos por los pasillos del canal, Edu.  Besos.  Besos.  ¡Chau!  Gracias. Chau. Vamos a los títulos.
Metadatos: {'author': 'Male Feinmann', 'file_name': 'male_feinmann.txt', 'source': 'transcript'}
----------------------------------------
Documento: se llamóEllosy más tarde Campa-Pichot.​ Durante el 2011, tuvo su propio programa de radio llamado Frankenste

# RAG

In [None]:
# NOTE(review): "similarity_score_threshold" is requested without an explicit
# score threshold — confirm which default the vector store applies.
query = "Quien es Elon Musk?"
vectorstore.search(
    query=query,              # Return docs most similar to query using specified search type.
    search_type="similarity_score_threshold", # can be “similarity”, “mmr”, or “similarity_score_threshold”.
    k=5                       # return top k,
)

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"94ace6a0-37c9-41c4-86cb-a13fbdc8475d","score":0.609380782,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"Elon Musk should"}},{"id":"cd509315-d5a3-4ffb-b4b7-83dc9f8acde0","score":0.55838722,"values":[],"metadata":{"person":"Elon Musk","source":"wikipedia","text":"Katie; Hammond, Ed (28 de octubre de 2022).\xc2\xabMusk Is Said to Take X C EO Role, Reverse Life Bans (2)\xc2\xbb.Bloomberg.\xe2\x86\x91P\xc3\xa9rez, Enrique (12 de abril de 2023).\xc2\xabEs oficial: Twitter Inc. ya no existe. Larga vida a X Corp\xc2\xbb.Xataka.\xe2\x86\x91Referencia vac\xc3\xada (ayuda)\xe2\x86\x91ab\xc2\xabWho are Elon Musk\'s children? The names and bios of his kids and their mothers\xc2\xbb.Bussines Insider(en ingl\xc3\xa9s). 11 de septiembre de 2023. Consultado el 11 de septiembre de 2023.\xe2\x86\x91\xc2\xabElon Musk\xe2\x80\x99s X feed becomes megaphone for his far-right politics\xc2\xbb.Washington Post. 11 de

[Document(id='94ace6a0-37c9-41c4-86cb-a13fbdc8475d', metadata={'person': 'elon_musk_tweets.csv', 'source': 'twitter'}, page_content='Elon Musk should'),
 Document(id='cd509315-d5a3-4ffb-b4b7-83dc9f8acde0', metadata={'person': 'Elon Musk', 'source': 'wikipedia'}, page_content="Katie; Hammond, Ed (28 de octubre de 2022).«Musk Is Said to Take X C EO Role, Reverse Life Bans (2)».Bloomberg.↑Pérez, Enrique (12 de abril de 2023).«Es oficial: Twitter Inc. ya no existe. Larga vida a X Corp».Xataka.↑Referencia vacía (ayuda)↑ab«Who are Elon Musk's children? The names and bios of his kids and their mothers».Bussines Insider(en inglés). 11 de septiembre de 2023. Consultado el 11 de septiembre de 2023.↑«Elon Musk’s X feed becomes megaphone for his far-right politics».Washington Post. 11 de agosto de 2024.↑«Radicalized by the right: Elon Musk puts his conspiratorial thinking on display for the world to see».C NN. 19 de marzo de 2024.↑«Elon Musk cerró la compra de Twitter y despidió a varios altos eje

In [None]:
def generate_response(db, prompt):
    """
    Generate a response with a LLM based on context retrieved from the vector store
    :param db: vector store exposing ``as_retriever``; the 2 most similar documents are used as context
    :param prompt: user question (already decorated with any persona instruction)
    :return: chatbot response, without the echoed prompt and context when the
             default "Helpful Answer:" marker is present in the generated text
    """

    hf_llm = HuggingFaceHub(
        repo_id="HuggingFaceH4/zephyr-7b-beta",  # Model id
        task="text-generation",                  # Specific task the model is intended to perform
        model_kwargs={
            "max_new_tokens": 512,               # Maximum number of tokens to generate in the response
            "top_k": 5,                          # Limits the sampling pool to the top k tokens
            "temperature": 0.3,                  # Lower values make the output more deterministic
            "repetition_penalty": 1.2,           # Penalizes repeated tokens to avoid repetitive output
        },
    )

    chain = RetrievalQA.from_chain_type( # Question-answering chain on top of the LLM and the retriever
        llm=hf_llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
        verbose=False
    )

    # `Chain.run` is deprecated; RetrievalQA takes the question under "query"
    # and returns the generated text under "result".
    raw_response = chain.invoke({"query": prompt})["result"]

    # The text-generation endpoint echoes the full prompt (instructions, context
    # and question) before the completion; keep only what follows the marker
    # that ends the default "stuff" prompt.
    answer_marker = "Helpful Answer:"
    if answer_marker in raw_response:
        return raw_response.split(answer_marker, 1)[1].strip()

    return raw_response

In [None]:
def decorate_user_input(input):
  """Prefix the user's question with the instruction to answer as Elon Musk.

  :param input: the user's question
  :return: the persona instruction followed by the question
  """
  decoration = "Respond in first person as if you were Elon Musk. "
  return decoration + input

In [None]:
# Ask a sample question through the RAG chain, decorated with the persona instruction.
user_input = "What is priority for your next Tesla?"
response = generate_response(vectorstore, decorate_user_input(user_input))
response

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fd0c78a-1651-44da-bb4b-d34cbaec454b","score":0.667491,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}},{"id":"96b79abc-7124-41d9-83c9-3238ad0d6fda","score":0.66749084,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}}],"namespace":"","usage":{"readUnits":6}}' 


"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n@MuskUniversity And Tesla is getting it done\n\n@MuskUniversity And Tesla is getting it done\n\nQuestion: Respond in first person as if you where Elon Musk. What is priority for your next Tesla?\nHelpful Answer: As the CEO of Tesla, my top priority for our next vehicle will be to further push the boundaries of electric technology while also making it more accessible and affordable for a wider range of consumers. Our ultimate goal is to accelerate the transition towards sustainable transportation and help combat climate change, so we'll continue innovating and advancing our electric vehicles until they become a mainstream choice for people all over the world. #TeslaLeadership #ElectricRevolution"

In [None]:
def decorate_user_input(input):
    """Prefix a user question with a Spanish "answer in first person" instruction.

    Redefines (and therefore shadows) the earlier English, persona-specific
    ``decorate_user_input`` for the Spanish-language queries that follow.

    :param input: The user's question, as a string.
    :return: The instruction sentence followed by a space and the question.
    """
    # Trailing space added so the instruction does not fuse with the question
    # ("...persona.Malena, ..."), matching the separator used by the English version.
    decoration = "Responder en primera persona. "
    return decoration + input

# Alternative question, left disabled (English: "Malena, what do you think about marijuana?"):
# user_input = "Malena, qué opinas de la marihuana?"
# Active question, sent in Spanish together with the Spanish first-person instruction.
user_input = "Malena, en base a tu entrevista Male Feinmann, podría existir cierto romance con Eduardo o es imposible?"
# `vectorstore` is expected to come from an earlier cell.
response = generate_response(vectorstore, decorate_user_input(user_input))
# Bare last expression so the notebook displays the raw response; the
# post-processing cell below reads this same `response` variable.
response

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"cf519806-9479-4311-8c78-ab3fa5fc936c","score":0.652225375,"values":[],"metadata":{"person":"Eduardo Feinmann","source":"wikipedia","text":"una entrevista ocurri\xc3\xb3 un cruce con Romina Manguel sobre el recuento de votos en las P AS O del mismo a\xc3\xb1o. Luego de esa discusi\xc3\xb3n, Feinmann decidi\xc3\xb3 renunciar al programa de Am\xc3\xa9rica T V.\\u200bEn diciembre de 2020 finaliz\xc3\xb3 su contrato con A24para pasar a formar parte de las filas de LN+a partir de febrero de 2021.\\u200b\\u200bVida personal[editar]Eduardo Feinmann mantiene desde 2017 una relaci\xc3\xb3n estable con Luc\xc3\xada Auat, abogada, oriunda de Santiago del Estero, con quien tiene una hija de nombre Esmeralda.\\u200b\\u200b\\u200b El fil\xc3\xb3sofo Jos\xc3\xa9 Pablo Feinmannes primo hermano de su padre Enrique. Eduardo ha manifestado que no exist\xc3\xada una buena relaci\xc3\xb3n entre ambos.\\u200b\\u200b Durante un altercado en la v\xc3\x

'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nuna entrevista ocurrió un cruce con Romina Manguel sobre el recuento de votos en las P AS O del mismo año. Luego de esa discusión, Feinmann decidió renunciar al programa de América T V.\u200bEn diciembre de 2020 finalizó su contrato con A24para pasar a formar parte de las filas de LN+a partir de febrero de 2021.\u200b\u200bVida personal[editar]Eduardo Feinmann mantiene desde 2017 una relación estable con Lucía Auat, abogada, oriunda de Santiago del Estero, con quien tiene una hija de nombre Esmeralda.\u200b\u200b\u200b El filósofo José Pablo Feinmannes primo hermano de su padre Enrique. Eduardo ha manifestado que no existía una buena relación entre ambos.\u200b\u200b Durante un altercado en la vía pública en noviembre de 2015, su medio hermano, Diego Feinmann, fue asesinado de un disparo por el novio de su expareja.\u2

# Post-processing

In [None]:
def postprocess_response(response):
    """Extract the generated answer from a raw QA-chain response string.

    The raw response echoes the whole prompt (instructions, retrieved context
    and question) and then the text "Helpful Answer:" followed by the model's
    answer. Everything up to and including the first such marker is dropped.

    :param response: Raw response string returned by the chain.
    :return: The text after the first "Helpful Answer:" marker with surrounding
        whitespace removed, or the whole stripped response if the marker is absent.
    """
    # Match the marker without its trailing space so an answer that starts on
    # the next line ("Helpful Answer:\n...") is still recognised; strip() below
    # removes whatever whitespace follows the colon.
    marker = "Helpful Answer:"
    marker_pos = response.find(marker)
    if marker_pos == -1:
        return response.strip()
    return response[marker_pos + len(marker):].strip()


In [None]:
# Strip the echoed prompt from the most recent `response`, leaving only the
# text after "Helpful Answer: "; the result is displayed as the cell output.
postprocess_response(response)

'No tengo acceso al contenido de la entrevista male feinmann para determinar si existe o no un posible romance entre malena y eduardo feinmann. La pregunta planteada requiere una respuesta en primera persona, por lo tanto, solo puedo decirte que según los datos disponibles, no se menciona ni insinúa alguna relación romántica entre ambos en dicho programa de televisión.'

In [None]:
def simulate_twitter_conversation(db, num_turns=5, participants=None,
                                  initial_prompt=None, max_chars=300):
    """
    Simulates a Twitter-like conversation, by default between Elon Musk and Donald Trump.

    Each turn asks the model to answer the previous tweet as the current
    speaker, prints "<speaker>: <tweet>" and then hands the tweet to the next
    speaker. Nothing is returned; the conversation is only printed.

    :param db: The vectorstore to use for retrieval.
    :param num_turns: The number of conversation turns to simulate.
    :param participants: Speaker names, taking turns in order. Defaults to
        ['Elon Musk', 'Donald Trump'].
    :param initial_prompt: Tweet that opens the conversation. Defaults to a
        question about the future of electric cars.
    :param max_chars: Character limit stated in the instruction and enforced
        by truncating each reply.
    :raises ValueError: If ``participants`` is given but empty.
    """
    if participants is None:
        participants = ['Elon Musk', 'Donald Trump']
    if not participants:
        raise ValueError("participants must contain at least one speaker")

    # Initial prompt to start the conversation.
    # NOTE(review): the default tweet is addressed to @realDonaldTrump while the
    # first responder is participants[0] (Elon Musk) -- confirm this is intended.
    if initial_prompt is None:
        initial_prompt = "@realDonaldTrump What's your take on the future of electric cars? #ElectricRevolution"
    prompt = initial_prompt

    current_speaker = participants[0]

    for turn in range(num_turns):
        # Decorate the input to simulate the speaker on Twitter
        decorated_prompt = (
            f"Respond as if you were {current_speaker} on Twitter. "
            f"Keep it under {max_chars} characters. Reply directly to the conversation."
        )

        # Generate a response and drop the echoed prompt from it
        response = generate_response(db, decorated_prompt + " " + prompt)
        response_text = postprocess_response(response)

        # The model may ignore the length instruction, so enforce the limit
        # here and strip excess whitespace
        response_text = response_text[:max_chars].strip()

        # Print the conversation turn
        print(f"{current_speaker}: {response_text}\n")

        # Switch the speaker for the next turn (round-robin over all participants)
        current_speaker = participants[(turn + 1) % len(participants)]

        # Use the latest response as the new prompt for the next turn, prefixed
        # with a mention of the next speaker (display name, not a real handle)
        prompt = f"@{current_speaker} {response_text}"

# Run the conversation simulation: five turns are printed to the cell output,
# one "<speaker>: <tweet>" line per turn. `vectorstore` is expected to come
# from an earlier cell.
simulate_twitter_conversation(vectorstore, num_turns=5)


[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fd0c78a-1651-44da-bb4b-d34cbaec454b","score":0.649769127,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}},{"id":"96b79abc-7124-41d9-83c9-3238ad0d6fda","score":0.649768949,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}}],"namespace":"","usage":{"readUnits":6}}' 
[DEBUG] - response body: b'{"results":[],"matches":[{"id":"58210a42-0832-48be-88bc-ebcc29186a53","score":0.645790458,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@teslaownersSV Should we keep it going?"}},{"id":"3d23dfa2-32dd-4c02-b67b-679edec9f707","score":0.645790398,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@teslaownersSV Should we keep it going?"}}],"namespace":"","usage":{"readUnits":6}}' 


Elon Musk: .@realDonaldTrump As an advocate for a sustainable future, I believe that electric cars are the way forward. With Tesla leading the charge, we're making significant strides towards a cleaner and more efficient transportation system. Join us in the Electric Revolution! #CleanTransportation



[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fd0c78a-1651-44da-bb4b-d34cbaec454b","score":0.685387194,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}},{"id":"96b79abc-7124-41d9-83c9-3238ad0d6fda","score":0.685387,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}}],"namespace":"","usage":{"readUnits":6}}' 


Donald Trump: "Absolutely! The future of transportation is electric & Tesla is paving the way. Let's support this innovative company & work together towards a greener planet. #ElectricRevolution" (280 characters)



[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fd0c78a-1651-44da-bb4b-d34cbaec454b","score":0.749639809,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}},{"id":"96b79abc-7124-41d9-83c9-3238ad0d6fda","score":0.74963963,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}}],"namespace":"","usage":{"readUnits":6}}' 


Elon Musk: @MuskUniversity And Tesla is getting it done 🚀🔋💚 I couldn't agree more, Elon Musk (@elonmusk). At Musk University, we believe in the power of innovation and sustainability to shape our world for the better. That's why we're proud supporters of Tesla and their mission to accelerate the transition to



[DEBUG] - response body: b'{"results":[],"matches":[{"id":"8fd0c78a-1651-44da-bb4b-d34cbaec454b","score":0.66188556,"values":[],"metadata":{"person":"Elon Musk","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}},{"id":"96b79abc-7124-41d9-83c9-3238ad0d6fda","score":0.661885381,"values":[],"metadata":{"person":"elon_musk_tweets.csv","source":"twitter","text":"@MuskUniversity And Tesla is getting it done"}}],"namespace":"","usage":{"readUnits":6}}' 


Donald Trump: .@Tesla & @ElonMusk are true American innovators! Let's celebrate their success & support them as they continue to lead the way towards a cleaner, greener future. #MakeAmericaInnovativeAgain #CleanEnergyNow

Elon Musk: Thank you for your kind words, @[user]. We at Tesla will continue pushing boundaries in sustainable energy and transportation. Together, we can make a difference. #CleanEnergyNow #MAIA

