In [48]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl (1.0 MB)
Collecting langchain-core<1.0.0,>=0.3.66
  Downloading langchain_core-0.3.66-py3-none-any.whl (438 kB)
Collecting langsmith>=0.1.17
  Downloading langsmith-0.4.4-py3-none-any.whl (367 kB)
Collecting async-timeout<5.0.0,>=4.0.0
  Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl (32 kB)
Collecting jsonpatch<2.0,>=1.33
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14
  Downloading orjson-3.10.18-cp310-cp310-win_amd64.whl (134 kB)
Collecting requests-toolbelt<2.0.0,>=1.0.0
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
Collecting zstandard<0.24.0,>=0.23.0
  Downloading zstandard-0.23.0-cp310-cp310-win_amd64.whl (495 kB)
Installing collected packages: zstandard, requests-toolbelt, orjson, langsmith, jsonpatch, langchain-core, langchai



In [49]:
import numpy as np
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain_text_splitters import RecursiveCharacterTextSplitter
import math


In [52]:
def read_text_file(file_path):
    """Return the entire contents of the file at ``file_path``.

    The file is opened in text mode and decoded as UTF-8; it is closed
    before the function returns.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def chunk_text(text, chunk_size, chunk_overlap=50):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
            Defaults to 50, the value that used to be hard-coded, so
            existing two-argument calls behave exactly as before.
            NOTE(review): the splitter presumably rejects an overlap larger
            than ``chunk_size`` - confirm against the installed version and
            pass a smaller overlap when using small chunk sizes.

    Returns:
        Whatever ``split_text`` returns for ``text`` (a list of strings in
        the runs recorded in this notebook).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def get_embedding_model(model_name="BAAI/bge-small-en-v1.5"):
    """Create a ``HuggingFaceEmbedding`` for the given model.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbedding``.
            Defaults to ``"BAAI/bge-small-en-v1.5"``, the model that used to
            be hard-coded, so calls without arguments are unchanged.

    Returns:
        The newly constructed ``HuggingFaceEmbedding`` instance.
    """
    return HuggingFaceEmbedding(model_name=model_name)

def get_embeddings(embed_model, text: str):
    """Embed ``text`` with ``embed_model``.

    Thin wrapper: returns the result of ``embed_model.get_text_embedding(text)``
    unchanged.
    """
    return embed_model.get_text_embedding(text)

def dot_product(vec1, vec2):
    """Return the dot product of two equal-length numeric sequences.

    Args:
        vec1: First iterable of numbers.
        vec2: Second iterable of numbers.

    Returns:
        The sum of the pairwise products (``0`` for two empty inputs).

    Raises:
        ValueError: If the inputs have different lengths. A plain ``zip``
            would silently drop the extra components and return a
            plausible-looking but wrong value.
    """
    from itertools import zip_longest

    # Unique sentinel: distinguishes "ran out of items" from any real value.
    missing = object()

    def pairwise_products():
        for a, b in zip_longest(vec1, vec2, fillvalue=missing):
            if a is missing or b is missing:
                raise ValueError("dot_product requires vectors of equal length")
            yield a * b

    return sum(pairwise_products())

def magnitude(vec):
    """Return the Euclidean length of ``vec``: the square root of the sum of squared components."""
    squared_components = (component ** 2 for component in vec)
    sum_of_squares = sum(squared_components)
    return math.sqrt(sum_of_squares)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Computed as ``dot_product(vec1, vec2)`` divided by the product of the two
    magnitudes. If either magnitude is zero the function returns ``0``
    instead of dividing by zero.
    """
    numerator = dot_product(vec1, vec2)
    norm1 = magnitude(vec1)
    norm2 = magnitude(vec2)

    # A zero-length vector has no direction; report 0 rather than dividing by zero.
    has_zero_vector = norm1 == 0 or norm2 == 0
    return 0 if has_zero_vector else numerator / (norm1 * norm2)

In [57]:
# Read the source text from "crosve.txt" (path is relative to the working directory)
# and split it with chunk_size=220. `chunks` is consumed by later cells.
text_file = read_text_file("crosve.txt")
chunks = chunk_text(text_file, chunk_size=220)
# Dump the resulting chunks for a visual check.
print(chunks)


["I'm a recent graduate from hunter college. I have a bachelors in computer science with a minor in math. I'm currently working with TipTop technologies as", "a software engineer. I'm currently working on a passion project as well called csphere to help people stay ontop of their bookmarks and have them", "actually revisit them rather then pilling them up. I gym, run, like to explore coffee chops around nyc. Kinda of a coffee adict so if you lvoe coffee we'll get along just fine.", "My inetrest align in development and ai as I've always been fascinated by ai when I was growing up. I'm open to any jobs within the tech sector and would love to", "have a quick chat to see if a potential role aligns with my career intrests. For now I'm just going with the flow, making myself 1 percent better day by day until I become the", 'engineer that I know I can become. Some of my favorite shows include one piece, suits, and any drama show ou can imagine.']


In [None]:

import os

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from pinecone import Pinecone, PodSpec
from llama_index.llms.google_genai import GoogleGenAI

# Credentials are read from the environment so that no secret is ever stored
# in the notebook. A missing variable fails right here with a KeyError naming
# the variable, instead of surfacing later inside a client call.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

llm = GoogleGenAI(
    model="gemini-2.0-flash",
    api_key=os.environ["GOOGLE_API_KEY"],
)


In [59]:
# Build one record per chunk for upserting: ids are 1-based ("rec1", "rec2", ...)
# and the chunk text is stored under the `chunk_text` field.
n = len(chunks)
print("size of chunks: ", n)

# A comprehension keeps the loop variables out of the notebook's global namespace.
records = [
    {'_id': f'rec{position}', 'chunk_text': chunk}
    for position, chunk in enumerate(chunks, start=1)
]

print(records)





size of chunks:  6
[{'_id': 'rec1', 'chunk_text': "I'm a recent graduate from hunter college. I have a bachelors in computer science with a minor in math. I'm currently working with TipTop technologies as"}, {'_id': 'rec2', 'chunk_text': "a software engineer. I'm currently working on a passion project as well called csphere to help people stay ontop of their bookmarks and have them"}, {'_id': 'rec3', 'chunk_text': "actually revisit them rather then pilling them up. I gym, run, like to explore coffee chops around nyc. Kinda of a coffee adict so if you lvoe coffee we'll get along just fine."}, {'_id': 'rec4', 'chunk_text': "My inetrest align in development and ai as I've always been fascinated by ai when I was growing up. I'm open to any jobs within the tech sector and would love to"}, {'_id': 'rec5', 'chunk_text': "have a quick chat to see if a potential role aligns with my career intrests. For now I'm just going with the flow, making myself 1 percent better day by day until I become th

In [60]:
# Name of the Pinecone index to work with. No visible cell creates it, so it is
# presumably expected to exist already - verify before running.
index_name = "crosve-portfolio"

# Handle to that index obtained from the `pc` client; used by the upsert, stats and search cells.
pinecone_index = pc.Index(index_name)

In [61]:
# Upsert the records under the "portfolio" name (reported as a namespace in the index stats).
# The records carry raw `chunk_text` rather than vectors, so the index presumably
# embeds them server-side - confirm against the index configuration.
pinecone_index.upsert_records("portfolio", records)

In [62]:
# Wait for the upserted vectors to be indexed
# (fixed 10-second pause; the stats printed below may lag behind if indexing takes longer).
import time
time.sleep(10)

# View stats for the index
stats = pinecone_index.describe_index_stats()
print(stats)

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'example-namespace': {'vector_count': 4},
                'portfolio': {'vector_count': 6}},
 'total_vector_count': 10,
 'vector_type': 'dense'}


In [64]:
# Free-text question; it is sent unchanged as the search input below.
query = "What does crosve like to do when he has free time?"

# Search the "portfolio" namespace for up to 10 candidates, then rerank them
# on the `chunk_text` field with the bge-reranker-v2-m3 model.
reranked_results = pinecone_index.search(
    namespace="portfolio",
    query={"top_k": 10, "inputs": {"text": query}},
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 10,
        "rank_fields": ["chunk_text"],
    },
)

# Only the first four hits are used as context for the prompt.
top_hits = reranked_results['result']['hits'][:4]
content = [hit['fields']['chunk_text'] for hit in top_hits]

# Echo each selected hit (id, score rounded to 4 decimals, text) for inspection.
for hit in top_hits:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 4)}, text: {hit['fields']['chunk_text']} \n")


id: rec3, score: 0.0009, text: actually revisit them rather then pilling them up. I gym, run, like to explore coffee chops around nyc. Kinda of a coffee adict so if you lvoe coffee we'll get along just fine. 

id: rec6, score: 0.0001, text: engineer that I know I can become. Some of my favorite shows include one piece, suits, and any drama show ou can imagine. 

id: rec2, score: 0.0, text: a software engineer. I'm currently working on a passion project as well called csphere to help people stay ontop of their bookmarks and have them 

id: rec5, score: 0.0, text: have a quick chat to see if a potential role aligns with my career intrests. For now I'm just going with the flow, making myself 1 percent better day by day until I become the 



In [69]:
# Assemble the LLM prompt from the question (`query`) and the retrieved chunk texts (`content`).
# NOTE: the backslash line continuations sit inside the f-string, so the leading
# indentation of every continued line becomes part of the prompt text.
# `{[c for c in content]}` interpolates the list's repr (brackets and quotes included).
prompt = f"You are a smart agent. A question would be asked to you and relevant information would be provided.\
    Your task is to answer the question and use the information provided. Question - {query}. Relevant Information about crosve lucero - {[c for c in content]}. You may also use \
    any addional information you may find on the web to help you answer the question as well. Also answer the questions(s) like it were any regular conversation and sounds enthustiastic as well \
        Don't start off by saying 'based on the info provided' or anything like that."

# Show the final prompt exactly as it will be sent to the model.
print(prompt)

You are a smart agent. A question would be asked to you and relevant information would be provided.    Your task is to answer the question and use the information provided. Question - What does crosve like to do when he has free time?. Relevant Information about crosve lucero - ["actually revisit them rather then pilling them up. I gym, run, like to explore coffee chops around nyc. Kinda of a coffee adict so if you lvoe coffee we'll get along just fine.", 'engineer that I know I can become. Some of my favorite shows include one piece, suits, and any drama show ou can imagine.', "a software engineer. I'm currently working on a passion project as well called csphere to help people stay ontop of their bookmarks and have them", "have a quick chat to see if a potential role aligns with my career intrests. For now I'm just going with the flow, making myself 1 percent better day by day until I become the"]. You may also use     any addional information you may find on the web to help you answ

In [71]:
import os

from llama_index.llms.google_genai import GoogleGenAI

# SECURITY: the API key must never be written into the notebook. It is read
# from the GOOGLE_API_KEY environment variable; a missing variable raises
# KeyError here.
# NOTE(review): a literal key was previously hardcoded in this cell - treat it
# as compromised and revoke/rotate it.
llm = GoogleGenAI(
    model="gemini-2.0-flash",
    api_key=os.environ["GOOGLE_API_KEY"],
)

# Send the assembled prompt to the model and show only the generated text.
response = llm.complete(prompt)
print(response.text)

Oh, Crosve sounds like a cool person! When he's got some free time, it looks like he enjoys hitting the gym, going for runs, and exploring coffee shops around NYC – he's a self-proclaimed coffee addict! He's also working on a passion project called csphere and enjoys watching shows like One Piece and Suits.

