# Basics

In [19]:
import os

import pinecone

from playground_secret_key import SECRET_KEY
from langchain.schema import (SystemMessage, HumanMessage, AIMessage)
from langchain.chat_models import ChatOpenAI

# Publish the OpenAI key through the process environment, then build the chat
# client from that same environment entry.
os.environ['OPENAI_API_KEY'] = SECRET_KEY

chat = ChatOpenAI(
    model='gpt-3.5-turbo',
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

# messages = [
#     SystemMessage(content='You are a tutor that helps highschool students.'),
#     HumanMessage(content='Hi tutor, how are you today?'),
#     AIMessage(content='I am great, thank you, how can I help you today?.'),
#     HumanMessage(content='I would like you to explain to me second order derivatives')
# ]
# 
# # TODO : to have chat history you append both the AI response and the new prompt to the messages list
# 
# res = chat.invoke(messages)




# Chat history

In [20]:
# messages.append(res)
# prompt = HumanMessage(content='How is this used in finding the maxima and minima of a function?')
# messages.append(prompt)
# res = chat.invoke(messages)
# print(res.content)


# Loading data

In [21]:
import glob
from langchain_community.document_loaders import DirectoryLoader
from pathlib import Path
from pathlib import Path
import glob

# Folder holding the course PDFs. Set the RAG_DATA_DIR environment variable to
# run the notebook on another machine; without it the original location is used.
target_dir = os.environ.get(
    'RAG_DATA_DIR',
    '/Users/lorenzodeappolonia/Desktop/supervised_learning/to_do',
)

# Load every PDF whose file name starts with "01_" (sub-folders included, since
# recursive=True) and split it with the loader's load_and_split() defaults.
documents = DirectoryLoader(
    path=target_dir,
    glob='01_*.pdf',
    recursive=True,
    show_progress=True,
).load_and_split()


100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


In [22]:
from pinecone import Pinecone 
from playground_secret_key import PINECONE_KEY

# The client below is created without arguments, so the API key is published
# through the environment first (presumably that is where Pinecone() looks it
# up — confirm against the client documentation).
os.environ['PINECONE_API_KEY'] = PINECONE_KEY
pc = Pinecone()

# NOTE(review): `environment` is not referenced by any other visible cell; it is
# None when PINECONE_ENVIRONMENT is unset.
environment = os.environ.get('PINECONE_ENVIRONMENT')



In [23]:
# Sanity check: show the indexes visible to this API key (name, dimension, metric, status).
print(pc.list_indexes())

{'indexes': [{'dimension': 1536,
              'host': 'rag-xezwua8.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'rag',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [24]:
# from pinecone import ServerlessSpec, PodSpec 
# import time
# index_name = "llama-2-rag"
# 
# if index_name not in pinecone.list_indexes().names():
#     pinecone.create_index(
#         index_name,
#         dimension=1536,
#         metric='cosine',
#         spec=PodSpec(environment="us-west1-gcp", pod_type="p1.x1")
# 
#     )
# 
#     while not pinecone.describe_index(index_name).status['ready']:
#         time.sleep(1)
#         
# index = pinecone.index(index_name)

In [25]:
# Handle to the existing 'rag' index; used below for upserts and stats.
index = pc.Index(name='rag')
# Print the index statistics (dimension, fullness, per-namespace vector counts).
print(index.describe_index_stats())



{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [26]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model shared by ingestion (embed_query) and retrieval (PineconeVectorStore).
# The 'rag' index reports dimension 1536, so the vectors produced here must have that size.
embed_model = OpenAIEmbeddings(model='text-embedding-ada-002')

In [27]:
# texts = ['this is the first chunk of text',
#          'then here is another chunk of text']
# 
# res = embed_model.embed_documents(texts)
# print(len(res), len(res[0]))
# print(documents[0].page_content)

In [28]:
from langchain_text_splitters import CharacterTextSplitter

# Splitter configuration: break on blank lines, target chunks of 500 characters
# (measured with len) and let consecutive chunks share up to 200 characters.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
    separator="\n\n",
    is_separator_regex=False,
    length_function=len,
)

# NOTE(review): this rebinds `documents`, so running the cell a second time
# splits the already-split chunks again; re-run the loading cell first.
documents = text_splitter.split_documents(documents)

# Show the metadata carried by the first chunk (its 'source' path).
print(documents[0].metadata)

{'source': '/Users/lorenzodeappolonia/Desktop/supervised_learning/to_do/01_basics_annotated.pdf'}


In [29]:
import time

# Embed every document chunk and store it in the 'ns1' namespace of the index.
batch_size = 500  # maximum number of characters embedded per vector
vectors = []

for j, document in enumerate(documents):
    text = document.page_content
    # Walk the text in consecutive windows of `batch_size` characters.
    # The previous version sliced `[i:batch_size]` (fixed end) and looped while
    # `i <= len(text)`: every window after the first was empty, so text beyond
    # the first `batch_size` characters was never embedded and empty strings
    # were embedded and stored instead.
    for start in range(0, len(text), batch_size):
        end = start + batch_size
        batch = text[start:end]
        embeds = embed_model.embed_query(batch)
        vectors.append({
            # Ids keep the original `<document index>_<window end offset>`
            # scheme so a re-run overwrites the vectors written by earlier runs.
            'id': f'{j}_{end}',
            'values': embeds,
            'metadata': {
                'text': batch,
                'doc_type': 'PDF',
                'source': document.metadata['source'],
            },
        })

# NOTE(review): empty-text vectors written by earlier runs of the old loop are
# not removed here; clear the namespace if they matter.
if vectors:
    index.upsert(vectors=vectors, namespace='ns1')
    # Presumably gives the index time to reflect the upsert before the stats
    # cell below is run — confirm whether a shorter wait or polling is enough.
    time.sleep(60)

    

# index.upsert(
#     vectors=[
#         {
#             "id": "vec1", 
#             "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], 
#             "metadata": {"genre": "drama"}
#         }, {
#             "id": "vec2", 
#             "values": [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], 
#             "metadata": {"genre": "action"}
#         }, {
#             "id": "vec3", 
#             "values": [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], 
#             "metadata": {"genre": "drama"}
#         }, {
#             "id": "vec4", 
#             "values": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], 
#             "metadata": {"genre": "action"}
#         }
#     ],
#     namespace= "ns1"
# )


In [30]:
# Re-check the index statistics after the upsert; the vectors are expected under namespace 'ns1'.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00042,
 'namespaces': {'ns1': {'vector_count': 42}},
 'total_vector_count': 42}

In [31]:
from langchain_pinecone import PineconeVectorStore


# Wrap the existing 'rag' index as a LangChain vector store that embeds queries
# with the same model used for ingestion.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name='rag',
    embedding=embed_model,
)

# Smoke test: fetch the single closest chunk for a sample query from 'ns1'.
query = 'Data'
res = vectorstore.similarity_search(query=query, namespace='ns1', k=1)

for match in res:
    print(match.page_content)

Michela Papandrea (SUPSI)

Introduction to Supervised Learning

8 / 25

Data representation

Despite the nature of the data, it is important to have a representation of your input data that a computer can understand commonly a dataset is representation as a table

row (or entry): each data point (or sample) that we want to reason about column: each property that describes that data point (features)

Michela Papandrea (SUPSI)

Introduction to Supervised Learning

9 / 25


In [32]:
def augmented_prompt(query: str, k: int = 1, namespace: str = 'ns1') -> str:
    """Build a prompt that pairs `query` with context retrieved from the vector store.

    Args:
        query: The user's question; also used as the similarity-search text.
        k: Number of chunks to retrieve (default 1, the previously fixed value).
        namespace: Namespace searched in the vector store (default 'ns1', the
            previously fixed value).

    Returns:
        The prompt text: an instruction line, the retrieved chunks joined by
        newlines under "Contexts:", and the query under "Query:".
    """
    results = vectorstore.similarity_search(query=query, namespace=namespace, k=k)
    source_knowledge = '\n'.join(doc.page_content for doc in results)
    # The prompt is assembled from explicit pieces so that its exact whitespace
    # (trailing spaces, indented lines) stays byte-identical to the earlier
    # triple-quoted template and cannot be altered by editor whitespace trimming.
    return (
        "Using the context below, answer the query. \n"
        "    \n"
        "    Contexts: \n"
        f"    {source_knowledge} \n"
        "    \n"
        "    Query: \n"
        f"    {query}"
    )

In [33]:
# Preview the exact prompt text built for the sample query before sending anything to the chat model.
print(augmented_prompt(query))

Using the context below, answer the query. 
    
    Contexts: 
    Michela Papandrea (SUPSI)

Introduction to Supervised Learning

8 / 25

Data representation

Despite the nature of the data, it is important to have a representation of your input data that a computer can understand commonly a dataset is representation as a table

row (or entry): each data point (or sample) that we want to reason about column: each property that describes that data point (features)

Michela Papandrea (SUPSI)

Introduction to Supervised Learning

9 / 25 
    
    Query: 
    Data


In [34]:
# Ask a question grounded in the retrieved context: the augmented prompt is sent
# as a single human message (no earlier chat history).
prompt = HumanMessage(
    content=augmented_prompt('According to Michela Papandrea, what are the main steps of M.L. analysis')
)
messages = [prompt]

# Calling the model object directly (`chat(messages)`) goes through the
# deprecated __call__ path and emits a deprecation warning; `invoke` is the
# supported entry point and is what the earlier cells of this notebook use.
res = chat.invoke(messages)

print(res.content)

  warn_deprecated(


According to Michela Papandrea, the main steps of ML analysis are: 
1. Understand the problem we are trying to solve and if the data can solve the problem
2. Formalize the problem
3. Collect enough data to solve the problem
4. Identify features and algorithms which allow right predictions
5. Define metrics for the performance measurement
6. Generate the predictive model and integrate the ML solution within a business product


In [35]:
# pc.delete_index('rag')