In [7]:
! pip install "pinecone[grpc]" "langchain-pinecone"  "langchain-openai" "langchain-text-splitters" "langchain"

Defaulting to user installation because normal site-packages is not writeable


In [8]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo")

### setup

In [None]:
repo_name = "AlexBlazee/AmazonProductSearch"
branch_name = "main"

In [None]:
repo_name = "ModelEarth/feed"
branch_name = "main"

In [13]:
from langchain_community.document_loaders import GithubFileLoader
from pydantic import BaseModel
def get_readme (repo_name , branch_name , github_token):
    loader = GithubFileLoader(
        repo= repo_name, # Mention the repo we need 
        branch = branch_name ,  # the branch name
        access_token=github_token,
        github_api_url="https://api.github.com",
        file_filter=lambda file_path: file_path.endswith("README.md"), 
    )
    documents = loader.load()
    return documents

import re

def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)     # remove HTML
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)     # Remove URLs
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text) # Remove image references
    text = re.sub(r'[^\w\s.?!#]', '', text) # Remove emoticons while preserving basic punctuation
    text = re.sub(r'\s+', ' ', text).strip()         # Remove extra whitespace
    return text

### pinecone vector database setup

In [43]:
from pinecone.grpc import PineconeGRPC as Pinecone
from tqdm.autonotebook import tqdm
from pinecone import ServerlessSpec
api_key = os.environ.get('PINECONE_ACCESS_TOKEN') 
pc = Pinecone(api_key=api_key)

index_name = "github-rag-db"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536, 
        metric="dotproduct", 
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    ) 


In [44]:
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

### collection and embedding into vector store

In [61]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Use WebBaseLoader instead of TextLoader to load content from a URL
loader = WebBaseLoader("https://raw.githubusercontent.com/ModelEarth/feed/main/README.md")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
for doc in docs:
    url = doc.metadata['source']
    parts = url.split('/')
    doc.metadata = {'source': url , 'owner': parts[3] ,'repo': parts[4] , 'branch': parts[5], 'filename': '/'.join(parts[6:])}
# preprocessed_text = preprocess_text(documents[0].page_content)

In [62]:
docs

[Document(metadata={'source': 'https://raw.githubusercontent.com/ModelEarth/feed/main/README.md', 'owner': 'ModelEarth', 'repo': 'feed', 'branch': 'main', 'filename': 'README.md'}, page_content='Feed Player + Swiper - For Images, Video\xa0and\xa0Text\n\n\nWelcome to our Feed-Player React Project! This project provides a modern and user-friendly interface for viewing a series of images and video pulled from RSS, JSON, CSV and YAML. The UI is built using Vite, ReactJS, HTML, CSS, and JavaScript. The Feed-Player is designed to be fully responsive and packed with a range of features to enhance your viewing experience with filmstrip navigation using [swiper](https://github.com/modelearth/swiper).\n\n\n\n[Check out the dist preview](dist/) of the Feed-Player project on model.earth.\n\n## Feed Samples\n\n[View Feeds](view) - The Feed Player is being designed to convert APIs, JSON and .CSV into video-like presentations.\n[Bluesky RSS Feeds](view/#feed=bsky) - Click "Turn on CORS passthrough". 

In [63]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model = "text-embedding-3-small" , api_key= os.environ.get('OPENAI_API_KEY'))
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings , pinecone_api_key= os.environ.get('PINECONE_ACCESS_TOKEN'))
vectorstore.add_documents(docs)

['0a24f29e-ab4d-4150-8315-0a6c8c6afbf3',
 '44b32ab9-3ffb-4b28-8f3a-aedd36e8dd2d',
 '99090e03-30ad-4e08-aa69-dd5a742a2b4d',
 '138f66c2-dfb6-4b7e-abf0-fae51b244ca7',
 '8f9b83a8-9659-42f9-b11b-43ae33b9cd12',
 'b24f68f5-4eb9-4d08-a56c-85cf37d29b65',
 '8d9c0409-08a4-4482-a45d-4be33be5c888',
 '7608a009-ea24-4686-8da8-db660467fbe8',
 '50ebb998-74f9-455e-97e9-c25de2f6e493',
 '2e4d1e8b-2b26-41da-b77c-879474517fe6',
 '2fddcd2b-6856-4867-9e29-3a5c164bf218',
 '5e5cee6c-54d1-4197-93e2-bba002f1cc0f',
 '55454303-4862-4165-8c14-a829bf06228b']

In [52]:
query = "under what license is the project under?"
vectorstore.similarity_search(query, filter={'repo': 'feed'})

[Document(id='be1e0300-d681-484d-a6a1-c00777eda937', metadata={'branch': 'main', 'filename': 'README.md', 'owner': 'ModelEarth', 'repo': 'feed', 'source': 'https://raw.githubusercontent.com/ModelEarth/feed/main/README.md'}, page_content='Vite is preferable to Create React App (CRA) because Vite does not rebuild the whole app whenever changes are made. It splits the app into two categories: dependencies and source code. Dependencies do not change often during the development process, so they are not rebuilt each time thanks to Vite.\n\n## Contributions\n\nContributions to the [Feed-Player Github Repo](https://github.com/modelearth/feed/) are welcome! If you have any improvements, bug fixes, or additional features in mind, feel free to fork this repository, make your changes, and submit a pull request.\n\n## License\n\nThis project is licensed under the [MIT License](https://github.com/ModelEarth/feed/blob/main/LICENSE),  \nwhich means you are free to use, modify, and distribute the code

In [65]:
from langchain.chains import RetrievalQA  
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

llm = ChatOpenAI(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
    model_name='gpt-3.5-turbo',
    temperature=0.3
)

# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )

retriever = vectorstore.as_retriever(filter={'repo': 'feed'} )
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [66]:
query1 = """what is the purpose of feed player"""
rag_chain.invoke(query1)

'The purpose of the Feed-Player is to provide a modern and user-friendly interface for viewing a series of images and videos pulled from RSS, JSON, CSV, and YAML sources. It is designed to be fully responsive and packed with features to enhance the viewing experience, including filmstrip navigation using swiper. Users can convert APIs, JSON, and .CSV into video-like presentations with the Feed Player.'

In [60]:
query2 = """what are the features of the project"""
rag_chain.invoke(query2)

'The features of the project include play/pause functionality, stop button, volume control, mute option, full-screen mode, remaining time display, navigation between playlist items, and the ability to play by URL. The new UI and controls are visually appealing and intuitive, designed for easy access to functionalities. The project involves team collaboration for updates such as implementing new controls and features like a progress bar and making the player an embeddable widget.'

In [68]:
query = "under what license is the project?"
rag_chain.invoke(query)

'The project is licensed under the MIT License, which allows users to freely use, modify, and distribute the code.'

In [None]:
# TODO : Sessions in chat implementation
# TODO : multiple DB implementation , modularized code
# TODO : multiple repo processing 