In [1]:
!pip install langchain faiss-cpu transformers bs4 -U langchain-community



In [2]:
# Report the version of the `python3` executable found by the shell.
# NOTE(review): `!` runs a shell command, so this may not be the kernel's own
# interpreter — confirm with `sys.version` if the distinction matters.
!python3 --version

Python 3.12.2


## Web-scraping information and writing to file

In [47]:
import requests
from bs4 import BeautifulSoup

from pathlib import Path

URL = "https://smartasset.com/investing/stock-correlation"
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests can wait indefinitely

# Fetch the page and stop on HTTP errors so an error page is never written
# into the knowledge base as if it were article text.
r = requests.get(URL, timeout=REQUEST_TIMEOUT_S)
r.raise_for_status()

# Keep the text of every non-empty <p> element (text is extracted once per element).
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')
paragraph_texts = [text for text in (p.get_text(strip=True) for p in paragraphs) if text]

filename = "corr.txt"
# NOTE(review): /kaggle/input is normally read-only on Kaggle — confirm this is the
# intended destination (the Kaggle variant of this cell writes to /kaggle/working).
output_path = Path(f"/kaggle/input/knowledge_base/{filename}")
output_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the folder exists

# Write one paragraph per block, separated by a blank line.
with open(output_path, 'w', encoding='utf-8') as file:
    for paragraph in paragraph_texts:
        file.write(paragraph + "\n\n")


## Web-scraping information and writing to file (Kaggle working directory)

In [49]:
import requests
from bs4 import BeautifulSoup

URL = "https://smartasset.com/investing/stock-correlation"
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests can wait indefinitely

# Download the article; raise on 4xx/5xx so an error page is not saved as content.
r = requests.get(URL, timeout=REQUEST_TIMEOUT_S)
r.raise_for_status()

# Collect the stripped text of each <p> element, skipping empty ones.
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')
paragraph_texts = []
for p in paragraphs:
    text = p.get_text(strip=True)  # extracted once, reused for the emptiness check
    if text:
        paragraph_texts.append(text)

# Overwrite the output file with the paragraphs, blank-line separated.
filename = "corr.txt"
with open(f"/kaggle/working/{filename}", 'w', encoding='utf-8') as file:
    for paragraph in paragraph_texts:
        file.write(paragraph + "\n\n")

## Appending to file

In [None]:
import requests
from bs4 import BeautifulSoup

# strip() removes stray whitespace/newlines that often come along with a pasted URL.
URL = input("Enter URL to scrape: ").strip()

# Fetch the page; fail on HTTP errors instead of appending an error page to the file.
r = requests.get(URL, timeout=30)  # seconds; avoids hanging forever on a dead host
r.raise_for_status()
# Print a short summary rather than dumping the entire raw HTML into the cell output.
print(f"Fetched {len(r.content)} bytes from {URL}")

# Keep the text of every non-empty <p> element.
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')
paragraph_texts = [text for text in (p.get_text(strip=True) for p in paragraphs) if text]

filename = input("Enter relevant file name: ").strip()
if not filename:
    # An empty name would make the target path the directory itself.
    raise ValueError("File name must not be empty")

# Append (mode 'a') so repeated runs accumulate text from several pages in one file.
with open(f"/content/{filename}", 'a', encoding='utf-8') as file:
    for paragraph in paragraph_texts:
        file.write(paragraph + "\n\n")

In [3]:

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Knowledge-base text files, one per topic.
KNOWLEDGE_BASE_DIR = "Knowledge_Base"
TOPIC_FILES = [
    "candlestick.txt",
    "ma.txt",
    "momentum.txt",
    "rsi.txt",
    "bollinger.txt",
    "corr.txt",
    "cumul.txt",
    "macd.txt",
]
file_paths = [f"{KNOWLEDGE_BASE_DIR}/{name}" for name in TOPIC_FILES]

# Load every file as LangChain documents. The scraping cells write these files as
# UTF-8, so read them with the same encoding instead of the platform default.
documents = []
for file_path in file_paths:
    loader = TextLoader(file_path, encoding="utf-8")
    documents.extend(loader.load())

# Split into ~500-character chunks with a 50-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

ModuleNotFoundError: No module named 'langchain'

In [51]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embed the text of every chunk with a sentence-transformers model.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [52]:
from langchain.vectorstores import FAISS

# Build the FAISS vector store from the chunks, using `embedding_model` as the embedder.
vectorstore = FAISS.from_documents(chunks, embedding_model)

In [54]:
class QAPipelineWrapper:
    """Adapter that calls a question-answering pipeline with a prompt as context.

    The wrapped ``pipeline`` is invoked with a ``{"question", "context"}`` dict
    and must return a mapping that contains an ``"answer"`` entry.
    """

    def __init__(self, pipeline):
        """Store the wrapped pipeline and advertise the task name it serves."""
        self.pipeline = pipeline
        self.task = "question-answering"

    def __call__(self, prompt, **kwargs):
        """Answer a question using ``prompt`` as the context.

        Args:
            prompt: The context, either as a single string or as a sequence of
                strings, in which case only the first element is used.
            **kwargs: ``question`` (optional) is the question to ask; it
                defaults to ``"What is Momentum?"``. Other keys are ignored.

        Returns:
            The ``"answer"`` value of the wrapped pipeline's result.

        Raises:
            IndexError: If ``prompt`` is an empty sequence.
        """
        question = kwargs.get("question", "What is Momentum?")  # Default question if not provided

        # A plain string IS the context; indexing it would keep only its first
        # character. For a batch (list/tuple) of prompts, use the first entry.
        raw_context = prompt if isinstance(prompt, str) else prompt[0]

        inputs = {
            "question": question,
            "context": self._clean_text(raw_context),
        }
        return self.pipeline(inputs)["answer"]

    def _clean_text(self, text):
        """Replace newlines/tabs with spaces and drop every non-ASCII character."""
        text = text.replace("\n", " ").replace("\t", " ")
        return "".join(char for char in text if ord(char) < 128)

In [61]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


from transformers import set_seed

# Generation below samples (do_sample=True), so fix the RNG seed to make the
# answers reproducible across kernel restarts.
SEED = 42
set_seed(SEED)

model_name = "google/flan-t5-large"  # You can choose other models
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the text2text generation pipeline
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,   # Max token length for output
    num_beams=5,      # Beam search for better answers, change from 3
    temperature=0.5,
    do_sample=True,
    top_p=0.9,        # nucleus sampling
)

# Wrap the pipeline for LangChain
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Retrieve context from the FAISS store built in an earlier cell
retriever = vectorstore.as_retriever()

# Create a RetrievalQA chain
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",  # "map_reduce", "refine", "map_rerank", etc.
    retriever=retriever,
)

question = "Can you explain momentum in absolute detail?"
# `Chain.run` is deprecated; `invoke` takes the input under "query" and returns
# the answer under "result".
response = rag_pipeline.invoke({"query": question})["result"]
print(f"Q: {question}\nA: {response}")

Device set to use cuda:0


Q: Can you explain momentum in absolute detail?
A: Momentum investing is primarily a short-term strategy. It focuses on identifying stocks that are currently trending upward and buying them with the expectation that the trend will continue in the short term. As soon as the momentum starts to fade or reverse, the investor sells the stock to capture the short-term gains. To calculate a stock's momentum, you can use the following formula: Momentum = Current Price / Price X months ago


In [63]:
import json

# Persist the FAISS index so the retriever can be reloaded later.
vectorstore.save_local("/kaggle/working/retriever_data/")
print("Vectorstore saved successfully!")


save_directory = "/kaggle/working/generative_model"
# Use a distinct name here: rebinding `pipeline` would shadow the
# `transformers.pipeline` factory function imported by the model-setup cell.
hf_pipe = llm.pipeline

# Model weights and tokenizer files go into the same directory.
hf_pipe.model.save_pretrained(save_directory)
hf_pipe.tokenizer.save_pretrained(save_directory)
print("Generative model and tokenizer saved successfully!")

# Save the configuration of the RAG pipeline
rag_config = {
    # Must match the chain type the pipeline was actually built with ("refine"),
    # otherwise a reload recreates a different chain.
    "chain_type": "refine",
    "retriever_path": "/kaggle/working/retriever_data",
    "model_path": save_directory,
    "tokenizer_path": save_directory,
    # Reloading the FAISS index needs the same embedding model that built it.
    "embedding_model_name": embedding_model.model_name,
}


with open("/kaggle/working/rag_config.json", "w") as f:
    json.dump(rag_config, f)

print("RAG pipeline configuration saved successfully!")


Vectorstore saved successfully!
Generative model and tokenizer saved successfully!
RAG pipeline configuration saved successfully!


In [62]:
question = "What is MACD ?"
# `Chain.run` is deprecated; `invoke` takes the question under "query" and
# returns the generated answer under "result".
response = rag_pipeline.invoke({"query": question})["result"]
print(f"Q: {question}\nA: {response}")

Q: What is MACD ?
A: MACD full form: Moving Average Convergence Divergence and is one of the most widely used momentum indicators in technical analysis. Gerald Appel was the creator of this indicator at the end of the 1970s. By computing the distinction between two time period intervals, which are a compilation of historical time series, this indicator is used to define momentum and its directional resilience. MACD uses moving averages of two distinct time intervals (most commonly historical closing prices of ------------


In [None]:
import json
from langchain.chains import RetrievalQA
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import FAISS

from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForSeq2SeqLM

# Load the RAG configuration
with open("/kaggle/working/rag_config.json", "r") as f:
    rag_config = json.load(f)

# FAISS.load_local needs the embedding model that built the index in order to
# embed incoming queries. Fall back to the model used when the index was created
# if the config has no "embedding_model_name" entry.
embedding_model = HuggingFaceEmbeddings(
    model_name=rag_config.get(
        "embedding_model_name", "sentence-transformers/all-MiniLM-L6-v2"
    )
)

# Load the vector store.
# NOTE: loading unpickles the stored docstore, which is why LangChain requires the
# explicit opt-in below. Only load index files that you created yourself.
vectorstore = FAISS.load_local(
    rag_config["retriever_path"],
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Load the model and tokenizer. The saved checkpoint is a seq2seq (FLAN-T5) model,
# so it must be loaded with the seq2seq class rather than a QA-head class.
model = AutoModelForSeq2SeqLM.from_pretrained(rag_config["model_path"])
tokenizer = AutoTokenizer.from_pretrained(rag_config["tokenizer_path"])

# Create the generation pipeline. HuggingFacePipeline supports generation tasks
# such as "text2text-generation", not "question-answering".
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  # same output budget as when the pipeline was first built
)

# Wrap it in LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Initialize the retriever
retriever = vectorstore.as_retriever()

# Recreate the RAG pipeline
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=rag_config["chain_type"],  # Use the saved chain type
    retriever=retriever,
)

# Ask a question (`invoke` replaces the deprecated `run`; the answer is under "result")
response = rag_pipeline.invoke({"query": "What is Momentum?"})["result"]
print(response)