In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import openai
from dotenv import load_dotenv, find_dotenv
import os
import sys

sys.path.append("../..")


_ = load_dotenv(find_dotenv())  # read local .env file

HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

openai.api_key = os.environ["OPENAI_API_KEY"]
ELASTIC_PASSWORD = os.environ["ELASTIC_PASSWORD"]

In [3]:
import langchain

langchain.debug = False

In [4]:
from langchain_community.embeddings import (
    HuggingFaceInferenceAPIEmbeddings,
    OpenAIEmbeddings,
)

embeddings_hf = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=HUGGINGFACEHUB_API_TOKEN,
)

embeddings_openai = OpenAIEmbeddings()

  warn_deprecated(


In [6]:
import asyncio
from pyppeteer import launch
from bs4 import BeautifulSoup
import nest_asyncio
page = None


async def main(url):
    global page
    browser = await launch()
    page = await browser.newPage()
    await page.goto(url)

    # Get the page content after JavaScript execution
    page_content = await page.content()

    # Close the browser
    await browser.close()

    # Now you can use BeautifulSoup to parse the page content
    soup = BeautifulSoup(page_content, 'html.parser')

    # Perform scraping using BeautifulSoup on the dynamically rendered content
    # ...return soup
    # print(soup.prettify())
    return soup

nest_asyncio.apply()
url = "https://www.mdcsoftware.com.vn/"
soup = asyncio.get_event_loop().run_until_complete(main(url))

In [None]:
from bs4 import BeautifulSoup

recruiment = BeautifulSoup()
block = "MDC SoftwareYou need to enable JavaScript to run this app."

In [8]:
while not recruiment.text or recruiment.text == block:
    recruiment = asyncio.get_event_loop().run_until_complete(main(url + "recruitment"))

In [11]:
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4"),
    ("h5", "Header 5"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

html_header_splits = html_splitter.split_text(
    soup.text[:12] + soup.text[58:] + "\n" + recruiment.text[113:-225])

chunk_size = 200
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

docs = text_splitter.split_documents(html_header_splits)

In [None]:
from agentic_chunking import get_propositions

propositions = get_propositions(
    soup.text[:12] + soup.text[58:] + "\n" + recruiment.text[113:-225])

In [5]:
propositions = ['About Us: MDC Software is a technology company specializing in product design and development for high growth companies.',
                'Services and Products: MDC Software offers services such as web application development, artificial intelligence, internet of things, UX design, streaming solutions, product marketing, payment integration, blockchain, technical advice, and more. They also have developed products such as iVPN, iMatch, Advanced IPTV Player, and Can Knockdown.',
                'Address: MDC VIETNAM MEDIA AND TECHNOLOGY COMPANY LIMITED, 15th Floor, Viet A Building, No. 9 Duy Tan, Cau Giay, Hanoi',
                'Contact: 0868.733.900, hello@mdcsoftware.com.vn',
                'Recruitment: MDC Media is currently recruiting front-end react native developers and freshers for new projects. The job description includes JavaScript programming, participation in project development, and application development on platforms such as React Native and ReactJS.',
                "Mission: MDC Software's mission is to create new value for customers and partners through technology solutions.",
                'FAQ: Do you intend to find companies writing mobile applications? MDC Media is working in Hanoi City specializing in mobile development and web design for individuals, companies, and startups.']

## FAISS db


In [None]:
from langchain_community.vectorstores import FAISS

db = await FAISS.afrom_texts(propositions, embeddings_openai)

## Elastic Search


In [8]:
ELS_API_KEY = os.environ["ELS_API_KEY"]
ELS_CLOUD_ID = os.environ["ELS_CLOUD_ID"]

In [40]:
from elasticsearch import Elasticsearch
from langchain_elasticsearch import ElasticsearchStore

# es_client = elasticsearch.Elasticsearch(
#     cloud_id=ELS_CLOUD_ID,
#     api_key=ELS_API_KEY,
#     max_retries=5,
# )
es = Elasticsearch(
    "https://localhost:9200",
    # hosts = "https://51.89.155.81:80",
    ca_certs="./ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    max_retries=5,
)
vectorstore = ElasticsearchStore(
    index_name="mdc", es_connection=es, embedding=embeddings_openai
)

In [41]:
es.info()

ObjectApiResponse({'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'EnYfC3zPQwq7sOTNroFG3A', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [15]:
vectorstore.add_texts(propositions)

['668ef46b-dfd8-4e3f-8240-ccfecd315573',
 'a96158cc-8009-4e22-be5e-7044d9362590',
 '72ace49b-ee51-4263-823d-c3a76cd742db',
 '4103ed46-6d8c-4dbd-8d82-87442ebfe4a7',
 'f92d6d95-f08c-4034-92fc-cd2a224badd3',
 'f3a5c945-efe3-40c3-81e9-38b9e0e5c107',
 'a90a803a-8701-41cc-9361-eb12b9e9ddf7']

In [17]:
from langchain.chat_models import ChatOpenAI

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnablePassthrough,
)
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})

template = """You're helpful assistant, please answer the question based only on the following context:
{context}
Your response based on user's question language.
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
)
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [18]:
chain.invoke("Can I get a job at MDC Software?")

Based on the information provided, MDC Software is currently recruiting front-end react native developers and freshers for new projects. If you have the required skills and qualifications, you may be able to apply for a job at MDC Software.

'Based on the information provided, MDC Software is currently recruiting front-end react native developers and freshers for new projects. If you have the required skills and qualifications, you may be able to apply for a job at MDC Software.'