In [1]:
from bs4 import BeautifulSoup
import requests

def extract_text_from(url, timeout=30):
    """Download a web page and return its text with blank lines removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Defaults to 30.

    Returns:
        The result of BeautifulSoup's ``get_text()`` on the downloaded HTML,
        with every line stripped of surrounding whitespace, empty lines
        dropped, and the remaining lines joined by ``"\\n"``.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out. The HTTP status code is not checked, so the body of an
            error page is processed like any other page.
    """
    # Without an explicit timeout a single unresponsive server would block
    # the caller indefinitely.
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")
    text = soup.get_text()

    # Lazily strip each line, then keep only the non-empty ones.
    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)

In [2]:
import xmltodict

# Fetch the finn.no job Atom feed (the URL asks for rows=200) and parse it
# into nested dicts with xmltodict.
# The timeout keeps this cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the XML parser.
r = requests.get("https://www.finn.no/feed/job/atom.xml?rows=200", timeout=30)
r.raise_for_status()
xml = r.text
raw = xmltodict.parse(xml)

In [None]:
# Download the text of every ad referenced by the feed.
# Each element of `pages` is {'text': <page text>, 'source': <ad URL>}.
pages = []

# xmltodict yields a single dict (not a list) when the feed has exactly one
# <entry>, and no 'entry' key at all when it has none; normalise to a list.
entries = raw['feed'].get('entry') or []
if isinstance(entries, dict):
    entries = [entries]

for info in entries:
    url = info['link']['@href']
    # Only follow links hosted on finn.no. A prefix test is used because a
    # substring test would also accept foreign URLs that merely embed the
    # finn.no address (e.g. in a query string).
    if not url.startswith('https://www.finn.no/'):
        continue
    try:
        text = extract_text_from(url)
    except requests.RequestException as exc:
        # One unreachable ad should not discard the pages already fetched.
        print(f"Skipping {url}: {exc}")
        continue
    pages.append({'text': text, 'source': url})

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every downloaded page into chunks and record, for each chunk, the URL
# it came from. `docs[i]` and `metadatas[i]` describe the same chunk.
# NOTE(review): chunk_size=2000 is presumably measured in characters (the
# splitter's default length function) — confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
docs, metadatas = [], []
for page in pages:
    splits = text_splitter.split_text(page['text'])
    docs.extend(splits)
    # Build one fresh dict per chunk. `[{...}] * n` would repeat the *same*
    # dict object n times, so changing one chunk's metadata later would
    # silently change it for every chunk of the page.
    metadatas.extend({"source": page['source']} for _ in splits)
    print(f"Split {page['source']} into {len(splits)} chunks")


In [3]:
import faiss
from langchain.chat_models  import ChatOpenAI
from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
#from langchain.chains import RetrievalQAWithSourcesChain
import pickle

# Load the prebuilt vector store from disk.
# NOTE(review): none of the cells visible here writes faiss_store.pkl, so it
# has to exist beforehand — confirm how and where it is produced.
# SECURITY: unpickling can execute arbitrary code embedded in the file. Only
# load a faiss_store.pkl that you created yourself or otherwise trust.
with open("faiss_store.pkl", "rb") as f:
    store = pickle.load(f)

# Build the question-answering chain that answers from the stored chunks and
# reports which sources it used.
# TODO(review): a commented-out alternative in this notebook points at
# RetrievalQAWithSourcesChain; check whether the installed LangChain version
# expects that class instead of VectorDBQAWithSourcesChain.
chain = VectorDBQAWithSourcesChain.from_llm(
    llm=OpenAI(temperature=0),
    vectorstore=store,
)



In [4]:
# Smoke-test the chain with a sample query in Norwegian
# ("Looking for a waiter job") and print the answer ("Svar") followed by the
# ads it was drawn from ("Annonser").
result = chain({"question": "Leter etter servitørjobb"})

answer = result['answer']
print(f"Svar: {answer}")
sources = result['sources']
print(f"Annonser: {sources}")

Svar:  There are several job postings for servitør positions in Norway.

Annonser: https://www.finn.no/312389367
https://www.finn.no/312407873
https://www.finn.no/312456004
https://www.finn.no/312424930


In [5]:
def finn_query(message, history=""):
    """Answer a chat message with the QA chain and format the reply.

    Args:
        message: The user's question, forwarded to the chain as "question".
        history: Accepted so the function can be used as a chat callback;
            it is not read.

    Returns:
        A two-line string: "Svar: <answer>" followed by "Annonser: <sources>".
    """
    response = chain({"question": message})
    answer = response['answer']
    sources = response['sources']
    return "\n".join((f"Svar: {answer}", f"Annonser: {sources}"))

In [7]:
import gradio as gr

In [13]:
# Chat front end for the job search: every message typed by the user is passed
# to finn_query and the returned string is shown as the reply.
demo = gr.ChatInterface(
    finn_query,
    title="Søk etter stillinger på finn.no",
    description="Basert på 200 stillingsannonser fra finn.no (https://www.finn.no/feed/job/atom.xml?rows=200).",
    examples=[
        "Jeg leter etter servitørjobb.",
        "Jeg ser etter sykepleierstillinger.",
        "Er det noen ledige lederstillinger ute?",
    ],
)

# share=True also publishes the app on a temporary public gradio.live URL
# (see the recorded output), so anyone with the link can query the chain.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://8f2d1e106faa5be0d4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [14]:
# Close all running Gradio apps, which includes the demo launched above.
gr.close_all()