In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date
from weaviate.util import generate_uuid5

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter



# Get the links of the articles

In [6]:
# Crawl the paginated blog archive and collect each post's URL and publish date.
headers = {}
links = []
dates = []
page = 1

base_url = "https://www.astronomer.io"
page_url = base_url + "/blog/{page}/#archive"
# Only posts dated strictly after this day survive the filter applied when the DataFrame is built.
blog_cutoff_date = date(2023,1,19)
# Seconds to wait per request. requests applies no timeout by default, so a
# stalled connection would otherwise hang this cell indefinitely.
request_timeout = 30

response = requests.get(page_url.format(page=page), headers=headers, timeout=request_timeout)

# Walk the archive page by page until the site answers with an error status
# or serves a page that contains no post cards.
while response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    cards = soup.find_all(class_="post-card__cover")
    if not cards:
        # A 200 response without any posts means we ran past the last archive
        # page; without this guard the loop would never terminate.
        break

    # Card hrefs are site-relative, so prefix them with the base URL.
    card_links = [base_url + card.find("a", href=True)["href"] for card in cards]
    links.extend(card_links)

    # NOTE(review): dates come from a separate selector and are later paired
    # with links by position - assumes one "post-card__meta" per card; confirm.
    meta = soup.find_all(class_="post-card__meta")
    dates.extend([post.find("time")["datetime"] for post in meta])

    page = page + 1
    response = requests.get(page_url.format(page=page), headers=headers, timeout=request_timeout)


## Format content into a pandas DataFrame


In [7]:
def fetch_post_content(link):
    """Download one blog post and return it as markdown-style text.

    The page's first ``<h1>`` supplies the title and the element with class
    ``prose`` supplies the body; doubled braces in the body are collapsed to
    single ones. The result is ``"# <title>\\n\\n## <body>"``.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and ``AttributeError`` when the page lacks an ``<h1>`` or a
    ``prose`` element.
    """
    response = requests.get(link, headers=headers, timeout=30)
    # Fail loudly rather than ingesting an error page as if it were a post.
    response.raise_for_status()
    # Parse the HTML once and reuse the tree for both title and body.
    soup = BeautifulSoup(response.content, "lxml")
    title = soup.find("h1").text
    body = soup.find(class_="prose").get_text().replace("{{","{").replace("}}","}")
    return "# " + title + "\n\n## " + body


# Build one row per post published after the cutoff date, then download and
# format each post and derive a deterministic UUID from the formatted text.
df = (
    pd.DataFrame(zip(links, dates), columns=["docLink", "date"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.date)
    .loc[lambda frame: frame["date"] > blog_cutoff_date]
    .drop("date", axis=1)
    .drop_duplicates()
    .assign(
        content=lambda frame: [fetch_post_content(link) for link in frame.docLink],
        docSource="astro blog",
        sha=lambda frame: [generate_uuid5(content) for content in frame.content],
    )
    .reset_index(drop=True)
    .loc[:, ["docSource","sha","content", "docLink"]]
)

df

Unnamed: 0,docSource,sha,content,docLink
0,astro blog,436b9648-1c86-5c86-8a90-1c251d578774,# Introducing Apache Airflow™ on Astro – an Az...,https://www.astronomer.io/blog/introducing-apa...
1,astro blog,4017cd9d-86f5-5466-b7e5-e5a118bd290f,# Apache Airflow TaskFlow API vs. Traditional ...,https://www.astronomer.io/blog/apache-airflow-...
2,astro blog,f50b4035-7e41-58ed-8849-c1264bb90731,# Orchestrating Feature Pipelines: Announcing ...,https://www.astronomer.io/blog/orchestrating-f...
3,astro blog,bc361e11-6a60-5003-bb63-979247bc113d,# Ask Astro: Operationalizing Data Ingest for ...,https://www.astronomer.io/blog/ask-astro-opera...
4,astro blog,73f6958c-0a1a-529c-a54b-01398ea49a6d,# Using Astronomer’s new Cosmos to deploy dbt ...,https://www.astronomer.io/blog/using-astronome...
5,astro blog,bbc8a530-73d6-5bc0-8a92-64dff1f18725,# Databricks vs. Airflow From a Management Per...,https://www.astronomer.io/blog/databricks-vs-a...
6,astro blog,caff2267-6c9a-5f11-ab2d-23f8d0c51cb4,# Migrate Python Jobs to Airflow in 4 Simple S...,https://www.astronomer.io/blog/migrate-python-...
7,astro blog,21878f4e-777c-5206-8e35-5491b2e9a4e9,# 3 Key Takeaways from Airflow Summit 2023\n\n...,https://www.astronomer.io/blog/3-Key-Takeaways...
8,astro blog,ff464c60-59a1-5365-a256-2aa68cf4b4e5,# Ask Astro: An open source LLM Application wi...,https://www.astronomer.io/blog/ask-astro-open-...
9,astro blog,af83215c-c624-5cd8-96de-cb3358aaf61a,# Day 2 Operations for LLMs with Apache Airflo...,https://www.astronomer.io/blog/day-2-operation...


## Split into chunks

In [8]:


# Splitter that cuts text into pieces of at most 4000 characters with a
# 200-character overlap, trying paragraph, line, word and finally character
# boundaries in that order.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=4000,
    chunk_overlap=200,
)


def _split_post(text):
    """Wrap ``text`` in a Document and return the splitter's chunk Documents."""
    return splitter.split_documents([Document(page_content=text)])


# One row per chunk: attach the chunk list, fan it out with explode, then
# overwrite "content" with each chunk's text and drop the helper column.
# NOTE(review): "sha" is carried over unchanged from the unsplit post, so every
# chunk of the same post shares one sha - confirm that is intended downstream.
chunk_rows = df.assign(doc_chunks=df["content"].apply(_split_post))
chunk_rows = chunk_rows.explode("doc_chunks", ignore_index=True)
chunk_rows["content"] = chunk_rows["doc_chunks"].apply(lambda chunk: chunk.page_content)
df = chunk_rows.drop(["doc_chunks"], axis=1).reset_index(drop=True)

df

Unnamed: 0,docSource,sha,content,docLink
0,astro blog,436b9648-1c86-5c86-8a90-1c251d578774,# Introducing Apache Airflow™ on Astro – an Az...,https://www.astronomer.io/blog/introducing-apa...
1,astro blog,436b9648-1c86-5c86-8a90-1c251d578774,"How to Get Started\nGetting started is easy, a...",https://www.astronomer.io/blog/introducing-apa...
2,astro blog,436b9648-1c86-5c86-8a90-1c251d578774,The top-level entity in Astro is an Organizati...,https://www.astronomer.io/blog/introducing-apa...
3,astro blog,436b9648-1c86-5c86-8a90-1c251d578774,First-class support for Azure Private Networki...,https://www.astronomer.io/blog/introducing-apa...
4,astro blog,4017cd9d-86f5-5466-b7e5-e5a118bd290f,# Apache Airflow TaskFlow API vs. Traditional ...,https://www.astronomer.io/blog/apache-airflow-...
...,...,...,...,...
124,astro blog,cc4473b1-91eb-5249-8012-cfc6b6e33a11,# Introducing Support for the Kubernetes Execu...,https://www.astronomer.io/blog/introducing-sup...
125,astro blog,8dcf9888-22b4-568d-90e7-85fc84e401b9,# Get Improved Data Quality Checks in Airflow ...,https://www.astronomer.io/blog/improved-data-q...
126,astro blog,8dcf9888-22b4-568d-90e7-85fc84e401b9,"At Astronomer, we use Great Expectations for s...",https://www.astronomer.io/blog/improved-data-q...
127,astro blog,8dcf9888-22b4-568d-90e7-85fc84e401b9,gx_data_quality_checks = GreatExpectationsOper...,https://www.astronomer.io/blog/improved-data-q...


In [5]:
# Write the blog DataFrame to the dags folder as a parquet file.
parquet_path = "../dags/astro_blog.parquet"
df.to_parquet(parquet_path)

In [None]:
# Reload the saved parquet file as a list of per-row dicts for spot checks.
# Uses the same relative path the export cell writes to instead of an absolute
# path that only exists on one developer's machine.
a = pd.read_parquet("../dags/astro_blog.parquet").to_dict(orient="records")

In [None]:
# Show only the first saved record (as a one-element list of dicts).
# The stray opening bracket that made this cell a SyntaxError is removed, and
# the path now matches the location the export cell writes to.
pd.read_parquet("../dags/astro_blog.parquet").to_dict(orient="records")[0:1]

In [None]:
# Character count of the "content" field of record 28.
len(a[28]["content"])

In [None]:
# Peek at a 50-character window (offsets 8400-8449) of record 28's content.
# NOTE(review): content chunked at 4000 characters would be too short for this
# slice, which would then be empty - presumably this inspected unchunked data; confirm.
record = a[28]
record["content"][8400:8450]

In [None]:
# Preview only the first few records; echoing the entire list would flood the
# notebook output with every chunk's full text.
a[:5]
