In [103]:
# Install third-party packages into the environment.
# NOTE(review): none of these four packages is imported in the cells visible here;
# presumably they are used further down the notebook — confirm they are still needed.
!pip install pypdf
!pip install langchain_community
!pip install -U langchain-text-splitters
!pip install tabulate

In [None]:
# pydantic provides BaseModel, the base class of the DataModel record defined below.
!pip install pydantic

In [None]:
# NOTE(review): nothing in the visible notebook imports this package. If the Scrapy
# crawling framework was intended, its PyPI name is `scrapy` (single "p") — confirm
# whether this install is needed at all.
!pip install scrappy

In [None]:
!pip install goose3
!pip install --upgrade pymupdf
!pip install --upgrade fitz

## In this notebook we scrape and parse information about Thoughtworks from its website.

- We then want to use this information to fine-tune a model and/or build a RAG question-answering chatbot on top of an LLM.
- As a first approach, we scrape information from web pages and PDFs.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from pydantic import BaseModel
from typing import List, Optional

class DataModel(BaseModel):
    """
    One scraped record: a web page, or a single page of a PDF document.

    Records are stored (as dicts) so they can later be used to fine-tune a model
    or to build the embedding vector DB for the RAG application.
    """
    # Site the record came from: goose's domain or the URL hostname for web pages,
    # "scheme://hostname" for PDF pages.
    domain: Optional[str]
    # Address of the scraped page or PDF (every page of a PDF shares the same url).
    url: str
    # Page title as extracted by the scraper; PDF pages are stored with an empty title.
    title: Optional[str]
    # Extracted plain text of the page.
    text: str
    # Links found on the page.
    hyperlinks: Optional[List[str]]
    # Meta description reported by the HTML extractor; empty for PDF pages.
    meta_description: Optional[str]
    # Authors reported by the HTML extractor; empty list for PDF pages.
    authors: Optional[List[str]]

In [71]:
# Smoke test: build a record passing every field explicitly (None for the optional ones).
DataModel(domain=None, url="http://dada", text="dasdasd", title=None, hyperlinks=None, meta_description=None, authors=None)


DataModel(domain=None, url='http://dada', title=None, text='dasdasd', hyperlinks=None, meta_description=None, authors=None)

In [2]:
import re
def filter_urls_en_es(input_text):
    """
    Keep a URL unless it is localized in a language other than English or Spanish.

    A URL counts as localized when, after the scheme, its path contains a
    locale-shaped segment ``xx-yy`` (two letters, dash, two letters) that is
    followed by ``/``, ``?``, ``#`` or the end of the URL.

    Args:
        input_text: URL (or any link text) to check.

    Returns:
        ``input_text`` unchanged when the URL has no locale segment or has an
        ``en-*`` / ``es-*`` one; ``None`` when it only has other locales.
        Text that does not start with ``http://`` or ``https://`` (for example
        a relative link) is always returned unchanged.
    """
    # `https?` matches exactly http/https (the previous `https*` also accepted "httpsss").
    # The trailing group lets the locale be the LAST path segment, so locale home pages
    # such as ".../de-de" are filtered too, not only ".../de-de/...".
    pattern_lang = re.compile(r"https?://.*/[a-z]{2}-[a-z]{2}(?:[/?#]|$)", re.IGNORECASE)
    pattern_enes = re.compile(r"https?://.*/(?:en|es)-[a-z]{2}(?:[/?#]|$)", re.IGNORECASE)
    if pattern_lang.match(input_text) and not pattern_enes.match(input_text):
        return None
    return input_text
    
# "EM-es" is locale-shaped but its language code is neither en nor es, so this returns None.
filter_urls_en_es("https://www.thoughtworks.com/EM-es/radar")

In [3]:
def download_pdf(url):
    """
    Download a PDF into the local ``tmp/`` folder so it can be parsed from disk.

    Args:
        url: Address of the PDF to download.

    Returns:
        Path of the saved file, of the form ``tmp/qa<token>.pdf``.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response (so an HTML error page is never saved as a ".pdf").
    """
    import os
    import uuid

    import requests

    chunk_size = 2000
    # The target folder may not exist on a fresh checkout.
    os.makedirs("tmp", exist_ok=True)
    # uuid4 avoids the name collisions a small random integer range can produce.
    filepath = f'tmp/qa{uuid.uuid4().hex}.pdf'
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(filepath, 'wb') as fd:
            for chunk in r.iter_content(chunk_size):
                fd.write(chunk)
    return filepath

In [4]:
import fitz

In [5]:

import urllib
def scrap_pdf(url):
    """
    Scrape a PDF with a lower-level package (fitz) so its hyperlinks are accessible.

    The PDF is downloaded with ``download_pdf`` and one ``DataModel`` is built per
    page. Pages without any hyperlink are kept (with an empty ``hyperlinks`` list)
    instead of being silently dropped.

    Args:
        url: Address of the PDF.

    Returns:
        Tuple ``(models, links)``: the per-page ``DataModel`` list and the flat
        list of every link found in the document.
    """
    # Import the submodule explicitly: a bare `import urllib` does not guarantee
    # that `urllib.parse` is loaded.
    from urllib.parse import urlparse

    links = []
    models = []
    filepath = download_pdf(url)
    urlobj = urlparse(url)
    domain = f"{urlobj.scheme}://{urlobj.hostname}"

    doc = fitz.open(filepath)
    try:
        for page in doc:  # iterate the document pages
            text = page.get_text()
            page_links = []
            link = page.first_link  # a `Link` object or `None`
            while link:  # walk the page's linked list of links
                page_links.append(str(link.uri))
                link = link.next
            model = DataModel(authors=[], domain=domain, hyperlinks=page_links, meta_description='', text=text, title='', url=url)
            models.append(model)
            links.extend(page_links)
    finally:
        # Release the file handle even when a page fails to parse.
        doc.close()
    return models, links

In [62]:
def _anchor_hrefs(raw_html):
    """Return the ``href`` of every ``<a>`` tag in ``raw_html``; empty list if parsing fails."""
    try:
        soup = BeautifulSoup(raw_html, "html.parser")
        # href=True skips anchors without an href instead of failing once per tag.
        return [anchor["href"] for anchor in soup.find_all("a", href=True)]
    except Exception as ex:
        print("morelinks_from_soap failed 2", ex)
        return []


def _clean_links(site, raw_links):
    """Turn raw hrefs into de-duplicated absolute http(s) URLs that pass the en/es filter."""
    from urllib.parse import urljoin

    cleaned = []
    seen = set()
    for link in raw_links:
        if not isinstance(link, str):
            continue
        link = link.strip()
        if len(link) <= 2:
            print("skipping", link)
            continue
        if link.startswith("/"):
            link = urljoin(site, link)
        if not link.startswith("http"):
            continue
        # Filter AFTER resolving relative links: the locale filter only recognises
        # absolute URLs, so "/de-de/..." used to slip through.
        if not filter_urls_en_es(link):
            print(f"Skip filter filter_urls_en_es; {link}")
            continue
        if link not in seen:
            seen.add(link)
            cleaned.append(link)
    return cleaned


def _enqueue_links(links):
    """Append to the global ``all_links`` queue the links it does not contain yet."""
    known = set(all_links)
    for link in links:
        if link not in known:
            known.add(link)
            all_links.append(link)


def scraper_html(site: str, recursive: bool):
    """
    Scrape one URL (a web page or a PDF) and store what was found.

    Steps:
    - Parse the web|pdf resource.
    - Extract text and other metadata.
    - Append each record to the global ``all_records`` as a ``DataModel`` dict.
    - When ``recursive`` is true, add newly found links to the global ``all_links``
      queue (never adding a link twice).

    Args:
        site: URL to scrape. Anything that is neither a PDF nor an http(s) URL is
            reported as unsupported and ignored.
        recursive: Whether links discovered in this resource are queued for crawling.

    Returns:
        None. Results are written to the globals ``all_records`` / ``all_links``.
    """
    from urllib.parse import urlparse

    if site.lower().endswith(".pdf"):
        models, links_doc = scrap_pdf(url=site)
        all_records.extend(model.dict() for model in models)
        # PDF links get the same cleaning and `recursive` gating as HTML links.
        if recursive:
            _enqueue_links(_clean_links(site, links_doc))
        return

    if not site.startswith("http"):
        print(f"Unsupported url: {site}")
        return

    try:
        site_data = goose.extract(url=site)
    except Exception:
        # `Exception` (not a bare except) so interrupting the kernel still stops the crawl.
        print("Extract failed for", site)
        return

    # Combine the links goose found with every anchor href in the raw HTML,
    # without mutating goose's own list.
    raw_links = list(site_data.links or []) + _anchor_hrefs(site_data.raw_html)
    clean_links = _clean_links(site, raw_links)

    # if the call was recursive add more links else skip them.
    if recursive:
        _enqueue_links(clean_links)

    data_model = DataModel(
        domain=site_data.domain if site_data.domain else urlparse(site).hostname,
        url=site,
        title=site_data.title,
        hyperlinks=clean_links,
        text=site_data.cleaned_text,
        meta_description=site_data.meta_description,
        authors=site_data.authors,
    )
    all_records.append(data_model.dict())


In [79]:
# Field names of DataModel.
columns = [*DataModel.schema()["properties"]]
# Seed URLs; the crawl loop appends newly discovered links to this queue.
all_links = [
    "https://www.thoughtworks.com/radar",
    "https://www.thoughtworks.com/",
]
# Hostnames of the seeds: only pages on these hosts are crawled recursively.
url_bases = [urllib.parse.urlparse(seed).hostname for seed in all_links]
# Scraped records, stored as DataModel dicts.
all_records = []


In [None]:
import pandas as pd
import urllib
from goose3 import Goose

goose = Goose()

# URLs that already have a record. Tracked in a set updated as we go: the previous
# check read `df`, which is not defined on a fresh kernel and would be stale anyway.
mined_urls = {record["url"] for record in all_records}

index = 0

# `all_links` grows while we iterate, so walk it by index instead of with a for-loop.
while index < len(all_links):
    print(index, len(all_links))
    link = all_links[index]
    index += 1
    if link in mined_urls:
        print("Link already mined", link)
        continue
    print(f"Processing link: {link}")
    mined_urls.add(link)
    # Only follow links found on pages hosted on one of the seed hosts.
    recursive = urllib.parse.urlparse(link).hostname in url_bases
    try:
        scraper_html(link, recursive=recursive)
    except Exception as ex:
        # One broken page/PDF must not abort a long crawl; report it and move on.
        print(f"Scraping failed for {link}: {ex}")


df = pd.DataFrame(all_records)

In [87]:
import pickle

# Persist the raw records twice: the main pickle plus a second copy under another name.
for pickle_path in ('all_records.pickle', 'tmall_records.pickle'):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(all_records, handle, protocol=pickle.HIGHEST_PROTOCOL)

##### Since there is not much data, save all the records in a single file
# One row per scraped record, written as a single parquet file.
df = pd.DataFrame(data=all_records)
df.to_parquet(path="raw_scrap_thoughtworks.parquet")

### Preprocessing Extracted Data

In [410]:
# Reload the raw scrape from disk so preprocessing can run without re-crawling.
df = pd.read_parquet(path="raw_scrap_thoughtworks.parquet")

#### Filter out repeated rows. TODO: check the scraper, since repeated URLs should not happen

In [411]:
# Order rows by URL so repeated URLs sit next to each other.
df = df.sort_values(by="url")

In [412]:
# Character count of each record's text, used by the filters below.
df["len_text"] = df["text"].map(len)

In [413]:
# Keep only records with more than 200 characters of text.
df = df.loc[df["len_text"] > 200]

In [414]:
# Descending by URL, then by text length, so the longest text of each URL comes first.
df = df.sort_values(by=["url", "len_text"], ascending=[False, False])

In [415]:
# Remove exact repeats of the same URL and text, keeping the first occurrence.
df = df.drop_duplicates(subset=["url", "text", "len_text"], keep="first")

#### Add some columns info to support more filters

In [417]:
def get_more_lang_pdfs(input_text):
    """
    Guess a two-letter language code from a URL.

    Two shapes are tried in order:
    1. a ``vol_NN_xx`` fragment: the code is the piece after the last ``_``;
    2. a two-letter path segment ``/xx/``: the code is the two letters.

    In both cases the last occurrence in the URL wins and the code is returned
    with its original casing. Returns ``''`` when neither shape is found.
    """
    volume_hit = re.match(r".*(vol_[0-9][0-9]_[A-Za-z][A-Za-z])/*", input_text, re.IGNORECASE)
    if volume_hit:
        return volume_hit.group(1).split("_")[-1]

    segment_hit = re.match(r".*(/[A-Za-z][A-Za-z]/)/*", input_text, re.IGNORECASE)
    if segment_hit:
        # group(1) is "/xx/"; slice out the two letters between the slashes.
        return segment_hit.group(1)[1:3]

    return ''


In [418]:
def extract_date(input_text):
    """
    Extract a ``YYYY/MM`` path fragment from a URL and return it as ``YYYYMM``.

    The last such fragment in the URL wins. Returns ``''`` when there is none.
    """
    date_hit = re.match(r".*/([0-9][0-9][0-9][0-9]/[0-9][0-9])/*", input_text, re.IGNORECASE)
    if date_hit is None:
        return ''
    year_month = date_hit.group(1)
    return year_month.replace("/", "")

def fill_titles(row):
    """
    Return a title for a record, deriving one from the text for PDF pages.

    PDF records are scraped with an empty title, so for URLs ending in ``.pdf``
    (any casing, matching what the scraper accepts) the title is the first text
    line that, once stripped of surrounding whitespace, has at least 4 characters,
    is not purely numeric and does not mention "thoughtworks".

    Args:
        row: Mapping/Series with the keys ``url``, ``text`` and ``title``.

    Returns:
        The derived title (``''`` if no line qualifies) for PDFs, otherwise
        ``row["title"]`` unchanged.
    """
    if not row["url"].lower().endswith(".pdf"):
        return row["title"]

    for line in row["text"].split('\n'):
        # Strip first so whitespace-only lines cannot be picked as the title.
        candidate = line.strip()
        if len(candidate) >= 4 and not candidate.isnumeric() and "thoughtworks" not in candidate.lower():
            return candidate
    return ''

# Row-wise: compute the title for every record into a new "title_fill" column.
df["title_fill"] = df.apply(fill_titles, axis=1)

In [419]:
def filter_urls_en_es_v2(row):
    """
    Add language and date columns, extracted from the URL, to a DataFrame row.

    Columns written on ``row``:
    - ``lang``: language code of the ``en-xx`` / ``es-xx`` locale segment of the URL
      (lower-cased); otherwise whatever ``get_more_lang_pdfs`` finds, or ``''``.
    - ``lang_priority``: 2 when the region code is ``us``, ``en`` or ``es``; 1 for any
      other en/es region; 10 when no en/es locale segment was found.
    - ``match_url``: the URL with the locale segment replaced by ``_``, so the same
      page in different locales maps to one key; the URL itself when no locale matched.
    - ``date``: result of ``extract_date`` on the URL.

    Args:
        row: DataFrame row (as passed by ``df.apply(..., axis=1)``) with a ``url`` key.

    Returns:
        The same row with the four columns set.
    """
    input_text = row["url"]

    # A locale segment must be a whole path segment: followed by "/", "?", "#" or the end
    # of the URL. Without that boundary "/es-timation" was read as the locale "es-ti".
    pattern_lang = re.compile(r"https?://.*/[a-z]{2}-[a-z]{2}(?:[/?#]|$)", re.IGNORECASE)
    pattern_enes = re.compile(r".*/((?:en|es)-[a-z]{2})(?:[/?#]|$)", re.IGNORECASE)
    lang = ""
    priority = 10
    match_url = input_text
    if pattern_lang.match(input_text):
        locale_match = pattern_enes.match(input_text)
        if locale_match:
            locale = locale_match.group(1)
            lang, lang2 = locale.lower().split("-")
            # Replace the locale exactly as it is written in the URL; replacing its
            # lower-cased form left upper-case locales (e.g. "EN-US") untouched.
            match_url = input_text.replace(locale, "_")
            priority = 2 if lang2 in ("us", "en", "es") else 1
    row["lang"] = lang
    row["lang_priority"] = priority
    row["match_url"] = match_url
    row["date"] = extract_date(input_text)
    if not lang:
        row["lang"] = get_more_lang_pdfs(input_text)
    return row
# Row-wise: add the lang, lang_priority, match_url and date columns to every record.
df = df.apply(filter_urls_en_es_v2, axis=1)

In [423]:
df.shape

(9309, 13)

#### Drop rows whose language is not es|en and documents whose URL date is not in 2024, to reduce the dataset

In [424]:
# Keep the allowed language codes; '' means no language was detected in the URL.
# NOTE(review): "ai" is presumably a "/ai/" path segment picked up by
# get_more_lang_pdfs rather than a real language — confirm.
df = df.loc[df["lang"].isin(["es", "en", "ES", "EN", "ai", ""])]
# Keep rows dated 2024 or with no date in the URL.
df = df.loc[df["date"].str.contains("2024") | df["date"].eq('')]

In [425]:
df.shape

(7030, 13)

In [542]:
# Preview the first two records: title (or a placeholder) and the first 500 characters of text
for i, row in df.head(2).iterrows():
    print(
        row["title"] or "NO TITLE",
        row["text"][:500],
        "_____________________________________",
        sep="\n",
    )

Home ⚡ Zig Programming Language
Focus on debugging your application rather than debugging your programming language knowledge. A fresh approach to metaprogramming based on compile-time code execution and lazy evaluation.
• Use Zig as a zero-dependency, drop-in C/C++ compiler that supports cross-compilation out-of-the-box.
• Leverage to create a consistent development environment across all platforms.
• Add a Zig compilation unit to C/C++ projects; cross-language LTO is enabled by default. // Ensure the list is freed at scope e
_____________________________________
ZoomInfo Privacy Policy

• None ZoomInfo Technologies LLC (“ZoomInfo”) understands that you care about how information about you is used. This Privacy Policy (the “Policy”) explains how we collect information pertaining to businesses and business people (“Business Information”) and all other types of information, including personal information, through our online services (the “Services”), website, and mobile applications (co

In [546]:
df.sample(3)

Unnamed: 0,domain,url,title,text,hyperlinks,meta_description,authors,len_text,title_fill,lang,lang_priority,match_url,date,questions
4718,www.thoughtworks.com,https://www.thoughtworks.com/radar/tools/mixtral,Mixtral | Technology Radar,Worth exploring with the goal of understanding...,[https://www.thoughtworks.com/careers/consulta...,Mixtral is part of the family of open-weight l...,[],840,Mixtral | Technology Radar,,10,https://www.thoughtworks.com/radar/tools/mixtral,,[]
8352,www.oreilly.com,https://www.oreilly.com/library/view/data-mesh...,Data Mesh,"We're at an inflection point in data, where ou...",[https://learning.oreilly.com/library/view/dat...,"We're at an inflection point in data, where ou...",[],1071,Data Mesh,,10,https://www.oreilly.com/library/view/data-mesh...,,[]
6017,www.thoughtworks.com,https://www.thoughtworks.com/es-ec/what-we-do/...,"Experiencia del Cliente, Producto y Diseño","Sorprende a tus clientes, supera a tus competi...",[https://www.thoughtworks.com/en-de/what-we-do...,Alineamos el pensamiento de producto con la ex...,[],1463,"Experiencia del Cliente, Producto y Diseño",es,1,https://www.thoughtworks.com/_/what-we-do/cust...,,[Preguntas como:\n• None ¿Cómo podemos prioriz...


In [7]:
import re

def extract_questions(text):
    """
    Return every question-like fragment found in ``text``.

    A fragment starts at ``¿`` or an upper-case ASCII letter and runs up to the
    next ``?``, provided no ``?``, ``!`` or ``.`` occurs in between (line breaks
    are allowed inside a fragment).
    """
    question_regex = re.compile(r'([¿A-Z][^?!.]*\?)')
    return [hit.group(1) for hit in question_regex.finditer(text)]


In [527]:
# One list of question-like fragments per record.
questions = df["text"].map(extract_questions)

In [528]:
# Attach the extracted questions (one list per record) as a new column.
df["questions"] = questions

In [551]:
# Number of rows with at least one extracted question
len(df.loc[df["questions"].str.len() > 0])

1160

In [534]:
# Use the filled title (derived from the text for PDFs) as the final title,
# then export only the columns of the cleaned dataset.
df["title"] = df["title_fill"]
df.loc[
    :,
    ["domain", "url", "title", "questions", "text", "authors", "meta_description", "lang"],
].to_parquet("thoughtworks_cleaned_dataset.parquet")

In [536]:
df.shape

(7030, 14)

#### Check some questions extracted with regex

In [531]:
# Print a random sample (up to 20 rows) of records that have extracted questions.
# The sample size is capped so the cell also works with fewer than 20 matching rows.
rows_with_questions = df[df["questions"].str.len() > 0].reset_index(drop=True)
sample_size = min(20, len(rows_with_questions))
for row_idx, row in rows_with_questions.sample(sample_size).iterrows():
    print("Url: ", row["url"])
    print("Title: ", row["title"])
    # This line used to be labelled "Title: " as well.
    print("Questions: ", row["questions"])
    print("________________________________")

Url:  https://www.zoominfo.com/about-zoominfo/privacy-policy
Title:  ZoomInfo Privacy Policy
Title:  ['Where does ZoomInfo get the Business Information for its Professional and Business Profiles?', 'How does ZoomInfo Lite (formerly Community Edition) and the ZoomInfo Contact Contributor Work?', 'How does ZoomInfo handle customer information?', 'How else does ZoomInfo Collect Information?']
________________________________
Url:  https://www.w3.org/WAI/standards-guidelines/wcag/
Title:  WCAG 2 Overview
Title:  ['For links to introductory material, see “Where should I start?']
________________________________
Url:  https://www.thoughtworks.com/what-we-do/innovation-during-recession
Title:  Innovation as a solution to uncertain times
Title:  ['How can you ensure your business can weather the storm and come out ahead?']
________________________________
Url:  https://www.thoughtworks.com/what-we-do/enterprise-modernization-platforms-cloud/green-cloud
Title:  Green cloud
Title:  ['Cloud’s tra

In [3]:
import pickle

# Load previously saved results from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load files produced by
# a trusted run of this project. The structure of the data is not shown here;
# the loop below reads element [2] of each item.
with open('./results2.pickle', 'rb') as f:
    x = pickle.load(f)


In [6]:
# Print element [2] of each result, left-justified and padded to at least 80 characters.
for i in x:
    print(f"{i[2]:<80}")


  Based on the provided context, there is no clear answer regarding the specific scale at which it pays off to fine-tune a model with an organization's code. The documents discuss the benefits of using large models for coding assistance and the potential move towards open source models, but they do not provide concrete information on the cost-effectiveness or optimal scale for fine-tuning models with custom code. (NOT FOUND)

  According to the provided documents, taking the first step in product thinking involves looking at both what the customer currently wants and what they might need in the future. The documents emphasize the importance of understanding the customer's perspective and creating solutions to meet their needs, even if those needs aren't explicitly stated. They also suggest that identifying gaps in the market and creating solutions for those gaps can lead to new ideas and products. (Reference: Documents 1, 2)

However, the documents do not provide explicit answers to t