# Web Scraping Script

## Me salva

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def load_page(url, timeout=30):
    """Download *url* and return the HTTP response, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a single stalled server would block the whole
            scraping run forever.

    Returns:
        The ``requests.Response`` when the request succeeded with a
        non-error status code; ``None`` when the request failed, timed out
        or returned an HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx statuses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error loading page {url}: {e}")
        return None
    return response

def get_subject_links(url):
    """Collect the first link found inside every ``div.row`` of a page.

    Args:
        url: Address of the subject index page.

    Returns:
        A list with the ``href`` of the first anchor of each ``div`` whose
        class is ``row``. An empty list is returned when the page cannot be
        loaded or any error occurs while parsing (the error is printed).
    """
    try:
        page = load_page(url)
        if not page:
            return []
        document = BeautifulSoup(page.content, 'html.parser')
        hrefs = []
        for row_div in document.find_all('div', class_='row'):
            anchor = row_div.find('a')
            # Rows without any anchor are skipped.
            if anchor:
                hrefs.append(anchor['href'])
        return hrefs
    except Exception as e:
        print(f"Error getting subject links from {url}: {e}")
    return []

def get_especific_subject_links(url):
    """Collect the article links listed in a subject page.

    Args:
        url: Address of the subject page.

    Returns:
        The ``href`` of the first anchor of every ``<li>`` found inside the
        ``div`` with class ``entry-content clearfix``. An empty list is
        returned when the page cannot be loaded, the container is missing,
        or an error occurs (the error is printed).
    """
    try:
        page = load_page(url)
        if page:
            document = BeautifulSoup(page.content, 'html.parser')
            container = document.find('div', class_='entry-content clearfix')
            if container:
                anchors = [item.find('a') for item in container.find_all('li')]
                # List items without an anchor are ignored.
                return [anchor['href'] for anchor in anchors if anchor]
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_content(url):
    """Extract the title and paragraph text of an article page.

    Args:
        url: Address of the article.

    Returns:
        A ``(title, text)`` tuple. ``text`` is the text of every ``<p>``
        inside ``<main id="main">`` joined with newlines; ``title`` is the
        text of ``h1.entry-title`` or ``"No title found"`` when that heading
        is absent. ``("No title found", "")`` is returned when the page
        cannot be loaded, has no main element, or an error occurs.
    """
    fallback = ("No title found", "")
    try:
        page = load_page(url)
        if not page:
            return fallback
        document = BeautifulSoup(page.content, 'html.parser')
        main_tag = document.find('main', id='main')
        if not main_tag:
            return fallback
        body_text = '\n'.join(p_tag.text for p_tag in main_tag.find_all('p'))
        heading = document.find('h1', class_='entry-title')
        if heading:
            title = heading.text
        else:
            title = "No title found"
        return title, body_text
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return fallback

from tqdm import tqdm  # Make sure to import tqdm

def scrape_subjects_to_dataframe(base_url, subjects):
    """Scrape every article of every subject into a DataFrame.

    Args:
        base_url: Site root; each subject slug and a trailing ``/`` are
            appended directly to it.
        subjects: Iterable of subject slugs.

    Returns:
        A ``pandas.DataFrame`` with one row per article and the columns
        ``title``, ``url``, ``content``, ``subject`` and ``sub-subject``.
    """
    data = []
    for subject in subjects:
        print(f"Scraping {subject.capitalize()}...")
        topic_pages = get_subject_links(f"{base_url}{subject}/")
        for topic_page in tqdm(topic_pages, desc=f"{subject.capitalize()} Subjects"):
            article_urls = get_especific_subject_links(topic_page)
            inner_bar = tqdm(article_urls, desc=f"{subject.capitalize()} Sub-subjects", leave=False)
            for specific_url in inner_bar:
                title, content = get_content(specific_url)
                url_parts = specific_url.split('/')
                record = {
                    "title": title,
                    "url": specific_url,
                    "content": content,
                    "subject": subject.capitalize(),
                    # Second-to-last URL segment is taken as the sub-subject.
                    "sub-subject": url_parts[-2],
                }
                data.append(record)
                # Progress message every 500 scraped pages.
                if not len(data) % 500:
                    print(f"Scraped {len(data)} pages")
    return pd.DataFrame(data)

# Site root (must end with "/": subject slugs are appended directly to it)
# and the subject slugs to crawl.
base_url = "https://resumos.mesalva.com/"
subjects = [
    "matematica", "fisica", "quimica", "biologia", "historia",
    "geografia", "filosofia", "sociologia", "portugues",
    "literatura", "artes"
]

# Run the crawl; `df` gets one row per scraped article.
df = scrape_subjects_to_dataframe(base_url, subjects)

# Persist the result so later cells can reload it without re-scraping.
df.to_csv("scraped_data.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())


In [None]:
# Reload the scraped articles from disk instead of re-running the crawl.
df = pd.read_csv("scraped_data.csv")

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")  # encoding used for all token counts below
# Token count per article; str() lets non-string cells be encoded too
# (presumably NaN for empty content read back from CSV - TODO confirm).
df["token_count"] = df["content"].apply(lambda x: len(tokenizer.encode(str(x))))

# Total number of tokens across all articles.
total_tokens = df['token_count'].sum()


In [None]:
# Recompute the per-article token counts; relies on `tokenizer` created in
# an earlier cell.
df["token_count"] = df["content"].apply(lambda x: len(tokenizer.encode(str(x))))

# Total number of tokens across all articles.
total_tokens = df['token_count'].sum()


In [None]:
# Display the total token count as the cell output.
total_tokens

## Kuadro

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def load_page(url, timeout=30):
    """Fetch *url* and return the response, or ``None`` if the request fails.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; prevents a stalled
            connection from hanging the scrape indefinitely.

    Returns:
        The ``requests.Response`` on success, ``None`` on any request
        error, timeout or HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise for HTTP error statuses
    except requests.exceptions.RequestException as e:
        print(f"Error loading page {url}: {e}")
        return None
    return response

def get_subject_links(url):
    """Collect the category links of the Kuadro summaries index page.

    Args:
        url: Ignored - see the note in the body.

    Returns:
        The ``href`` of every category anchor found in the course rows, or
        an empty list when the page cannot be loaded or parsing fails (the
        error is printed).
    """
    try:
        # NOTE(review): the argument is overridden with the Kuadro index
        # page, so callers' URLs are ignored. Kept as-is to preserve what
        # this cell currently scrapes - confirm whether that is intended.
        url = 'https://www.kuadro.com.br/resumos-enem-vestibulares'
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            rows = soup.find_all('div', class_='Summaries__CourseRow-sc-4nmrha-0 gWDZhf')
            urls = []
            for row in rows:
                categories = row.find_all('a', class_='Summaries__Category-sc-4nmrha-3 deMeER')
                urls += [category['href'] for category in categories]
            # Return inside the branch: `urls` only exists when the page
            # loaded, so a failed download no longer triggers a spurious
            # unbound-variable error.
            return urls
    except Exception as e:
        print(f"Error getting subject links from {url}: {e}")
    return []

def get_especific_subject_links(url):
    """Return the links listed inside the page's main content container.

    Args:
        url: Address of the page to inspect.

    Returns:
        For each ``<li>`` inside ``div.entry-content.clearfix`` that holds
        an anchor, the ``href`` of its first anchor. An empty list when the
        page cannot be loaded, the container is absent, or an error occurs
        (the error is printed).
    """
    try:
        page = load_page(url)
        if not page:
            return []
        content_div = BeautifulSoup(page.content, 'html.parser').find(
            'div', class_='entry-content clearfix'
        )
        if not content_div:
            return []
        found = []
        for list_item in content_div.find_all('li'):
            anchor = list_item.find('a')
            if anchor:
                found.append(anchor['href'])
        return found
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_content(url):
    """Return ``(title, text)`` for the article at *url*.

    Args:
        url: Address of the article page.

    Returns:
        ``text`` is the newline-joined text of all ``<p>`` elements under
        ``<main id="main">``; ``title`` is the text of ``h1.entry-title``,
        or ``"No title found"`` when there is no such heading. When the
        page cannot be loaded, has no main element, or parsing raises,
        ``("No title found", "")`` is returned.
    """
    try:
        page = load_page(url)
        if page:
            document = BeautifulSoup(page.content, 'html.parser')
            main_tag = document.find('main', id='main')
            if main_tag:
                pieces = [p_tag.text for p_tag in main_tag.find_all('p')]
                heading = document.find('h1', class_='entry-title')
                title = heading.text if heading else "No title found"
                return title, '\n'.join(pieces)
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""

from tqdm import tqdm  # Make sure to import tqdm

def scrape_subjects_to_dataframe(base_url, subjects):
    """Crawl each subject and gather its articles into a DataFrame.

    Args:
        base_url: Site root that subject slugs are appended to.
        subjects: Iterable of subject slugs.

    Returns:
        A ``pandas.DataFrame`` whose rows have the keys ``title``, ``url``,
        ``content``, ``subject`` and ``sub-subject``.
    """
    data = []
    for subject in subjects:
        subject_url = f"{base_url}{subject}/"
        print(f"Scraping {subject.capitalize()}...")
        outer_bar = tqdm(get_subject_links(subject_url), desc=f"{subject.capitalize()} Subjects")
        for listing_url in outer_bar:
            inner_bar = tqdm(
                get_especific_subject_links(listing_url),
                desc=f"{subject.capitalize()} Sub-subjects",
                leave=False,
            )
            for specific_url in inner_bar:
                title, content = get_content(specific_url)
                # The sub-subject is read from the second-to-last URL segment.
                segments = specific_url.split('/')
                data.append({
                    "title": title,
                    "url": specific_url,
                    "content": content,
                    "subject": subject.capitalize(),
                    "sub-subject": segments[-2],
                })
                scraped_so_far = len(data)
                if scraped_so_far % 500 == 0:
                    print(f"Scraped {len(data)} pages")
    return pd.DataFrame(data)

# Site root and subject slugs passed to the scraper.
# NOTE(review): these are still the Me Salva values although this cell sits
# in the Kuadro section - confirm they are what this run should use.
base_url = "https://resumos.mesalva.com/"
subjects = [
    "matematica", "fisica", "quimica", "biologia", "historia",
    "geografia", "filosofia", "sociologia", "portugues",
    "literatura", "artes"
]

# Run the crawl; `df` gets one row per scraped article.
df = scrape_subjects_to_dataframe(base_url, subjects)

# Saving is disabled here, so this run does not overwrite scraped_data.csv:
#df.to_csv("scraped_data.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())


In [None]:
# Exploratory cell: collect the category links of the Kuadro summaries index.
url = 'https://www.kuadro.com.br/resumos-enem-vestibulares'
response = load_page(url)
if response:
    soup = BeautifulSoup(response.content, 'html.parser')
    # Each course is rendered as a div with this class pair.
    rows = soup.find_all('div', class_='Summaries__CourseRow-sc-4nmrha-0 gWDZhf')
    # Inside every course row, the category anchors carry this class pair.
    # `urls` is only defined when the page loaded successfully.
    urls = []
    for row in rows:
        categories = row.find_all('a', class_='Summaries__Category-sc-4nmrha-3 deMeER')
        urls += [category['href'] for category in categories]
                                  

In [None]:
# Page inspected by the following exploratory cell.
url = 'https://www.kuadro.com.br/resumos-enem-vestibulares/'

In [None]:
# Exploratory cell: list the links found in the page's
# "entry-content clearfix" container, if the page has one.
response = load_page(url)
if response:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', class_='entry-content clearfix')
    if div:
        lis = div.find_all('li')
        # First anchor of every list item that has one.
        urls = [li.find('a')['href'] for li in lis if li.find('a')]
        

## Kuadro

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  # Ensure tqdm is installed or remove if not needed

In [None]:
def load_page(url, timeout=30):
    """Request *url* and hand back the response, or ``None`` on any failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. A timeout is required so
            that an unresponsive host cannot block the crawl forever.

    Returns:
        The ``requests.Response`` for a successful request; ``None`` when
        the request fails, times out, or the server answers with an HTTP
        error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # HTTP error statuses become exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error loading page {url}: {e}")
        return None
    return response

In [None]:

def get_subject_links(url):
    """Collect the category links of a Kuadro summaries index page.

    Args:
        url: Address of the index page.

    Returns:
        The ``href`` of every category anchor found inside the course rows,
        in page order. An empty list when the page cannot be loaded or an
        error occurs while parsing (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            # One div per course; the category anchors live inside it.
            rows = soup.find_all('div', class_='Summaries__CourseRow-sc-4nmrha-0 gWDZhf')
            return [
                category['href']
                for row in rows
                for category in row.find_all('a', class_='Summaries__Category-sc-4nmrha-3 deMeER')
            ]
    except Exception as e:
        # Message names this function's job (subject links), not the
        # sub-subject lookup it was copied from.
        print(f"Error getting subject links from {url}: {e}")
    return []

def get_especific_subject_links(url):
    """Collect the summary links listed on a Kuadro category page.

    Args:
        url: Address of the category page.

    Returns:
        The ``href`` of every anchor whose class is
        ``Categories__CategoryRow-sc-e5b8e1-0 vxFdC``. An empty list when
        the page cannot be loaded or an error occurs (the error is printed).
    """
    try:
        page = load_page(url)
        if not page:
            return []
        document = BeautifulSoup(page.content, 'html.parser')
        hrefs = []
        for anchor in document.find_all('a', class_='Categories__CategoryRow-sc-e5b8e1-0 vxFdC'):
            hrefs.append(anchor['href'])
        return hrefs
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_content(url):
    """Extract the title and body text of a Kuadro summary page.

    Args:
        url: Address of the summary page.

    Returns:
        A ``(title, text)`` tuple. ``text`` is the full text of the summary
        body div (``""`` when the div is missing); ``title`` is the text of
        the page heading (``"No title found"`` when it is missing).
        ``("No title found", "")`` is returned when the page cannot be
        loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find('div', class_='resumo__Body-sc-v675m5-4 bQlNAk')
            heading = soup.find('h1', class_='Head__Title-sc-17jjrd5-2 eUaNXA')
            # A missing element only blanks its own field instead of
            # discarding whatever else was found on the page.
            text = body.text if body is not None else ""
            title = heading.text if heading is not None else "No title found"
            # Return inside the branch: the values only exist when the page
            # loaded, so a failed download falls through to the default.
            return title, text
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""

In [None]:
def scrape_subjects_to_dataframe(base_url, resume_url):
    """Crawl the summaries index and gather every summary into a DataFrame.

    Args:
        base_url: Site root without a trailing slash.
        resume_url: Path of the summaries index, appended to ``base_url``.

    Returns:
        A ``pandas.DataFrame`` with one row per summary and the columns
        ``title``, ``url``, ``content``, ``subject`` and ``sub-subject``.
    """
    from urllib.parse import urljoin

    data = []
    # Joining against the root resolves hrefs whether or not they start
    # with "/", which avoids the "//" that plain concatenation produced
    # after the domain.
    root = f"{base_url}/"
    url = f"{base_url}{resume_url}"
    print(f"Scraping {resume_url.capitalize()}...")
    subject_links = get_subject_links(url)
    for subject_link in tqdm(subject_links, desc="Subjects"):
        subject_link = urljoin(root, subject_link)
        specific_links = get_especific_subject_links(subject_link)
        for specific_url in tqdm(specific_links, desc="Sub-subjects", leave=False):
            specific_url = urljoin(root, specific_url)
            title, content = get_content(specific_url)
            segments = specific_url.split('/')
            sub_subject = segments[-1]  # last URL segment
            subject = segments[-3]  # third-from-last URL segment
            data.append({
                "title": title,
                "url": specific_url,
                "content": content,
                "subject": subject.capitalize(),
                "sub-subject": sub_subject
            })
            # Progress message every 500 scraped pages.
            if len(data) % 500 == 0:
                print(f"Scraped {len(data)} pages")
    return pd.DataFrame(data)

# Site root (no trailing slash) and the path of the summaries index page.
base_url = "https://www.kuadro.com.br"
resume_url = "/resumos-enem-vestibulares"

# Run the crawl; `df` gets one row per scraped summary.
df = scrape_subjects_to_dataframe(base_url, resume_url)

# Persist the result.
df.to_csv("kuadro_resume.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())

In [None]:
# The scraped URLs contain a double "/" right after ".com.br".
# The clean-up below is disabled; NOTE(review): as written it would also
# collapse the "//" of "https://", so it needs a narrower pattern first.
# df['url'] = df['url'].str.replace('//', '/')
df.to_csv("kuadro_resume.csv", index=False)

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")  # encoding used for the token counts
# Token count per summary; str() lets non-string cells be encoded too.
df["token_count"] = df["content"].apply(lambda x: len(tokenizer.encode(str(x))))

# Total number of tokens across all summaries.
total_tokens = df['token_count'].sum()


In [None]:
# Display the total token count as the cell output.
total_tokens

## Brasil Escola


In [None]:
def get_especific_subject_links(url):
    """Collect the related-article links of a Brasil Escola subject page.

    Args:
        url: Address of the subject page.

    Returns:
        The distinct ``href`` values of the related-article anchors, with
        the ``sec`` variants first and first-seen order preserved. An empty
        list when the page cannot be loaded or an error occurs (the error
        is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            a = soup.find_all('a', class_='layer-artigo--relacionado sec')
            b = soup.find_all('a', class_='layer-artigo--relacionado')
            # Searching for a single class also matches anchors that carry
            # extra classes, so `b` repeats everything in `a`. Dropping the
            # duplicates keeps each article from being scraped twice.
            hrefs = [anchor['href'] for anchor in a + b]
            return list(dict.fromkeys(hrefs))
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

# Exploratory check: how many article links does the mathematics page yield?
url = 'https://brasilescola.uol.com.br/matematica'
len(get_especific_subject_links(url))


In [None]:
# Fetch the page held in `url` (set in an earlier cell) for manual inspection.
response = load_page(url)

In [None]:
def get_content(url):
    """Extract the title and body text of a Brasil Escola article.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, text)`` tuple. ``text`` is the full text of the
        ``div.texto-completo`` element (``""`` when it is missing);
        ``title`` is the text of the article heading (``"No title found"``
        when it is missing). ``("No title found", "")`` is returned when
        the page cannot be loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find('div', class_='texto-completo')
            heading = soup.find('h1', class_='titulo-interna mb-4')
            # A missing element only blanks its own field.
            text = body.text if body is not None else ""
            title = heading.text if heading is not None else "No title found"
            # Return inside the branch: the values only exist when the page
            # loaded, so a failed download falls through to the default.
            return title, text
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""
# Smoke test: print the extracted content of one known article.
print(get_content('https://brasilescola.uol.com.br/matematica/teorema-pitagoras.htm'))

## Quero Bolsa

In [None]:
def get_subject_links(url):
    """Collect the first link inside every ``div.p-2`` of a page.

    Args:
        url: Address of the index page.

    Returns:
        The ``href`` of the first anchor of each matching div that contains
        one. An empty list when the page cannot be loaded or an error
        occurs while parsing (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            urls = []
            for div in soup.find_all('div', class_='p-2'):
                a = div.find('a')
                # A div without an anchor is skipped; previously it raised
                # and threw away every link already collected.
                if a is not None:
                    urls.append(a['href'])
            return urls
    except Exception as e:
        # Message names this function's job (subject links).
        print(f"Error getting subject links from {url}: {e}")
    return []


def get_especific_subject_links(url):
    """Collect the related-article links of a subject page.

    Args:
        url: Address of the subject page.

    Returns:
        The distinct ``href`` values of the anchors with class
        ``layer-artigo--relacionado`` (those also marked ``sec`` come
        first), in first-seen order. An empty list when the page cannot be
        loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            a = soup.find_all('a', class_='layer-artigo--relacionado sec')
            b = soup.find_all('a', class_='layer-artigo--relacionado')
            # The single-class search matches the "sec" anchors as well, so
            # the combined list holds them twice; de-duplicate while
            # keeping order so no page is scraped more than once.
            hrefs = [anchor['href'] for anchor in a + b]
            return list(dict.fromkeys(hrefs))
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_content(url):
    """Extract the title and body text of an article page.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, text)`` tuple. ``text`` is the full text of the
        ``div.texto-completo`` element (``""`` when it is missing);
        ``title`` is the text of the ``h1`` with class
        ``titulo-interna mb-4`` (``"No title found"`` when it is missing).
        ``("No title found", "")`` is returned when the page cannot be
        loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find('div', class_='texto-completo')
            heading = soup.find('h1', class_='titulo-interna mb-4')
            # A missing element only blanks its own field.
            text = body.text if body is not None else ""
            title = heading.text if heading is not None else "No title found"
            # Return inside the branch so a failed download does not hit
            # unbound variables.
            return title, text
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""

In [None]:
def scrape_subjects_to_dataframe(base_url, resume_url=None):
    """Crawl the index page and gather every article into a DataFrame.

    Args:
        base_url: Site root without a trailing slash.
        resume_url: Path of the index page, appended to ``base_url``. When
            omitted, the notebook-level ``resume_url`` variable is used,
            which is what the one-argument form of this function read.

    Returns:
        A ``pandas.DataFrame`` with one row per article and the columns
        ``title``, ``url``, ``content``, ``subject`` and ``sub-subject``.
    """
    from urllib.parse import urljoin

    if resume_url is None:
        # Keep one-argument calls working exactly as before.
        resume_url = globals()["resume_url"]

    data = []
    # Joining against the root resolves hrefs whether or not they start
    # with "/", avoiding a "//" after the domain.
    root = f"{base_url}/"
    url = f"{base_url}{resume_url}"
    print(f"Scraping {resume_url.capitalize()}...")
    subject_links = get_subject_links(url)
    for subject_link in tqdm(subject_links, desc="Subjects"):
        subject_link = urljoin(root, subject_link)
        specific_links = get_especific_subject_links(subject_link)
        for specific_url in tqdm(specific_links, desc="Sub-subjects", leave=False):
            specific_url = urljoin(root, specific_url)
            title, content = get_content(specific_url)
            segments = specific_url.split('/')
            sub_subject = segments[-1]  # last URL segment
            subject = segments[-3]  # third-from-last URL segment
            data.append({
                "title": title,
                "url": specific_url,
                "content": content,
                "subject": subject.capitalize(),
                "sub-subject": sub_subject
            })
            # Progress message every 500 scraped pages.
            if len(data) % 500 == 0:
                print(f"Scraped {len(data)} pages")
    return pd.DataFrame(data)

# Site root (no trailing slash) and the path of the index page.
# NOTE(review): these are the Kuadro values and the Kuadro output file,
# although this cell sits under the "Quero Bolsa" heading - confirm.
base_url = "https://www.kuadro.com.br"
resume_url = "/resumos-enem-vestibulares"

# Run the crawl; `df` gets one row per scraped article.
# NOTE(review): two arguments are passed here - confirm the
# scrape_subjects_to_dataframe definition in scope accepts `resume_url`.
df = scrape_subjects_to_dataframe(base_url, resume_url)

# Persist the result.
df.to_csv("kuadro_resume.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")  # encoding used for the token counts
# Token count per row; str() lets non-string cells be encoded too.
df["token_count"] = df["content"].apply(lambda x: len(tokenizer.encode(str(x))))

# Total number of tokens across all rows.
total_tokens = df['token_count'].sum()


## Quero Bolsa

In [None]:
def get_content(url):
    """Extract the title and section text of a Quero Bolsa lesson page.

    Args:
        url: Address of the lesson page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the first
        ``<h1>`` (``"No title found"`` when there is none); ``text`` is the
        text of every ``<section>`` except the last one, joined with
        newlines. ``("No title found", "")`` is returned when the page
        cannot be loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            sections = soup.find_all('section')
            heading = soup.find('h1')
            title = heading.text if heading is not None else "No title found"
            # The last <section> is deliberately left out of the text.
            text = '\n'.join(section.text for section in sections[:-1])
            # Title first, text second - the same order as the failure
            # default below and as the other get_content variants.
            return title, text
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  # Ensure tqdm is installed or remove if not needed

In [None]:
def load_page(url, timeout=30):
    """Download *url*; return the response, or ``None`` when that fails.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, so a hung connection
            cannot stall the crawl indefinitely.

    Returns:
        The ``requests.Response`` on success; ``None`` on a request error,
        timeout or HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # HTTP error statuses become exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error loading page {url}: {e}")
        return None
    return response

In [None]:
def get_subject_links(url):
    """Collect the category links of the Quero Bolsa index page.

    Args:
        url: Address of the index page.

    Returns:
        The ``href`` of the first anchor of every category card that
        contains one. An empty list when the page cannot be loaded or an
        error occurs while parsing (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Each category is rendered as a card div with this class set.
            divs = soup.find_all('div', class_='z-card categories__card js-category-card z-card--shadow-high')
            urls = []
            for div in divs:
                a = div.find('a')
                # A card without an anchor is skipped; previously it raised
                # and threw away every link already collected.
                if a is not None:
                    urls.append(a['href'])
            return urls
    except Exception as e:
        # Message names this function's job (subject links).
        print(f"Error getting subject links from {url}: {e}")
    return []

def get_especific_subject_links(url):
    """Collect the lesson links listed on a Quero Bolsa category page.

    Args:
        url: Address of the category page.

    Returns:
        The ``href`` of every anchor whose class is
        ``z-link category__lessons-link``. An empty list when the page
        cannot be loaded or an error occurs (the error is printed).
    """
    try:
        page = load_page(url)
        if not page:
            return []
        document = BeautifulSoup(page.content, 'html.parser')
        lesson_hrefs = []
        for anchor in document.find_all('a', class_='z-link category__lessons-link'):
            lesson_hrefs.append(anchor['href'])
        return lesson_hrefs
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_content(url):
    """Extract the title and section text of a Quero Bolsa lesson page.

    Args:
        url: Address of the lesson page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the first
        ``<h1>`` (``"No title found"`` when there is none); ``text`` is the
        text of every ``<section>`` except the last one, joined with
        newlines. ``("No title found", "")`` is returned when the page
        cannot be loaded or an error occurs (the error is printed).
    """
    try:
        response = load_page(url)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            sections = soup.find_all('section')
            heading = soup.find('h1')
            # A missing heading no longer discards the section text.
            title = heading.text if heading is not None else "No title found"
            # The last <section> is deliberately left out of the text.
            text = '\n'.join(section.text for section in sections[:-1])
            return title, text
        # A failed download falls through to the default below instead of
        # touching variables that were never assigned.
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return "No title found", ""


In [None]:
def scrape_subjects_to_dataframe(base_url, resume_url):
    """Crawl the Quero Bolsa index and gather every lesson into a DataFrame.

    Args:
        base_url: Site root; every scraped href is appended directly to it.
        resume_url: Path of the index page, appended to ``base_url``.

    Returns:
        A ``pandas.DataFrame`` with one row per lesson and the columns
        ``title``, ``url``, ``content``, ``subject`` and ``sub-subject``.
    """
    data = []
    url = f"{base_url}{resume_url}"
    print(f"Scraping {resume_url.capitalize()}...")
    for subject_link in tqdm(get_subject_links(url), desc="Subjects"):
        lesson_hrefs = get_especific_subject_links(f"{base_url}{subject_link}")
        for specific_url in tqdm(lesson_hrefs, desc="Sub-subjects", leave=False):
            specific_url = f"{base_url}{specific_url}"
            title, content = get_content(specific_url)
            # The last URL segment names the sub-subject and the one before
            # it names the subject.
            segments = specific_url.split('/')
            record = {
                "title": title,
                "url": specific_url,
                "content": content,
                "subject": segments[-2].capitalize(),
                "sub-subject": segments[-1],
            }
            data.append(record)
            # Progress message every 500 scraped pages.
            if not len(data) % 500:
                print(f"Scraped {len(data)} pages")
    return pd.DataFrame(data)

# Site root (no trailing slash) and the path of the ENEM manual index page.
base_url = "https://querobolsa.com.br"
resume_url = "/enem/manual-do-enem"

# Run the crawl; `df` gets one row per scraped lesson.
df = scrape_subjects_to_dataframe(base_url, resume_url)

# Persist the result.
df.to_csv("quero_bolsa.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")  # encoding used for the token counts
# Token count per lesson; str() lets non-string cells be encoded too.
df["token_count"] = df["content"].apply(lambda x: len(tokenizer.encode(str(x))))

# Total number of tokens across all lessons.
total_tokens = df['token_count'].sum()


In [None]:
# Display the total token count as the cell output.
total_tokens

## Brasil Escola Questions

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  # Ensure tqdm is installed or remove if not needed
def load_page(url, timeout=30):
    """Fetch *url* over HTTP; return the response, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without it one
            unresponsive page would freeze the whole crawl.

    Returns:
        The ``requests.Response`` on success; ``None`` on a request error,
        timeout or HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # HTTP error statuses become exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error loading page {url}: {e}")
        return None
    return response

In [None]:
def get_especific_subject_questions_links(url):
    """Collect the exercise-list links of a subject page.

    Args:
        url: Address of the subject's exercise index.

    Returns:
        The ``href`` of the first anchor of every ``div.single-exerc`` that
        contains one. An empty list when the page cannot be loaded or an
        error occurs (the error is printed).
    """
    try:
        page = load_page(url)
        if not page:
            return []
        document = BeautifulSoup(page.content, 'html.parser')
        exercise_hrefs = []
        for box in document.find_all('div', class_='single-exerc'):
            anchor = box.find('a')
            # Boxes without an anchor are ignored.
            if anchor:
                exercise_hrefs.append(anchor['href'])
        return exercise_hrefs
    except Exception as e:
        print(f"Error getting specific subject links from {url}: {e}")
    return []

def get_questions(url):
    """Extract every question of an exercise-list page.

    Args:
        url: Address of the exercise list. Its second-to-last path segment
            is recorded as the subject.

    Returns:
        A list of dicts with the keys ``question``, ``answer``,
        ``answer_text``, ``subject`` and ``title`` - one per question box
        that could be parsed. An empty list is returned when the page
        cannot be loaded or an error occurs (the error is printed), so the
        result can always be concatenated to a list of records.
    """
    try:
        subject = url.split('/')[-2]

        response = load_page(url)
        if not response:
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        questions = soup.find_all('div', class_='question-box')
        title = soup.find('h1').text
        questions_list = []
        for question in questions:
            try:
                # The statement is spread over the <p> tags of the
                # question-text div; join them with newlines.
                question_text = '\n'.join(
                    p.text for p in question.find('div', class_='question-text').find_all('p')
                )
                answer_text = question.find('div', class_='answer-text').text
            except Exception as e:
                # Best effort: skip a malformed question, keep the rest.
                print(f"Error getting question from {url}: {e}")
                continue

            # The answer letter is read from character index 7 of the answer
            # text (presumably right after a fixed prefix - TODO confirm).
            # Slicing instead of indexing yields "" for shorter texts
            # rather than raising.
            letter = answer_text[7:8]
            answer_item = letter if letter in ('A', 'B', 'C', 'D', 'E') else ""

            questions_list.append({
                "question": question_text,
                "answer": answer_item,
                "answer_text": answer_text,
                "subject": subject,
                "title": title
            })
        return questions_list
    except Exception as e:
        print(f"Error getting content from {url}: {e}")
    return []

In [None]:
def scrape_questions_to_dataframe(subjects_links):
    """Scrape every question of every subject index into a DataFrame.

    Args:
        subjects_links: Sequence of subject index URLs.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped question; empty
        when ``subjects_links`` is empty or nothing could be scraped.
    """
    data = []
    if not subjects_links:
        # Nothing to crawl; also avoids indexing an empty sequence below.
        return pd.DataFrame(data)

    # Third "/"-separated piece of an absolute URL is its host name.
    print(f"Scraping {subjects_links[0].split('/')[2]}...")

    report_every = 500
    next_report = report_every
    for subject_link in tqdm(subjects_links, desc="Subjects"):
        specific_links = get_especific_subject_questions_links(subject_link)
        for specific_url in tqdm(specific_links, desc="Sub-subjects", leave=False):
            questions = get_questions(specific_url)
            # Keep only question records (dicts); anything else is a
            # failure sentinel and would corrupt the DataFrame.
            data += [item for item in questions if isinstance(item, dict)]

            # Several questions arrive per page, so report when a multiple
            # of `report_every` is crossed rather than hit exactly.
            if len(data) >= next_report:
                print(f"Scraped {len(data)} questions")
                next_report = (len(data) // report_every + 1) * report_every
    return pd.DataFrame(data)

# Subject slugs of the exercise site and the index URL built from each one.
subjects_list = ["exercicios-geografia", "exercicios-geografia-do-brasil","exercicios-historia-do-brasil","exercicios-historia","exercicios-historia-da-america", "exercicios-literatura", "exercicios-redacao", "exercicios-gramatica", "exercicios-biologia", "exercicios-fisica", "exercicios-matematica", "exercicios-quimica", "exercicios-sociologia", "exercicios-ingles", "exercicios-filosofia"]
subjects_links = [f"https://exercicios.brasilescola.uol.com.br/{subject}" for subject in subjects_list]

# Run the crawl; `df` gets one row per scraped question.
df = scrape_questions_to_dataframe(subjects_links)

# Persist the result.
df.to_csv("brasil_escola_questions.csv", index=False)

# Quick sanity check: show the first rows.
print(df.head())

In [None]:
# Plot a histogram of the number of questions per subject.
import matplotlib.pyplot as plt
import seaborn as sns

# Short subject label: the slug is split on "-" and only its second token is
# kept (e.g. "exercicios-historia-do-brasil" -> "historia").
# NOTE(review): the slice [1:2] merges slugs that share a second token;
# use [1:] if every word after the first should be kept - confirm intent.
df['subject_1'] = df['subject'].apply(lambda x: ' '.join(x.split('-')[1:2]))
plt.figure(figsize=(10, 6))
# Tick labels are rotated below because the subject names are long.
sns.histplot(df, x='subject_1', kde=True)
plt.xticks(rotation=45)
plt.show()
# Alternative kept for reference: plot the raw slug instead.
# sns.histplot(df, x='subject', kde=True)

In [None]:
# Exploratory check: parse a single exercise list to inspect the result.
questions = get_questions('https://exercicios.brasilescola.uol.com.br/exercicios-geografia/enem-lista-de-exercicios-sobre-crescimento-populacional-e-teorias-demograficas.htm')

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")  # encoding used for the token counts
# Token counts of the question statement and of the answer text, per row;
# str() lets non-string cells be encoded too.
df["token_question"] = df["question"].apply(lambda x: len(tokenizer.encode(str(x))))
df["token_answer"] = df["answer_text"].apply(lambda x: len(tokenizer.encode(str(x))))
# Total number of tokens: questions plus answers.
total_tokens = df['token_question'].sum() + df['token_answer'].sum()
