# G1 scraping
This notebook builds a scraper for the news published on https://g1.globo.com/

## Setup

In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
from time import sleep
from selenium.webdriver.common.by import By
import pandas as pd
from datetime import datetime, timedelta

# Path to geckodriver executable
# NOTE(review): machine-specific location (looks like a snap-packaged Firefox) —
# adjust for the environment where the notebook runs.
geckodriver_path = '/snap/bin/firefox.geckodriver'


# Service object that tells Selenium which geckodriver binary to launch
s = Service(executable_path=geckodriver_path)
# Create a Firefox webdriver instance
# opens a window
# `driver` is reused by every later cell that talks to the browser.
driver = webdriver.Firefox(service=s)

## Start

In [2]:
# Load the G1 home page in the browser window controlled by `driver`
driver.get('https://g1.globo.com/')

In [3]:
# Scrolling the page so that the feed loads more news before scraping.
#
# Each round scrolls to the bottom and compares the page height with the
# previous one. If the height did not change, the feed did not grow on its own,
# so the 'Veja mais' button is clicked when it is present.

# number of scroll rounds to perform
scrolls = 10

current_height = driver.execute_script("return document.body.scrollHeight")

for i in range(scrolls):
    # scroll to the end of the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    sleep(1)
    new_height = driver.execute_script("return document.body.scrollHeight")

    # if the page did not autoscroll (height unchanged)
    if new_height == current_height:
        # find_elements returns an empty list when nothing matches, so a
        # missing 'Veja mais' button no longer aborts the whole loop
        load_more = driver.find_elements(By.CSS_SELECTOR, value='.load-more > a:nth-child(1)')
        if load_more:
            # click the 'Veja mais' button
            load_more[0].click()

            # repeat the scroll
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        current_height = driver.execute_script("return document.body.scrollHeight")
    else:
        current_height = new_height

    print(f'Scroll {i + 1}, height={current_height}')

    # time for the page to load
    sleep(4)

Scroll 1, height=7981
Scroll 2, height=9601
Scroll 3, height=13262
Scroll 4, height=12258
Scroll 5, height=11756
Scroll 6, height=15138
Scroll 7, height=14752
Scroll 8, height=17765
Scroll 9, height=17325
Scroll 10, height=20758


## Scraping the data

In [4]:
# Snapshot of the page's HTML as currently held by the browser (taken after the scrolling cell)
html_source = driver.page_source

In [5]:
# Parse the HTML snapshot with BeautifulSoup using the lxml parser
soup = BeautifulSoup(html_source, 'lxml')

In [6]:
# Collect every <div class="_evg"> in the page.
# NOTE(review): '_evg' presumably marks the feed containers and looks
# auto-generated — confirm it is stable across site updates.
content_blocks = soup.find_all('div', class_ = '_evg')
# Displayed as the cell output: how many blocks were found
len(content_blocks)

10

In [7]:
# Sanity check: print how many 'feed-post-body' divs each content block holds
for block in content_blocks:
    news_list = block.find_all('div', class_ = 'feed-post-body')
    
    print(len(news_list))

0
0
49
8
9
5
8
6
8
5


In [8]:
def _text_or_none(tag):
    """Return the text of a BeautifulSoup lookup result, or None if nothing was found."""
    return None if tag is None else tag.text


# One list per output column; every news card appends exactly one entry to
# each list so the columns stay aligned.
titles = []
times = []
themes = []
headers = []
resumes = []

for block in content_blocks:
    news_list = block.find_all('div', class_ = 'feed-post-body')
    for news in news_list:
        # The title is now guarded like every other field: a card without the
        # expected link yields None instead of raising AttributeError.
        # NOTE(review): a multi-class string only matches that exact class
        # attribute value, so this selector is fragile — confirm against the
        # live markup.
        titles.append(_text_or_none(
            news.find('a', class_ = 'feed-post-link gui-color-primary gui-color-hover')
        ))

        headers.append(_text_or_none(news.find('span', 'feed-post-header-chapeu')))

        times.append(_text_or_none(news.find('span', 'feed-post-datetime')))

        themes.append(_text_or_none(news.find('span', 'feed-post-metadata-section')))

        resumes.append(_text_or_none(news.find('div', class_='feed-post-body-resumo')))


# Assemble the scraped fields into a single table, one row per news card
data = pd.DataFrame({
    'Title': titles,
    'Time': times,
    'Theme': themes,
    'Header': headers,
    'Resume': resumes
})

data
        

Unnamed: 0,Title,Time,Section,Header,Resume
0,PF descobre dispositivos ilegais para roubar d...,Há 2 horas,Política,Sistema clandestino,Varredura encontrou instalações clandestinas e...
1,'Abin paralela' de Bolsonaro espionou ministro...,Há 3 horas,Política,Monitoramento ilegal,
2,VALDO: áudio de Ramagem e Bolsonaro sobre Fláv...,Há 2 horas,Blog do Valdo Cruz,,
3,"Carne, remédios, cerveja, carro: o que pode mu...",Há 8 horas,Economia,Pagamento de impostos,Deputados decidiram que armas não terão tribut...
4,Reforma prevê imposto zero para quase 400 remé...,Há 8 horas,Saúde,De viagra a remédio para asma,
...,...,...,...,...,...
93,Notícias no seu celular: entre no canal do g1 ...,Há 2 meses,,Sempre ligado,
94,Câmara aprova e envia para sanção projeto de m...,Há 30 minutos,Política,,"Texto já havia sido aprovado na Casa, mas volt..."
95,Arroz e ovo quase todo dia: alunos de escolas ...,Há 44 minutos,Rio de Janeiro,,"Em uma das escolas do município da Baixada, de..."
96,Biden dá entrevista nos EUA em meio a pressão ...,Há 60 minutos,Eleições nos EUA 2024,,


In [10]:
def convert_to_datetime(time_str, now=None):
    """Convert a relative timestamp such as "Há 2 horas" into a datetime.

    The first "<number> <unit>" pair found in the text is used. Recognised
    units (singular or plural): minuto, hora, dia, semana, mês/meses, ano.
    Months and years are approximated as 30 and 365 days respectively,
    because timedelta has no calendar-aware month/year support.

    Parameters
    ----------
    time_str : str or None
        Relative time text. Anything that is not a string (None, NaN, ...)
        yields None instead of raising.
    now : datetime, optional
        Reference instant to subtract from. Defaults to ``datetime.now()``;
        pass a fixed value to convert a whole column against the same instant.

    Returns
    -------
    datetime or None
        The estimated absolute time, or None when the text has no
        recognisable "<number> <unit>" pair.
    """
    import re  # local import keeps the function self-contained

    # Missing values (cards without a timestamp) cannot be parsed
    if not isinstance(time_str, str):
        return None

    # Search instead of splitting on spaces so that leading whitespace or a
    # non-numeric second word does not raise ValueError
    match = re.search(r'(\d+)\s*([^\W\d_]+)', time_str)
    if match is None:
        return None

    amount = int(match.group(1))
    unit = match.group(2).lower()

    if unit.startswith('minuto'):
        delta = timedelta(minutes=amount)
    elif unit.startswith('hora'):
        delta = timedelta(hours=amount)
    elif unit.startswith('dia'):
        delta = timedelta(days=amount)
    elif unit.startswith('semana'):
        delta = timedelta(weeks=amount)
    elif unit.startswith(('mês', 'mes')):
        # approximation: one month counted as 30 days
        delta = timedelta(days=30 * amount)
    elif unit.startswith('ano'):
        # approximation: one year counted as 365 days
        delta = timedelta(days=365 * amount)
    else:
        return None  # If the unit is not recognised, return None

    reference = datetime.now() if now is None else now
    return reference - delta

# Turn each relative timestamp in the 'Time' column into an absolute datetime
time_column = data['Time']
converted_dates = time_column.apply(convert_to_datetime)

# Show the resulting series
print(converted_dates)


0    2024-07-11 18:13:01.312797
1    2024-07-11 17:13:01.312808
2    2024-07-11 18:13:01.312810
3    2024-07-11 12:13:01.312812
4    2024-07-11 12:13:01.312814
                ...            
93   2024-05-12 20:13:01.312981
94   2024-07-11 19:43:01.312983
95   2024-07-11 19:29:01.312985
96   2024-07-11 19:13:01.312987
97   2024-07-11 19:13:01.312988
Name: Time, Length: 98, dtype: datetime64[ns]
