## Web Scraping - grupos facebook

In [4]:
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from scrapy.selector import Selector
from time import sleep

class GroupsSearch():
    name = "groups-search"

    def __init__(self, is_headless=True, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if is_headless:
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # Run Chrome in headless mode
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        else:
            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

        with open("config.json") as f:
            self.config = json.load(f)

        self.groups = []

    def login_facebook(self):
        """Log in to Facebook and store cookies for subsequent requests"""
        self.driver.get("https://www.facebook.com/")
        sleep(1)  # Give enough time for the page to load

        # Fill in email and password fields, then log in
        email_input = self.driver.find_element(By.NAME, "email")
        password_input = self.driver.find_element(By.NAME, "pass")

        email_input.send_keys(self.config.get("credentials").get("email"))
        sleep(1)  # Slow down the typing
        password_input.send_keys(self.config.get("credentials").get("password"))

        # Click the login button
        login_button = self.driver.find_element(By.NAME, "login")
        sleep(1)
        login_button.click()

        sleep(1)  # Wait for the login to complete

        # Store the cookies after login
        self.cookies = self.driver.get_cookies()

    def start_requests(self):
        """Initial entry point for Scrapy"""
        max_attempts = 3
        
        # First, log in to Facebook
        self.login_facebook()

        # Once logged in, start processing the tag names from the config file
        for site in self.config["tags"]:
            attempts = 0
            success = False
            tagname = site["tagname"]
            articles_limit = site["articles_limit"]
            
            # Perform a search for each tagname in the input field
            self.search_tag(tagname)
            
            # After search, look for "Grupos" and click on "Ver tudo"
            while attempts < max_attempts: # quebra as vezes, tenta de novo
                try:
                    self.click_see_all_groups()
                    success = True  # Se o clique for bem-sucedido, marcar como sucesso
                    break
                except Exception as e:
                    attempts += 1
                    print(f'Falha em encontrar o botão "ver tudo" (grupos) para a tag {tagname}. Tentativa {attempts}')

            if success:
                # Proceed to scrape articles from the groups page
                self.parse_groups_page(articles_limit)
                self.save_groups(tagname)
                self.save_groups(tagname, txt=True)
            self.groups = []

            # Go back to root before processing the next tagname
            self.go_back_to_root()
        
    def go_to_groups_page(self, tagname):
        if not tagname:
            tagname = self.config["tags"][0]["tagname"]
        
        self.search_tag(tagname)
        self.click_see_all_groups()

    def search_tag(self, tagname):
        """Search for a tagname in the combobox input field"""
        # Find the input field with role="combobox"
        search_input = self.driver.find_element(By.XPATH, '//input[@role="combobox"]')
        
        # Clear the input field and insert the tagname
        search_input.clear()
        sleep(1)  # Slow down to avoid detection
        search_input.send_keys(tagname)
        sleep(1)  # Give time for suggestions or auto-complete

        # Simulate pressing Enter
        search_input.send_keys(u'\ue007')
        sleep(2)  # Wait for the search results to load

    def click_see_all_groups(self):
        """Click the 'Ver tudo' link inside the article with 'Grupos' in the feed"""
        # Locate the feed div
        feed_div = self.driver.find_element(By.XPATH, '//div[@role="feed"]')
        sleep(1)

        # Locate the div with role="article" that contains the span with text "Grupos"
        article_div = feed_div.find_element(By.XPATH, './/div[@role="article" and .//span[text()="Grupos"]]')
        sleep(1)

        # Inside the article div, find the first 'a' with role="link" and aria-label="Ver tudo"
        see_all_link = article_div.find_element(By.XPATH, './/a[@role="link" and @aria-label="Ver tudo"]')
        sleep(1)

        # Click the "Ver tudo" link
        see_all_link.click()
        sleep(2)  # Wait for the new page to load

    def go_back_to_root(self):
        """Click the link to return to the root page"""
        root_link = self.driver.find_element(By.XPATH, '//a[@role="link" and @href="/"]')
        sleep(1)  # Slow down before clicking
        root_link.click()
        sleep(2)  # Wait for the root page to load

    def parse_groups_page(self, articles_limit=10):
        """Parse the response for articles on the groups page"""
        sel = Selector(text=self.driver.page_source)
        self.articles = sel.xpath('//div[@role="feed"]//div[@role="article"]')
        count = 0

        print(f"Found {len(self.articles)} articles on the page, looking for {articles_limit} articles")
        # for each batch of 20 articles, scroll down to load more articles
        scroll_limit = articles_limit // 2
        while len(self.articles) < articles_limit:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(1)
            sel = Selector(text=self.driver.page_source)
            self.articles = sel.xpath('//div[@role="feed"]//div[@role="article"]')
            print(f"Found {len(self.articles)} articles on the page, looking for {articles_limit} articles")
            scroll_limit -= 1
            if scroll_limit == 0:
                print("Reached the end of the page, stopping the search")
                break

        for article in self.articles:
            if count >= articles_limit:
                break
            self.groups.append(self.unlock_group(count))
            count += 1
    
    def unlock_group(self, count):
        article = self.articles[count]
        groupname = article.xpath('.//a[@role="presentation"]//text()').get()
        print(f"Group name: {groupname}")
        grouplink = article.xpath('.//a[@role="presentation"]/@href').get()
        print(f"Group link: {grouplink}")
        
        # Store or yield the data   
        return {
            'groupname': groupname,
            'grouplink': grouplink
        }
    
    def save_groups(self, tagname, txt=False):
        folder = "./data/"
        firstime = True
        if txt:
            with open(f'{folder}txt/grupos_{tagname}.txt', 'w') as f:
                for group in self.groups:
                    if firstime:
                        f.write(f"{group['grouplink']}")
                    else:
                        f.write(f"\n{group['grouplink']}")
        else:
            with open(f'{folder}json/grupos_{tagname}.json', 'w') as f:
                json.dump(self.groups, f)

    def closed(self, reason):
        """Cleanup the driver when the spider is closed"""
        self.driver.quit()

In [5]:
IS_HEADLESS = False

In [6]:
gs = GroupsSearch(IS_HEADLESS)
gs.start_requests()

Falha em encontrar o botão "ver tudo" (grupos) para a tag floriano. Tentativa 1
Falha em encontrar o botão "ver tudo" (grupos) para a tag floriano. Tentativa 2
Falha em encontrar o botão "ver tudo" (grupos) para a tag floriano. Tentativa 3
Found 9 articles on the page, looking for 40 articles
Found 18 articles on the page, looking for 40 articles
Found 19 articles on the page, looking for 40 articles
Found 28 articles on the page, looking for 40 articles
Found 29 articles on the page, looking for 40 articles
Found 38 articles on the page, looking for 40 articles
Found 39 articles on the page, looking for 40 articles
Found 48 articles on the page, looking for 40 articles
Group name: BOM NEGÓCIO trocas,compras e vendas PICOS-PI
Group link: https://web.facebook.com/groups/2207423832856332/
Group name: Compra e Venda Picos-PI
Group link: https://web.facebook.com/groups/306983099399705/
Group name: Feirão do Face Picos-Pi
Group link: https://web.facebook.com/groups/3033157243647511/
Group n

## Web Scraping - posts grupos facebook

## Disponibilização google cloud

In [17]:
from google.cloud import storage
import os

def combine_txt_files(local_folder, combined_file_path):
    with open(combined_file_path, 'w') as combined_file:
        # Iterar sobre arquivos na pasta local
        unique_links = []
        first = True
        for filename in os.listdir(local_folder):
            if filename.endswith('.txt'):  # Filtrar apenas arquivos .txt
                file_path = os.path.join(local_folder, filename)
                firstime = True if not first else False
                first = False
                # Ler o conteúdo de cada arquivo .txt
                with open(file_path, 'r') as f:
                    for line in f:
                        if line not in unique_links:
                            unique_links.append(line)
                            if firstime:
                                firstime = False
                                combined_file.write("\n" + line)
                            combined_file.write(line)
                    #combined_file.write(f.read() + '\n')  # Adicionar quebra de linha entre os arquivos
                print(f'Adicionado {filename} ao arquivo combinado.')

def upload_file_to_gcs(file_path, bucket_name, destination_blob_name):
    # Instanciar cliente Google Cloud Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'personal-portfolio-396715-c502ea581c84.json'
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Criar blob no bucket para o arquivo
    blob = bucket.blob(destination_blob_name)

    # Fazer upload do arquivo combinado
    blob.upload_from_filename(file_path)
    print(f'Uploaded {destination_blob_name} to {bucket_name}')

# Função principal que combina arquivos e faz upload
def combine_and_upload_txt_files(local_folder, bucket_name, destination_blob_name):
    combined_file_path = os.path.join(f'{local_folder}/temp', 'compilado.txt')

    # Combinar todos os arquivos .txt
    combine_txt_files(local_folder, combined_file_path)

    # Fazer upload do arquivo combinado para o Google Cloud Storage
    upload_file_to_gcs(combined_file_path, bucket_name, destination_blob_name)

    # Opcionalmente, remover o arquivo combinado localmente
    os.remove(combined_file_path)
    print(f'Arquivo combinado removido localmente: {combined_file_path}')


In [18]:
# Definir a pasta local e o nome do bucket
local_folder = "data/txt"
bucket_name = "disponibilizacao_dados"
destination_blob_name = "compilado_grupos_facebook.txt"

# Chamar a função para fazer o upload
combine_and_upload_txt_files(local_folder, bucket_name, destination_blob_name)


Adicionado grupos_the.txt ao arquivo combinado.
Adicionado grupos_Poti Velho.txt ao arquivo combinado.
Adicionado grupos_Angelim.txt ao arquivo combinado.
Adicionado grupos_Campo Maior.txt ao arquivo combinado.
Adicionado grupos_picos.txt ao arquivo combinado.
Adicionado grupos_luis correia.txt ao arquivo combinado.
Adicionado grupos_phb.txt ao arquivo combinado.
Adicionado grupos_piaui.txt ao arquivo combinado.
Adicionado grupos_Lourival Parente.txt ao arquivo combinado.
Adicionado grupos_Bom Jesus.txt ao arquivo combinado.
Adicionado grupos_teresina.txt ao arquivo combinado.
Adicionado grupos_santa maria da codipi.txt ao arquivo combinado.
Adicionado grupos_Piripiri.txt ao arquivo combinado.
Adicionado grupos_Alto Alegre.txt ao arquivo combinado.
Adicionado grupos_Vale do Gavião.txt ao arquivo combinado.
Adicionado grupos_Gurupi.txt ao arquivo combinado.
Adicionado grupos_parnaiba.txt ao arquivo combinado.
Adicionado grupos_Promorar.txt ao arquivo combinado.
Adicionado grupos_Macaúba