In [1]:
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.b9.com.br'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

def get_soup(url):
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    response.encoding = 'utf-8'#test
    return BeautifulSoup(response.text, 'html.parser')

def extract_references(post_url):
    soup = get_soup(post_url)
    references_section = soup.find('p', text=lambda x: x and 'REFERÊNCIAS' in x)
    if not references_section:
        return []
    
    references = []
    for sibling in references_section.find_next_siblings():
        if sibling.get_text(strip=True) == '========':
            break
        references.append(sibling.get_text(strip=True))
    
    return references

In [2]:
# Test the function
post_url = 'https://www.b9.com.br/shows/naruhodo/naruhodo-418-o-que-e-a-birra/?highlight=naruhodo'
references = extract_references(post_url)
print(references)

['Assessment, management, and prevention of childhood temper tantrumshttps://journals.lww.com/jaanp/abstract/2012/10000/assessment,_management,_and_prevention_of.2.aspx', 'Temper Tantrums in Young Children: 2. Tantrum Duration and Temporal Organizationhttps://journals.lww.com/jrnldbp/fulltext/2003/06000/temper_tantrums_in_young_children__2__tantrum.3.aspx?casa_token=XT0dxgcDQJMAAAAA:KXBH6vF25IZT4vBlzGF3SysfHTm6XlWlcOFuAp_pcIfqXl2s_-yU_6pvKirSKoFbV8Y7jLlaqqq8zdLWV0W4NmaXTw', 'Temper Tantrums in Young Children: 1. Behavioral Compositionhttps://journals.lww.com/jrnldbp/fulltext/2003/06000/temper_tantrums_in_young_children__1__behavioral.2.aspx?casa_token=86hhrSeXMh0AAAAA:ZEF3NP81tjsathb5NVrGbcc08KdVqBjLNRBGr4pwZAkkRZszvPoUyZuTzdnwyRjirZ_ejI11i9YDHUVa3uNK1EAEOg', 'Meltdown/Tantrum Detection System for Individuals with Autism Spectrum Disorderhttps://www.tandfonline.com/doi/full/10.1080/08839514.2021.1991115', 'Developmental pathways from preschool temper tantrums to later psychopathologyht

In [4]:
import time
import csv

SEARCH_URL = 'https://www.b9.com.br/?s=naruhodo&pagina={}'

def get_podcast_posts(page_number):
    soup = get_soup(SEARCH_URL.format(page_number))
    return [a['href'] for a in soup.select('a.c-post-card__link')]

def scrape_references():
    all_references = []
    for page in range(1,2):  # Loop through all pages
        print(f'Scraping page {page}...')
        post_links = get_podcast_posts(page)
        for post_link in post_links:
            print(f'Scraping post {post_link}...')
            references = extract_references(post_link)
            all_references.append([post_link] + references)
            time.sleep(1)  # Be polite and don't overwhelm the server
    return all_references

def save_to_csv(data, filename='references.csv'):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for row in data:
            writer.writerow(row)

if __name__ == "__main__":
    references = scrape_references()
    save_to_csv(references)
    print("Data has been saved to references.csv")

Scraping page 1...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-425-o-que-e-competitividade-parte-2-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-424-o-que-e-competitividade-parte-1-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-422-criancas-acreditam-em-contos-de-fadas/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-421-por-que-guardamos-segredos/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-420-maconha-faz-mal-parte-2-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-419-maconha-faz-mal-parte-1-de-2/?highlight=naruhodo...
Data has been saved to references.csv


In [5]:
import pandas as pd

# Step 1: Read and Process the CSV File
file_path = 'references.csv'
data = pd.read_csv(file_path, header=None, sep=',')

# Adjust column names to set the first column as 'Episode'
data.columns = [f'Reference_{i}' if i > 1 else 'Episode' for i in range(1, len(data.columns) + 1)]

# Set the first column ('Episode') as the index
data.set_index('Episode', inplace=True)

# Transpose the DataFrame to make episodes as columns
data_transposed = data.transpose()

# Fill empty values with an empty string
data_transposed.fillna(value='', inplace=True)

# Export the processed DataFrame to a CSV file
export_path = 'path_to_your_file/processed_references.csv'
data_transposed.to_csv(export_path, index=False)


ParserError: Error tokenizing data. C error: Expected 19 fields in line 4, saw 25


In [16]:
from neo4j import GraphDatabase
import csv

# Neo4j connection details
uri = "bolt://localhost:7687"
username = "neo4j"
password = "senha123"

# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(username, password))

def load_data(filename='references.csv'):
    data = []
    with open(filename, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append(row)
    return data

def create_graph(tx, data):
    for row in data:
        episode = row[0]
        references = row[1:]
        
        # Create or merge the episode node
        tx.run("MERGE (e:Episode {url: $episode})", episode=episode)
        
        for ref in references:
            # Create or merge the reference node
            tx.run("MERGE (r:Reference {url: $ref})", ref=ref)
            # Create the relationship
            tx.run("""
                MATCH (e:Episode {url: $episode})
                MATCH (r:Reference {url: $ref})
                MERGE (e)-[:REFERENCES]->(r)
            """, episode=episode, ref=ref)

def main():
    data = load_data()
    with driver.session() as session:
        session.write_transaction(create_graph, data)
    print("Data has been imported into Neo4j")

if __name__ == "__main__":
    main()


  session.write_transaction(create_graph, data)


Data has been imported into Neo4j
