<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from neo4j import GraphDatabase

In [2]:
# Phase 1: Web Scraping Functions

# Search-results URL template. The `{}` placeholder receives the results page
# number via SEARCH_URL.format(page_number) in get_podcast_posts().
SEARCH_URL = 'https://www.b9.com.br/?s=naruhodo&pagina={}'

def get_soup(url, timeout=10):
    """
    Fetches the webpage content and returns the BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Maximum number of seconds to wait for the server. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup document parsed with 'html.parser' when the server
        answers with HTTP 200. None when the request fails (network error,
        timeout) or the status code is anything else; a message is printed
        in both failure cases.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "return None"
        # contract as a bad status code, so one flaky page does not abort
        # a long scraping run.
        print(f"Failed to retrieve the page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

def extract_references(post_url):
    """
    Extracts episode name, URL, and references from a given podcast episode URL.

    Args:
        post_url: URL of the episode page to scrape.

    Returns:
        A dict with three keys:
          - 'Episode Name': value of the `data-name` attribute of the first
            `span.c-play-btn`, or 'Unknown Episode' when it is not found.
          - 'Episode URL': `post_url`, unchanged.
          - 'References': list of {'name': ..., 'url': ...} dicts; empty when
            the page could not be fetched or has no 'REFERÊNCIAS' paragraph.
    """
    soup = get_soup(post_url)
    if soup is None:
        return {'Episode Name': 'Unknown Episode', 'Episode URL': post_url, 'References': []}

    play_button = soup.find('span', {'class': 'c-play-btn'})
    if play_button is not None:
        episode_name_text = play_button.get('data-name', 'Unknown Episode')
    else:
        episode_name_text = 'Unknown Episode'

    # `string=` replaces the deprecated `text=` keyword; it matches a <p>
    # whose own string contains the section heading.
    references_section = soup.find('p', string=lambda x: x and 'REFERÊNCIAS' in x)
    if references_section is None:
        return {'Episode Name': episode_name_text, 'Episode URL': post_url, 'References': []}

    references = []
    for sibling in references_section.find_next_siblings():
        # A line made only of '=' characters marks the end of the references.
        if sibling.get_text(strip=True) == '========':
            break
        # Only the first link inside each sibling element is considered.
        link = sibling.find('a', href=True)
        if link:
            # The reference name is the element's text with the link text removed.
            reference_name = sibling.get_text(strip=True).replace(link.get_text(strip=True), '').strip()
            reference_url = link['href']
            references.append({'name': reference_name, 'url': reference_url})

    return {'Episode Name': episode_name_text, 'Episode URL': post_url, 'References': references}

def get_podcast_posts(page_number):
    """
    Extracts podcast post links from the search results page.

    Args:
        page_number: Results page to request; it is substituted into SEARCH_URL.

    Returns:
        A list with the `href` of every `a.c-post-card__link` element on the
        page. The list is empty when the page could not be retrieved.
    """
    soup = get_soup(SEARCH_URL.format(page_number))
    if soup is None:
        # get_soup() signals failure with None; calling .select() on it
        # would raise AttributeError and abort the whole scrape.
        return []
    # Skip anchors that have no href instead of raising KeyError.
    return [a['href'] for a in soup.select('a.c-post-card__link') if a.has_attr('href')]

def scrape_references(first_page=1, last_page=17, delay=1):
    """
    Scrapes all episodes and their references, organizing the data for further use.

    Args:
        first_page: First search-results page to visit (inclusive).
        last_page: Last search-results page to visit (inclusive). Nothing is
            scraped when it is smaller than `first_page`.
        delay: Seconds to sleep after each episode page (polite scraping).

    Returns:
        A list with one dict per episode. Each dict has the keys 'Episode' and
        'Episode URL' plus, for the i-th reference (1-based), the keys
        'Reference{i}' (name) and 'Reference{i}URL' (link).
    """
    all_references = []

    for page in range(first_page, last_page + 1):
        print(f'Scraping page {page}...')
        post_links = get_podcast_posts(page)

        for post_link in post_links:
            print(f'Scraping post {post_link}...')
            episode_data = extract_references(post_link)
            references_dict = {
                'Episode': episode_data['Episode Name'],
                'Episode URL': episode_data['Episode URL']
            }

            # Flatten the references into numbered columns so each episode
            # becomes a single wide record.
            for i, ref in enumerate(episode_data['References'], start=1):
                references_dict[f'Reference{i}'] = ref['name']
                references_dict[f'Reference{i}URL'] = ref['url']

            all_references.append(references_dict)
            time.sleep(delay)  # Polite scraping

    return all_references

In [3]:
# Test the function
# Smoke test: scrape one known episode page and print the returned dict
# ('Episode Name', 'Episode URL' and the 'References' list).
post_url = 'https://www.b9.com.br/shows/naruhodo/naruhodo-196-por-que-colecionamos-coisas/?highlight=naruhodo'
references = extract_references(post_url)
print(references)

{'Episode Name': 'Naruhodo #196 – Por que colecionamos coisas?', 'Episode URL': 'https://www.b9.com.br/shows/naruhodo/naruhodo-196-por-que-colecionamos-coisas/?highlight=naruhodo', 'References': [{'name': 'The Influence of Initial Possession Level on Consumers’ Adoption of a Collection Goal: A Tipping Point Effect (2014)', 'url': 'https://journals.sagepub.com/doi/pdf/10.1509/jm.13.0475'}, {'name': 'Wanting Ever More: Acquisition Procedure Motivates Continued Reward Acquisition', 'url': 'http://academic.oup.com.sci-hub.tw/jcr/article-abstract/43/2/230/2572282'}, {'name': 'Experimental Tests of the Endowment Effect and the Coase Theorem', 'url': 'http://www.journals.uchicago.edu.sci-hub.tw/doi/10.1086/261737'}, {'name': 'The Neural Basis of Financial Risk Taking', 'url': 'https://www.cell.com/neuron/fulltext/S0896-6273(05)00657-4?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0896627305006574%3Fshowall%3Dtrue'}, {'name': 'Neurofinance', 'url': 'https://journals.sage

In [4]:
# Phase 2: Data Storage Functions

def save_to_dataframe(data):
    """
    Builds a Pandas DataFrame from the scraped records.

    When `data` is falsy (for example an empty list or None) a notice is
    printed and an empty DataFrame is returned instead.
    """
    if not data:
        print("No data to save to DataFrame.")
        return pd.DataFrame()

    return pd.DataFrame(data)

In [5]:
# Phase 3: Neo4j Graph Database Functions

class Neo4jHandler:
    """
    Small wrapper around a Neo4j driver used to load scraped episodes and
    their references into the graph.

    Instances can be used as context managers (`with Neo4jHandler(...) as h:`)
    so the driver is closed even if an import step raises.
    """

    def __init__(self, uri, username, password):
        """
        Creates the driver for the given Neo4j instance.

        Args:
            uri: Connection URI passed to GraphDatabase.driver.
            username: User name used for authentication.
            password: Password used for authentication.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password))

    def __enter__(self):
        """Returns the handler itself for use in a `with` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes the driver on leaving the `with` block; never suppresses exceptions."""
        self.close()
        return False

    def clear_database(self):
        """
        Clears all nodes and relationships in the Neo4j database.
        """
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
        print("Database cleared.")

    @staticmethod
    def _reference_columns(columns):
        """
        Finds the reference column pairs present in `columns`.

        Returns a list of (name_column, url_column) tuples, ordered by
        reference number, for every 'Reference<i>' column (i >= 1, written
        without leading zeros) that has a matching 'Reference<i>URL' column.
        """
        available = set(columns)
        numbered = []
        for column in columns:
            if not isinstance(column, str) or not column.startswith('Reference'):
                continue
            suffix = column[len('Reference'):]
            try:
                number = int(suffix)
            except ValueError:
                # Not a numbered name column (e.g. 'Reference1URL').
                continue
            # Accept only the canonical spelling produced by f'Reference{i}'.
            if number < 1 or str(number) != suffix:
                continue
            url_column = f'{column}URL'
            if url_column in available:
                numbered.append((number, column, url_column))
        numbered.sort()
        return [(name_column, url_column) for _, name_column, url_column in numbered]

    def create_graph(self, data):
        """
        Inserts episodes and references into the Neo4j graph database.

        Args:
            data: DataFrame with the columns 'Episode' and 'Episode URL' and
                any number of 'Reference<i>' / 'Reference<i>URL' column pairs.

        Rows whose episode name or URL is missing are skipped with a message.
        A reference whose name contains "Naruhodo" is stored as another
        Episode node linked in both directions; every other reference becomes
        a Reference node linked from the episode.
        """
        # Discover the reference columns once from the frame itself, so no
        # reference is dropped just because its number exceeds a fixed limit.
        reference_columns = self._reference_columns(data.columns)

        with self.driver.session() as session:
            for index, row in data.iterrows():
                episode_name = row['Episode']
                episode_url = row['Episode URL']

                if pd.isna(episode_name) or pd.isna(episode_url):
                    print(f"Skipping episode at row {index} due to missing name or URL.")
                    continue

                # Create or merge the episode node
                session.run("""
                    MERGE (e:Episode {name: $episode_name, url: $episode_url})
                """, episode_name=episode_name, episode_url=episode_url)

                for name_column, url_column in reference_columns:
                    reference_name = row[name_column]
                    reference_url = row[url_column]

                    # Episodes with fewer references have NaN in the
                    # remaining columns of the wide frame.
                    if pd.isna(reference_name) or pd.isna(reference_url):
                        continue

                    if "Naruhodo" in reference_name:
                        # Treat as an episode with bidirectional relationship
                        session.run("""
                            MERGE (r:Episode {name: $reference_name, url: $reference_url})
                        """, reference_name=reference_name, reference_url=reference_url)

                        session.run("""
                            MATCH (e:Episode {name: $episode_name, url: $episode_url})
                            MATCH (r:Episode {name: $reference_name, url: $reference_url})
                            MERGE (e)-[:REFERENCED]->(r)
                            MERGE (r)-[:REFERENCED]->(e)
                        """, episode_name=episode_name, episode_url=episode_url, reference_name=reference_name, reference_url=reference_url)
                    else:
                        # Create reference node
                        session.run("""
                            MERGE (r:Reference {name: $reference_name, url: $reference_url})
                        """, reference_name=reference_name, reference_url=reference_url)

                        session.run("""
                            MATCH (e:Episode {name: $episode_name, url: $episode_url})
                            MATCH (r:Reference {name: $reference_name, url: $reference_url})
                            MERGE (e)-[:REFERENCED]->(r)
                        """, episode_name=episode_name, episode_url=episode_url, reference_name=reference_name, reference_url=reference_url)

    def close(self):
        """
        Closes the Neo4j connection.
        """
        self.driver.close()

In [6]:
# Phase 4: Main Process and Tests

def run_scraping_and_neo4j_import(uri=None, username=None, password=None):
    """
    Runs the full scraping and Neo4j import process.

    Each connection setting is resolved in this order: the explicit argument,
    then the NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD environment variable,
    then the local-development value this notebook originally hard-coded.

    Args:
        uri: Neo4j connection URI.
        username: Neo4j user name.
        password: Neo4j password.
    """
    uri = uri or os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    username = username or os.environ.get("NEO4J_USERNAME", "neo4j")
    # NOTE: the fallback below is a local-development default kept for
    # backward compatibility; set NEO4J_PASSWORD for any real database.
    password = password or os.environ.get("NEO4J_PASSWORD", "senha123")

    # Step 1: Scrape references
    all_episodes_data = scrape_references()
    df = save_to_dataframe(all_episodes_data)

    if df.empty:
        # A failed or empty scrape must not wipe the existing graph.
        print("No episodes scraped; leaving the Neo4j database untouched.")
        return

    # Step 2: Insert into Neo4j
    neo4j_handler = Neo4jHandler(uri=uri, username=username, password=password)
    try:
        neo4j_handler.clear_database()
        neo4j_handler.create_graph(df)
    finally:
        # Always release the driver, even when the import fails midway.
        neo4j_handler.close()

# Example usage
# Runs the whole scrape-and-import pipeline when this code is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    run_scraping_and_neo4j_import()


Scraping page 1...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-425-o-que-e-competitividade-parte-2-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-424-o-que-e-competitividade-parte-1-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-422-criancas-acreditam-em-contos-de-fadas/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-421-por-que-guardamos-segredos/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-420-maconha-faz-mal-parte-2-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-419-maconha-faz-mal-parte-1-de-2/?highlight=naruhodo...
Scraping page 2...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-411-por-que-traimos-parte-1-de-2/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-410-por-que-caes-correm-atras-de-veiculos-com-rodas/?highlight=naru

Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-292-por-que-mexemos-as-maos-quando-falamos/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-290-o-que-e-e-para-que-serve-o-teste-de-rorschach/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-289-ficamos-parecidos-com-nossos-pais-quando-envelhecemos/?highlight=naruhodo...
Scraping page 12...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-282-anotar-a-mao-e-melhor-que-com-computador/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-281-aprendemos-mais-quando-somos-punidos/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-280-por-que-as-pessoas-compartilham-fake-news/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-279-o-poder-corrompe/?highlight=naruhodo...
Scraping post https://www.b9.com.br/shows/naruhodo/naruhodo-278-o-que-e-singularidade-parte-2-d