# In [3]:
from bs4 import BeautifulSoup
import json
import re

# Location of the saved page markup
file_path = 'C:/Users/User/Documents/Pubrio_website_content.txt'

# Decode the file as UTF-8 and keep the whole document in memory
with open(file_path, mode='r', encoding='utf-8') as source_file:
    html_content = source_file.read()


# Build the parse tree using the 'html.parser' backend
soup = BeautifulSoup(html_content, features='html.parser')

# Function to extract the main topic of the article
def extract_topic(soup):
    """Return the article's main topic as plain text.

    The first ``<h1>``, ``<h2>`` and ``<title>`` elements are tried in that
    order, and the stripped text of the first one that has any text is
    returned. An element that exists but is empty (or whitespace-only) is
    skipped so that a later candidate can supply the topic instead of an
    empty string.

    Args:
        soup: Parsed document exposing ``find(tag_name)``, which returns an
            element with ``get_text()`` or ``None``.

    Returns:
        The topic text, or ``"Topic not found"`` when no candidate element
        yields non-empty text.
    """
    for tag_name in ('h1', 'h2', 'title'):
        element = soup.find(tag_name)
        if element is None:
            continue
        text = element.get_text().strip()
        # An empty heading carries no topic; keep looking.
        if text:
            return text
    return "Topic not found"

# Function to find potential company links based on <a> tags
def find_potential_company_links(soup, excluded_domains=("techcrunch.com",)):
    """Collect anchors that look like links to external companies.

    An ``<a>`` element is kept when all of the following hold:

    * its visible text is title-cased (``str.istitle()``),
    * its ``href`` contains ``"http"``,
    * its ``href`` contains none of ``excluded_domains``.

    Args:
        soup: Parsed document exposing ``find_all('a', href=True)``.
        excluded_domains: Substrings that disqualify an ``href``. A single
            string is treated as one domain. Defaults to
            ``("techcrunch.com",)``, the previously hard-coded filter.

    Returns:
        A list of dicts, in document order, with keys ``company_name``
        (the anchor text) and ``company_domain`` (the raw ``href``; it is
        not reduced to a bare domain here).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(excluded_domains, str):
        excluded_domains = (excluded_domains,)
    # Materialize once so a one-shot iterable survives every anchor check.
    excluded_domains = tuple(excluded_domains)

    company_links = []
    for a in soup.find_all('a', href=True):
        text = a.get_text(strip=True)
        href = a['href']
        if not text.istitle() or "http" not in href:
            continue
        if any(domain in href for domain in excluded_domains):
            continue
        company_links.append({'company_name': text, 'company_domain': href})
    return company_links

# Function to clean and extract domain names from URLs
def extract_domain(url):
    """Return the host of the first http(s) URL found in ``url``.

    A leading ``www.`` is dropped from the host. The host must end in a dot
    followed by at least two letters; when no such URL is found, ``url`` is
    returned unchanged.
    """
    found = re.search(r'https?://(www\.)?([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', url)
    if found is None:
        # Nothing URL-like to reduce; hand the input back as-is.
        return url
    # Group 2 is the host without the optional "www." prefix.
    return found.group(2)

# Extract the topic
topic = extract_topic(soup)

# Find all potential company links
potential_company_links = find_potential_company_links(soup)

# Link texts that pass the link heuristics but are not company names
# (here, a date string). Add more entries as needed.
excluded_names = {'October 27, 2023'}

# Filter and clean up the company links. Fresh dicts are built so the
# entries in potential_company_links keep their original, full URLs instead
# of being overwritten in place.
valid_company_links = [
    {
        'company_name': link['company_name'],
        'company_domain': extract_domain(link['company_domain']),
    }
    for link in potential_company_links
    if link['company_name'] not in excluded_names
]

# Prepare the final JSON data
final_json_data = {
    "related_companies": valid_company_links,
    "topic": topic
}

# Convert the Python dictionary to a JSON string
# (non-ASCII characters are emitted as \uXXXX escapes by default)
final_json_string = json.dumps(final_json_data, indent=4)

# Output the final JSON data to a file. The encoding is stated explicitly,
# matching the UTF-8 read of the input, rather than relying on the
# platform default.
final_json_path = 'C:/Users/User/Documents/extracted_data.json'
with open(final_json_path, 'w', encoding='utf-8') as json_file:
    json_file.write(final_json_string)

# Print the final JSON data
print(final_json_string)


{
    "related_companies": [
        {
            "company_name": "Bloomberg",
            "company_domain": "bloomberg.com"
        },
        {
            "company_name": "Reuters",
            "company_domain": "reuters.com"
        }
    ],
    "topic": "X is launching two new subscription tiers, including a \u2018Premium+\u2019 ad-free plan"
}
