In [3]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# fetch text content
def fetch_text_content(base_url, relative_url):
    full_url = urljoin(base_url, relative_url)
    try:
        response = requests.get(full_url)
        response.raise_for_status()  # check for HTTP request errors
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {full_url}: {e}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

# base URL
base_url = 'https://www.haberturk.com'

# get URLs from urls.json
with open('urls.json', 'r') as file:
    data = json.load(file)

# only the unique URLs
unique_urls = data.get('unique_urls', [])
total_urls = len(unique_urls)

# dictionary to store the text content for each URL
text_data = {}

# process all URLs
for index, relative_url in enumerate(unique_urls):
    progress = f"({index + 1}/{total_urls})"
    full_url = urljoin(base_url, relative_url)
    print(f"{progress} Analyzing {full_url}...")
    
    text_content = fetch_text_content(base_url, relative_url)
    if text_content:
        text_data[full_url] = text_content
        print(f"{progress} Successfully fetched text from {full_url}")
    else:
        print(f"{progress} No text content found for {full_url}")

# write data to a JSON file
with open('texts.json', 'w', encoding='utf-8') as json_file:
    json.dump(text_data, json_file, ensure_ascii=False, indent=4)

# write data to an HTML file
with open('texts.html', 'w', encoding='utf-8') as html_file:
    html_file.write('<html><body>\n')
    for url, text in text_data.items():
        html_file.write(f'<h2>{url}</h2>\n')
        html_file.write(f'<pre>{text}</pre>\n')
    html_file.write('</body></html>\n')

print("All URLs processed and data saved to output.json and output.html.")


(1/536) Analyzing https://www.haberturk.com/images/common/manifest/180x180.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(1/536) Successfully fetched text from https://www.haberturk.com/images/common/manifest/180x180.png
(2/536) Analyzing https://www.haberturk.com/images/common/favicon/32x32.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(2/536) Successfully fetched text from https://www.haberturk.com/images/common/favicon/32x32.png
(3/536) Analyzing https://www.haberturk.com/images/common/favicon/16x16.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(3/536) Successfully fetched text from https://www.haberturk.com/images/common/favicon/16x16.png
(4/536) Analyzing https://www.haberturk.com/images/common/favicon/favicon.ico?v=001...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(4/536) Successfully fetched text from https://www.haberturk.com/images/common/favicon/favicon.ico?v=001
(5/536) Analyzing https://www.haberturk.com/manifest.webmanifest...
(5/536) Successfully fetched text from https://www.haberturk.com/manifest.webmanifest
(6/536) Analyzing https://im.haberturk.com...
(6/536) Successfully fetched text from https://im.haberturk.com
(7/536) Analyzing https://adsp.haberturk.com...
(7/536) Successfully fetched text from https://adsp.haberturk.com
(8/536) Analyzing https://im.hthayat.com...
Failed to retrieve https://im.hthayat.com: HTTPSConnectionPool(host='im.hthayat.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))
(8/536) No text content found for https://im.hthayat.com
(9/536) Analyzing https://geoim.bloomberght.com...
Failed to retrieve https://geoim.bloomberght.com: 403 Client Error: Forbidden for ur

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(22/536) Successfully fetched text from https://im.haberturk.com/l/2024/06/19/ver1718799771/3696380/jpg/320x640
(23/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718831744/3696460/jpg/320x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(23/536) Successfully fetched text from https://im.haberturk.com/l/2024/06/20/ver1718831744/3696460/jpg/320x640
(24/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718863122/3696489_manset/jpg/640x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(24/536) Successfully fetched text from https://im.haberturk.com/l/2024/06/20/ver1718863122/3696489_manset/jpg/640x640
(25/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718858199/3696483_manset/jpg/640x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(25/536) Successfully fetched text from https://im.haberturk.com/l/2024/06/20/ver1718858199/3696483_manset/jpg/640x640
(26/536) Analyzing https://www.haberturk.com/adscbg/cbglout.js...
(26/536) Successfully fetched text from https://www.haberturk.com/adscbg/cbglout.js
(27/536) Analyzing https://www.haberturk.com/ekonomi...
(27/536) Successfully fetched text from https://www.haberturk.com/ekonomi
(28/536) Analyzing https://www.haberturk.com/ekonomi/altin...
(28/536) Successfully fetched text from https://www.haberturk.com/ekonomi/altin
(29/536) Analyzing https://www.haberturk.com/ekonomi/piyasa/88-dolar...
(29/536) Successfully fetched text from https://www.haberturk.com/ekonomi/piyasa/88-dolar
(30/536) Analyzing https://www.haberturk.com/ekonomi/piyasa/89-euro...
(30/536) Successfully fetched text from https://www.haberturk.com/ekonomi/piyasa/89-euro
(31/536) Analyzing https://www.haberturk.com/ekonomi/altin/610-gram-altin...
(31/536) Successfully fetched text from https://www.habertur