# Collecting Turkish website URLs
We have WARC files and news websites list from Basin Ilan Kurumu.\
First we will try to find websites that have Turkish Content from WARC files. \
Then we will merge this data with BIK's list.

### Install required packages

In [10]:
pip install warcio tqdm

Note: you may need to restart the kernel to use updated packages.


### Constants

In [11]:
import os

data_folder = './data'
warc_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.warc')]
print("WARC dosyaları:", warc_files)

WARC dosyaları: ['./data/CC-NEWS-20250103095544-00180.warc', './data/CC-NEWS-20250102092617-00168.warc', './data/CC-NEWS-20250101055537-00157.warc', './data/CC-NEWS-20250101233509-00164.warc', './data/CC-NEWS-20250102214004-00175.warc', './data/CC-NEWS-20250103181014-00185.warc', './data/CC-NEWS-20250103145501-00183.warc', './data/CC-NEWS-20250101110352-00159.warc', './data/CC-NEWS-20250102074538-00167.warc', './data/CC-NEWS-20250102110149-00169.warc', './data/CC-NEWS-20200110212037-00310.warc', './data/CC-NEWS-20250102140759-00171.warc', './data/CC-NEWS-20250102053830-00166.warc', './data/CC-NEWS-20250103060152-00178.warc', './data/CC-NEWS-20250101182853-00162.warc', './data/CC-NEWS-20250102155145-00172.warc', './data/CC-NEWS-20250101204758-00163.warc', './data/CC-NEWS-20250103080404-00179.warc', './data/CC-NEWS-20250102192440-00174.warc', './data/CC-NEWS-20250102122438-00170.warc', './data/CC-NEWS-20250102173620-00173.warc', './data/CC-NEWS-20250103114344-00181.warc', './data/CC-NEWS

### Collect URLs that have Turkish content

In [None]:
import re
from tqdm import tqdm
from warcio.archiveiterator import ArchiveIterator
from urllib.parse import urlparse

def is_turkish_text(content, min_unique=3, debug=False):
    """
    İçerikteki Türkçe karakterler arasında, farklı (unique) karakter sayısı min_unique'ten fazla ise
    içerik Türkçe kabul edilir.
    """
    # Türkçe karakterler: ç, ğ, ı, ö, ş, ü (büyük ve küçük)
    turkish_chars = "çğıöşüÇĞİÖŞÜ"
    matches = re.findall(f"[{turkish_chars}]", content)
    unique_chars = set(matches)
    if debug:
        print(f"Bulunan karakterler: {matches}")
        print(f"Unique karakterler: {unique_chars}")
    return len(unique_chars) > min_unique

def list_turkish_urls_by_chars(warc_file, read_bytes=20000, debug=False):
    """Bir WARC dosyasındaki, içerikte Türkçe karakterlerin varlığına bakarak Türkçe URL'leri döndürür."""
    turkish_urls = []
    try:
        with open(warc_file, 'rb') as f:
            for record in ArchiveIterator(f):
                if record.rec_type == 'response':
                    try:
                        content = record.content_stream().read(read_bytes).decode('utf-8', errors='ignore')
                        if content.strip() and is_turkish_text(content, debug=debug):
                            url = record.rec_headers.get_header('WARC-Target-URI')
                            if url:
                                turkish_urls.append(url)
                    except Exception as error:
                        if debug:
                            print("Record işlenirken hata:", error)
                        continue
    except Exception as e:
        print(f"Error processing {warc_file}: {e}")
    return turkish_urls

# WARC dosyalarını işleyip Türkçe URL'leri toplayalım
all_turkish_urls = []
all_turkish_urls_counts = {}

for warc_file in tqdm(warc_files, desc="Processing WARC files"):
    urls = list_turkish_urls_by_chars(warc_file, debug=False)
    all_turkish_urls_counts[warc_file] = len(urls)
    all_turkish_urls.extend(urls)

print(f"\nToplamda {len(all_turkish_urls)} adet Türkçe URL bulundu.")
print("\nWARC dosyalarındaki Türkçe URL sayıları:\n", all_turkish_urls_counts)

# all_turkish_urls listesinde bulunan tüm URL'lerden unique domain'leri alalım
unique_domains = {urlparse(url).netloc for url in all_turkish_urls if url}

print("Unique domains:", unique_domains)
for domain in unique_domains:
    print(domain)

# unique_domains değişkeninin oluşturulmuş olduğunu varsayıyoruz.
with open('./websites/warc-turkish-url-list.txt', 'w', encoding='utf-8') as f:
    for domain in sorted(unique_domains):
        f.write(domain + "\n")

print(f"Exported {len(unique_domains)} domains to warc-turkish-url-list.txt")



Processing WARC files:  90%|█████████ | 28/31 [10:29<01:09, 23.22s/it]

### Merge WARC URLs with BIK URLs