# Parsing a WARC File

### 🛠 Installing Necessary Libraries

In [1]:
pip install warcio beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
import sqlite3
import time

def process_warc_file_in_chunks(warc_path, db_path, batch_size=100):
    """
    Stream a WARC file and store its response records in SQLite in batches.

    Every record whose type is 'response' is decoded as UTF-8 (undecodable
    bytes are replaced), parsed with BeautifulSoup to extract the <title>
    text, and queued as a (url, title, html) row. Queued rows are inserted
    into the `pages` table and committed each time `batch_size` rows have
    accumulated, and once more at the end for any remainder. Progress and
    summary statistics are printed to stdout.

    A record that raises while being processed is reported on stdout and
    skipped; it is not counted in the totals.

    Args:
        warc_path: Path of the WARC file to read (opened in binary mode).
        db_path: Path of the SQLite database, passed to sqlite3.connect.
        batch_size: Number of rows to accumulate before each insert/commit.

    Raises:
        OSError: If the WARC file cannot be opened. The database connection
            is closed before the exception propagates.
    """
    insert_sql = 'INSERT INTO pages (url, title, html) VALUES (?, ?, ?)'

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()

        # Create the target table on first use.
        c.execute('CREATE TABLE IF NOT EXISTS pages (id INTEGER PRIMARY KEY, url TEXT, title TEXT, html TEXT)')

        # Processing statistics.
        start_time = time.time()
        total_records = 0
        batch_records = []

        # Open the WARC file and walk its records one at a time.
        with open(warc_path, 'rb') as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != 'response':
                    continue

                # Reset per record so a failure never reports the URL of the
                # previously processed record.
                url = None
                try:
                    url = record.rec_headers.get_header('WARC-Target-URI')
                    raw_html = record.content_stream().read().decode('utf-8', errors='replace')

                    soup = BeautifulSoup(raw_html, 'html.parser')
                    title_node = soup.title.string if soup.title else None
                    # .string is a NavigableString that references the whole
                    # parse tree; str() detaches it so the tree can be freed
                    # while the row waits in the batch. It is None when the
                    # <title> has no single string child, stored here as ''.
                    title = str(title_node) if title_node is not None else ''

                    batch_records.append((url, title, raw_html))
                    total_records += 1

                    # Flush to the database once a full batch has accumulated.
                    if len(batch_records) >= batch_size:
                        c.executemany(insert_sql, batch_records)
                        conn.commit()

                        elapsed_time = time.time() - start_time
                        print(f"Ä°ÅŸlenen kayÄ±t: {total_records}, GeÃ§en sÃ¼re: {elapsed_time:.2f} saniye")

                        # Drop the flushed rows to release memory.
                        batch_records = []

                except Exception as e:
                    # Best-effort import: report the failing record and move on.
                    print(f"Hata oluÅŸtu: {e} - URL: {url if url is not None else 'bilinmiyor'}")

        # Store whatever is left over from the last partial batch.
        if batch_records:
            c.executemany(insert_sql, batch_records)
            conn.commit()

        # Print summary statistics.
        total_time = time.time() - start_time
        # Guard against a zero elapsed time (coarse clock, empty input).
        records_per_second = total_records / total_time if total_time > 0 else 0.0
        print(f"Toplam iÅŸlenen kayÄ±t: {total_records}")
        print(f"Toplam geÃ§en sÃ¼re: {total_time:.2f} saniye")
        print(f"Saniyede ortalama kayÄ±t: {records_per_second:.2f}")
    finally:
        # Always release the connection, even if reading the WARC file fails.
        conn.close()

# Run the import: load the local WARC archive into SQLite, 500 rows per batch
warc_file, db_file = "./website_data.warc", "websites.db"
process_warc_file_in_chunks(warc_path=warc_file, db_path=db_file, batch_size=500)

Ä°ÅŸlenen kayÄ±t: 500, GeÃ§en sÃ¼re: 49.11 saniye
Ä°ÅŸlenen kayÄ±t: 1000, GeÃ§en sÃ¼re: 92.11 saniye
Hata oluÅŸtu: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at '<![ï¿½ï¿½Pï¿½ï¿½ï¿½aï¿½9\x02l~Qï¿½ï¿½ï¿½ï¿½' - URL: https://files.dainikshiksha.com/136301/conversions/najirpur-thumb.webp
Ä°ÅŸlenen kayÄ±t: 1500, GeÃ§en sÃ¼re: 139.04 saniye
Ä°ÅŸlenen kayÄ±t: 2000, GeÃ§en sÃ¼re: 181.75 saniye
Ä°ÅŸlenen kayÄ±t: 2500, GeÃ§en sÃ¼re: 208.13 saniye
Ä°ÅŸlenen kayÄ±t: 3000, GeÃ§en sÃ¼re: 232.13 saniye
Ä°ÅŸlenen kayÄ±t: 3500, GeÃ§en sÃ¼re: 254.73 saniye
Ä°ÅŸlenen kayÄ±t: 4000, GeÃ§en sÃ¼re: 278.45 saniye
Ä°ÅŸlenen kayÄ±t: 4500, GeÃ§en sÃ¼re: 300.51 saniye
Ä°ÅŸlenen kayÄ±t: 5000, GeÃ§en sÃ¼re: 322.71 saniye
Ä°ÅŸlenen kayÄ±t: 5500, GeÃ§en sÃ¼re: 365.81 saniye
Ä°ÅŸlenen kayÄ±t: 6000, GeÃ§en sÃ¼re: 410.47 saniye
Ä°ÅŸlenen kayÄ±t: 6500, GeÃ§en sÃ¼re: 450.23 saniye
Ä

  k = self.parse_starttag(i)


Ä°ÅŸlenen kayÄ±t: 8500, GeÃ§en sÃ¼re: 584.87 saniye
Ä°ÅŸlenen kayÄ±t: 9000, GeÃ§en sÃ¼re: 612.02 saniye
Ä°ÅŸlenen kayÄ±t: 9500, GeÃ§en sÃ¼re: 660.96 saniye
Ä°ÅŸlenen kayÄ±t: 10000, GeÃ§en sÃ¼re: 692.42 saniye
Ä°ÅŸlenen kayÄ±t: 10500, GeÃ§en sÃ¼re: 717.93 saniye
Ä°ÅŸlenen kayÄ±t: 11000, GeÃ§en sÃ¼re: 740.11 saniye
Ä°ÅŸlenen kayÄ±t: 11500, GeÃ§en sÃ¼re: 756.56 saniye
Ä°ÅŸlenen kayÄ±t: 12000, GeÃ§en sÃ¼re: 769.75 saniye
Ä°ÅŸlenen kayÄ±t: 12500, GeÃ§en sÃ¼re: 783.53 saniye
Ä°ÅŸlenen kayÄ±t: 13000, GeÃ§en sÃ¼re: 798.41 saniye
Ä°ÅŸlenen kayÄ±t: 13500, GeÃ§en sÃ¼re: 818.17 saniye
Ä°ÅŸlenen kayÄ±t: 14000, GeÃ§en sÃ¼re: 836.72 saniye
Ä°ÅŸlenen kayÄ±t: 14500, GeÃ§en sÃ¼re: 856.90 saniye
Ä°ÅŸlenen kayÄ±t: 15000, GeÃ§en sÃ¼re: 874.91 saniye
Ä°ÅŸlenen kayÄ±t: 15500, GeÃ§en sÃ¼re: 891.80 saniye
Ä°ÅŸlenen kayÄ±t: 16000, GeÃ§en sÃ¼re: 906.40 saniye
Ä°ÅŸlenen kayÄ±t: 16500, GeÃ§en sÃ¼re: 926.27 saniye
Ä°ÅŸlenen kayÄ±t: 17000, GeÃ§en sÃ¼re: 945.42 saniye
Ä°ÅŸlenen kayÄ±t: 17500, GeÃ§en sÃ¼re: 961.74 san