## prompt

https://www.samsung.com/fr/ 와 https://www.samsung.com/ca_fr/ 사이트의 링크를 순회하면서 HTML 문서를 읽어와서 텍스트 파일만 수집하는 코드를 작성하고, 해당 사이트에 있는 url, text 항목으로 SQLite 데이터베이스를 만들어서 저장할 것. 수집하는 과정을 확인하는 코드도 작성할 것. 테이블에 fr, ca_fr을 구분하는 컬럼도 추가할 것.

In [None]:
import sqlite3
import time
from collections import deque
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

def create_database():
    """Open ``samsung_data.db`` and make sure the ``samsung_pages`` table exists.

    The table has three TEXT columns: ``url`` (primary key), ``text`` and
    ``country``.

    Returns:
        The open ``sqlite3.Connection``. The caller is responsible for
        closing it.
    """
    schema = '''CREATE TABLE IF NOT EXISTS samsung_pages
                 (url TEXT PRIMARY KEY, text TEXT, country TEXT)'''

    connection = sqlite3.connect('samsung_data.db')
    # Connection.execute creates a temporary cursor and runs the statement.
    connection.execute(schema)
    connection.commit()
    return connection

def insert_data(conn, url, text, country):
    """Store one crawled page in ``samsung_pages`` and commit immediately.

    Args:
        conn: Open SQLite connection that already has the ``samsung_pages``
            table.
        url: Page URL; it is the primary key, so an existing row with the
            same URL is replaced.
        text: Text extracted from the page.
        country: Tag identifying which site the page came from.
    """
    row = (url, text, country)
    upsert = "INSERT OR REPLACE INTO samsung_pages (url, text, country) VALUES (?, ?, ?)"
    conn.execute(upsert, row)
    conn.commit()

def crawl_website(base_url, country, *, delay=1.0, timeout=10.0, max_pages=None):
    """Breadth-first crawl of every page under ``base_url``, saving page text.

    Each page that answers with HTTP 200 and HTML content is parsed, the text
    of its ``<p>`` elements is joined with single spaces, and the result is
    stored through ``insert_data`` tagged with ``country``. Links that stay
    under ``base_url`` are queued for later visits.

    Args:
        base_url: Start URL; only URLs beginning with this prefix are followed.
        country: Tag written to the ``country`` column for every stored page.
        delay: Seconds to wait between consecutive requests.
        timeout: Seconds after which a single HTTP request is abandoned.
        max_pages: Optional upper bound on the number of pages stored;
            ``None`` means no limit.
    """
    # Every URL that has ever been queued. Recording URLs when they are
    # enqueued (not after a successful fetch) guarantees each one is
    # requested at most once, including those that fail or return non-200.
    seen = {base_url}
    to_visit = deque([base_url])
    pages_saved = 0

    conn = create_database()
    try:
        while to_visit:
            if max_pages is not None and pages_saved >= max_pages:
                break

            current_url = to_visit.popleft()
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(current_url, timeout=timeout)

                # Skip PDFs, images and similar; a missing header is treated
                # as HTML so such pages are still processed.
                content_type = response.headers.get('Content-Type', '')
                is_html = not content_type or 'html' in content_type.lower()

                if response.status_code == 200 and is_html:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Extract text
                    text = ' '.join(
                        p.get_text(strip=True) for p in soup.find_all('p')
                    )

                    # Insert data into database
                    insert_data(conn, current_url, text, country)
                    pages_saved += 1

                    print(f"Crawled: {current_url}")

                    # Find new links. Relative hrefs must be resolved against
                    # the page they appear on (after redirects), and the
                    # "#fragment" part is dropped because it names the same
                    # document.
                    for link in soup.find_all('a', href=True):
                        absolute = urljoin(response.url, link['href'])
                        new_url = urldefrag(absolute).url
                        if new_url.startswith(base_url) and new_url not in seen:
                            seen.add(new_url)
                            to_visit.append(new_url)
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print(f"Error crawling {current_url}: {e}")

            # Be respectful to the server, also after failed requests; there
            # is nothing to wait for once the queue is empty.
            if to_visit:
                time.sleep(delay)
    finally:
        # Close the connection even if the crawl is interrupted.
        conn.close()

def verify_data():
    """Print a summary of the rows stored in ``samsung_data.db``.

    Prints the total row count of ``samsung_pages``, the row count per
    ``country`` value, and up to five sample rows with the first 100
    characters of their text.
    """
    db = sqlite3.connect('samsung_data.db')
    cur = db.cursor()

    # Overall number of stored pages.
    (row_total,) = cur.execute("SELECT COUNT(*) FROM samsung_pages").fetchone()
    print(f"Total entries in database: {row_total}")

    # Number of stored pages per country tag.
    per_country = cur.execute(
        "SELECT country, COUNT(*) FROM samsung_pages GROUP BY country"
    ).fetchall()
    for tag, n_rows in per_country:
        print(f"Entries for {tag}: {n_rows}")

    # A handful of rows to eyeball the stored content.
    samples = cur.execute("SELECT * FROM samsung_pages LIMIT 5").fetchall()
    print("\nSample entries:")
    for row in samples:
        page_url, body, tag = row[0], row[1], row[2]
        print(f"URL: {page_url}")
        print(f"Text preview: {body[:100]}...")
        print(f"Country: {tag}")
        print("---")

    db.close()

# Main execution
if __name__ == "__main__":
    # Start URLs; kept as module-level names so later cells can reuse them.
    france_url = "https://www.samsung.com/fr/"
    canada_fr_url = "https://www.samsung.com/ca_fr/"

    # (progress banner, start URL, country tag) for each site, in crawl order.
    crawl_jobs = (
        ("Crawling Samsung France website...", france_url, "fr"),
        ("\nCrawling Samsung Canada (French) website...", canada_fr_url, "ca_fr"),
    )
    for banner, start_url, site_tag in crawl_jobs:
        print(banner)
        crawl_website(start_url, site_tag)

    # Summarize what ended up in the database.
    print("\nVerifying collected data...")
    verify_data()

In [None]:
# Re-run only the Canadian French crawl. This cell relies on `canada_fr_url`
# and `crawl_website` already being defined by the previous cell.
print("\nCrawling Samsung Canada (French) website...")
crawl_website(canada_fr_url, "ca_fr")