# Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import pdfplumber  # For extracting text from PDFs

## 1. Extract Boardgame Names and Rulebook Links

In [2]:
# Base URL of the website
base_url = "https://en.1jour-1jeu.com/rules?page="

# Directory to save downloaded rulebooks
os.makedirs("rulebooks", exist_ok=True)

# Initialize list for storing data
boardgames_data = []

# Number of pages (update this after checking total pages on the website)
num_pages = 394

In [3]:
# Step 1: Scrape boardgame names and rulebook links
for page in range(1, num_pages + 1):
    url = base_url + str(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all boardgame entries
    games = soup.find_all("h3", class_="mb-0")      
    for game in games:
        name = game.text.strip()  # Extract boardgame name
        parent = game.find_parent("div", class_="col-center")
        link = parent.find("a", href=True) 
        
        if link:
            rulebook_url = link["href"]
            boardgames_data.append({"Name": name, "Rulebook_URL": rulebook_url})

# Convert to a DataFrame
df = pd.DataFrame(boardgames_data)

# Step 2: Download rulebooks and extract text
df["Rulebook_Text"] = "" 

for i, row in df.iterrows():
    rulebook_url = row["Rulebook_URL"]
    try:
        # Download the PDF
        response = requests.get(rulebook_url)
        file_path = f"rulebooks/{row['Name'].replace(' ', '_')}.pdf"
        with open(file_path, "wb") as f:
            f.write(response.content)
        
        # Extract text from PDF
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""  # Extract text from each page
            df.at[i, "Rulebook_Text"] = text.strip()
    except Exception as e:
        print(f"Error processing {row['Name']}: {e}")

# Step 3: Save final dataset
df.drop(columns=["Rulebook_URL"], inplace=True)
df.to_csv("boardgames_with_rulebooks.csv", index=False)

print("Dataset created successfully!")

Error processing À l'Heure du Crime... Où Étiez-Vous ? Règle: [Errno 22] Invalid argument: "rulebooks/À_l'Heure_du_Crime..._Où_Étiez-Vous_?_Règle.pdf"
Error processing BANG ! High Noon/A Fistful of Cards Rulebook: [Errno 2] No such file or directory: 'rulebooks/BANG_!_High_Noon/A_Fistful_of_Cards_Rulebook.pdf'
Error processing C'est qui le Plus Fort ? Règle: [Errno 22] Invalid argument: "rulebooks/C'est_qui_le_Plus_Fort_?_Règle.pdf"
Error processing Cluedo: 50 - 1949 / 1999 Règle: [Errno 22] Invalid argument: 'rulebooks/Cluedo:_50_-_1949_/_1999_Règle.pdf'
Error processing Cluedo: Junior: L'affaire des Jouets Cachés ! Règle: [Errno 22] Invalid argument: "rulebooks/Cluedo:_Junior:_L'affaire_des_Jouets_Cachés_!_Règle.pdf"
Error processing Conjudingo CM1/CM2 Règle: [Errno 2] No such file or directory: 'rulebooks/Conjudingo_CM1/CM2_Règle.pdf'
Error processing Crazy Tower: Construction / Sabotage Règle: [Errno 22] Invalid argument: 'rulebooks/Crazy_Tower:_Construction_/_Sabotage_Règle.pdf'
E