In [None]:
!pip install requests beautifulsoup4

In [3]:
import json
import os
import random
import re
import time
import unicodedata
from datetime import datetime
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class JibayaMetadataCollector:
    """
    Collect metadata about tax documents from the JIBAYA website, save it to a
    JSON file, and then download the documents based on that metadata.

    The crawl is shallow: the docs page is scanned for PDF links, then every
    same-site link whose text looks document-related is fetched once and
    scanned for PDF links as well.
    """

    # Link-text fragments that mark a link as worth following in search of PDFs.
    _DOC_PAGE_TERMS = (
        'loi', 'décret', 'circulaire', 'note', 'document', 'texte',
        'fiscal', 'impôt', 'finance', 'catégorie', 'année'
    )

    # Substrings used by _get_document_type, in priority order.
    _NOTE_KEYWORDS = ('note', 'commun', 'عامة', 'مذكرة')
    # 'receuil' is the spelling the original code looked for; 'recueil' is the
    # actual French word and is what document titles contain.
    _CODE_KEYWORDS = ('code', 'receuil', 'recueil', 'cdpf')
    _LAW_KEYWORDS = ('loi', 'financ')

    # Country names that mark a document as a tax convention. They are stored in
    # the same normalised form that _normalize_for_matching produces (lower case,
    # no accents, punctuation replaced by single spaces).
    # NOTE(review): several spellings look unusual ('lybie', 'luxemburg',
    # 'burkinafaso', ...) — confirm they match how the site writes them.
    _COUNTRIES = (
        'yemen', 'vietnam', 'uma', 'turquie', 'syrie', 'sultanat d oman', 'suisse', 'suede', 'sudan', 'slovaquie',
        'serbie', 'senegal', 'royaume uni', 'roumanie', 'republique tcheque', 'qatar', 'portugal', 'pologne',
        'pays bas', 'pakistan', 'norvege', 'mauritanie', 'maroc', 'malte', 'mali', 'lybie', 'luxemburg', 'liban',
        'koweit', 'jordanie', 'italie', 'iran', 'indonesie', 'ile de maurice', 'hongrie', 'grece', 'france',
        'ethiopie', 'espagne', 'emirates arabes unis', 'egypte', 'danemark', 'cote d ivoire', 'coree du sud',
        'chine', 'canada', 'cameroun', 'burkinafaso', 'belgique', 'autriche', 'arabie saoudite', 'amerique',
        'allemagne', 'algerie', 'afrique de sud'
    )

    # Year patterns. The first two use digit look-arounds instead of \b because
    # \b treats '_' as a word character and would miss "LF-2023_compressed.pdf".
    _YEAR_PATTERNS = (
        r'(?<!\d)(20\d{2})(?!\d)',          # Standard years like 2021
        r'(?<!\d)(٢٠\d{2})(?!\d)',          # Years written with Arabic-Indic digits
        r'(?:année|year)[^\d]*(20\d{2})',   # Years preceded by "année" or "year"
        r'[^\d]*(20\d{2})[^\d]*(?:fiscale|fiscal)'  # Years associated with fiscal periods
    )

    def __init__(self, docs_url="http://jibaya.tn/docs/", min_year=2021,
                 output_folder="jibaya_documents", metadata_file="jibaya_metadata.json",
                 verify_ssl=False, skip_unknown_year=True, timeout=30,
                 results_file="jibaya_download_results.json"):
        """
        Initialize the collector and create the output folder.

        Args:
            docs_url: The URL of the JIBAYA docs page
            min_year: The minimum year for documents to include
            output_folder: The folder where documents will be saved
            metadata_file: The JSON file to store document metadata
            verify_ssl: Whether to verify SSL certificates. The default (False)
                disables certificate checks; pass True when the site's
                certificate is valid.
            skip_unknown_year: Whether to skip documents with unknown years
            timeout: Seconds to wait for the server on each request, so that a
                stalled connection cannot hang the run indefinitely
            results_file: The JSON file that receives the download report
        """
        self.docs_url = docs_url
        self.min_year = min_year
        self.output_folder = output_folder
        self.metadata_file = metadata_file
        self.verify_ssl = verify_ssl
        self.skip_unknown_year = skip_unknown_year
        self.timeout = timeout
        self.results_file = results_file
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Create output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        # Keep track of processed URLs to avoid duplicates
        self.processed_urls = set()

    @staticmethod
    def _normalize_for_matching(text):
        """Lower-case text, strip accents and turn punctuation runs into single spaces."""
        decomposed = unicodedata.normalize('NFKD', (text or '').lower())
        unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        return re.sub(r'[\W_]+', ' ', unaccented).strip()

    @staticmethod
    def _is_pdf_link(href):
        """Return True when href points at a .pdf, ignoring any query string or fragment."""
        lowered = href.lower()
        bare_path = lowered.split('#', 1)[0].split('?', 1)[0]
        return lowered.endswith('.pdf') or bare_path.endswith('.pdf')

    @staticmethod
    def _filename_from_href(href):
        """Return the last path segment of href without its query string or fragment."""
        return href.split('#', 1)[0].split('/')[-1].split('?')[0]

    @staticmethod
    def _resolve_url(base_url, href):
        """Join href onto base_url and drop the fragment; return None for a malformed URL."""
        try:
            return urljoin(base_url, href).split('#', 1)[0]
        except ValueError:
            return None

    def _is_same_site(self, url):
        """
        Return True when url is an http(s) URL on the same host as docs_url
        (a leading "www." is ignored and sub-domains are accepted).
        """
        try:
            target = urlparse(url)
            target_host = target.hostname or ''
            home_host = urlparse(self.docs_url).hostname or ''
        except ValueError:
            return False

        if target.scheme not in ('http', 'https') or not home_host:
            return False

        if target_host.startswith('www.'):
            target_host = target_host[4:]
        if home_host.startswith('www.'):
            home_host = home_host[4:]
        return target_host == home_host or target_host.endswith('.' + home_host)

    def _extract_year_from_text(self, text):
        """Return the largest 20xx year found in text, or None when there is none."""
        if not text:
            return None

        # int() accepts Arabic-Indic digits as well, so both scripts convert directly.
        years = [
            int(found)
            for pattern in self._YEAR_PATTERNS
            for found in re.findall(pattern, text, re.IGNORECASE)
        ]
        return max(years) if years else None

    def _get_document_type(self, text):
        """
        Determine the document type from text using priority rules.

        The first matching rule wins: notes communes, then codes/recueils, then
        lois de finances, then conventions (a country name appearing as a whole
        word). Anything else is "autres_documents".
        """
        text_lower = (text or '').lower()

        # 1. Notes communes (French or Arabic keywords)
        if any(keyword in text_lower for keyword in self._NOTE_KEYWORDS):
            return "notes_communes"

        # 2. Codes and recueils (the category name keeps its historical spelling,
        #    because it is also used as the download sub-folder name)
        if any(keyword in text_lower for keyword in self._CODE_KEYWORDS):
            return "receuils"

        # 3. Lois de finances
        if any(keyword in text_lower for keyword in self._LAW_KEYWORDS):
            return "lois_de_finances"

        # 4. Conventions: compare on normalised text so that accents, hyphens and
        #    apostrophes ("Tunisie-France", "Algérie", "Côte d'Ivoire") still match.
        padded = f" {self._normalize_for_matching(text)} "
        if any(f" {country} " in padded for country in self._COUNTRIES):
            return "conventions"

        # 5. Default category - will be skipped
        return "autres_documents"

    def collect_metadata(self):
        """
        Collect metadata about all relevant documents and save to a JSON file.

        Returns:
            A list of document metadata dictionaries. An empty list is returned
            when the docs page cannot be fetched or the metadata file cannot be
            written (the error is printed).
        """
        documents = []

        # Start each crawl from a clean slate; otherwise a second call on the same
        # instance would treat every URL as already processed and find nothing.
        self.processed_urls = set()

        try:
            print(f"Accessing docs page: {self.docs_url}")
            response = self.session.get(self.docs_url, verify=self.verify_ssl, timeout=self.timeout)
            response.raise_for_status()

            # Relative links must be resolved against the URL that was actually
            # served, which differs from docs_url after a redirect.
            base_url = response.url or self.docs_url
            self.processed_urls.update({self.docs_url, base_url})

            soup = BeautifulSoup(response.text, 'html.parser')

            # Process direct PDF links
            self._process_page_for_metadata(soup, base_url, documents)

            # Find and follow links that might lead to pages with PDFs
            for link in soup.find_all('a'):
                if not link.has_attr('href'):
                    continue

                href = link['href']
                link_text = link.get_text().strip()

                # Skip irrelevant links (PDFs were handled above)
                if not link_text or href.startswith('#') or self._is_pdf_link(href):
                    continue

                # Only follow links whose text suggests a page with documents
                lowered_text = link_text.lower()
                if not any(term in lowered_text for term in self._DOC_PAGE_TERMS):
                    continue

                doc_url = self._resolve_url(base_url, href)
                if doc_url is None:
                    print(f"Error processing page {href}: malformed URL")
                    continue

                # Skip external links, non-HTTP schemes and pages already visited
                if not self._is_same_site(doc_url) or doc_url in self.processed_urls:
                    continue

                self.processed_urls.add(doc_url)

                print(f"Following potential document page: {link_text} - {doc_url}")
                try:
                    doc_response = self.session.get(doc_url, verify=self.verify_ssl, timeout=self.timeout)
                    doc_response.raise_for_status()

                    doc_soup = BeautifulSoup(doc_response.text, 'html.parser')

                    # Process this page for metadata
                    self._process_page_for_metadata(doc_soup, doc_response.url or doc_url, documents)

                except Exception as e:
                    # Best effort: one broken page must not abort the whole crawl.
                    print(f"Error processing page {href}: {e}")
                finally:
                    # Wait a bit to be polite, whether or not the request succeeded
                    time.sleep(random.uniform(0.5, 1.5))

            # Save metadata to JSON file
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'metadata_collected_at': datetime.now().isoformat(),
                    'documents': documents
                }, f, ensure_ascii=False, indent=2)

            print(f"Metadata collected for {len(documents)} documents and saved to {self.metadata_file}")
            return documents

        except Exception as e:
            print(f"Error collecting metadata: {e}")
            return []

    def _process_page_for_metadata(self, soup, page_url, documents):
        """
        Process a page to extract metadata for PDF documents.

        A PDF link is recorded when it has not been seen before, its year is
        known (unless skip_unknown_year is False) and not older than min_year,
        and its link text classifies as something other than "autres_documents".

        Args:
            soup: BeautifulSoup object for the page
            page_url: URL of the page being processed
            documents: List to append document metadata to
        """
        for link in soup.find_all('a'):
            if not link.has_attr('href'):
                continue

            href = link['href']
            link_text = link.get_text().strip()

            if not self._is_pdf_link(href):
                continue

            # Build full URL if needed
            full_url = self._resolve_url(page_url, href)

            # Skip malformed URLs and URLs already processed
            if full_url is None or full_url in self.processed_urls:
                continue

            self.processed_urls.add(full_url)

            # Extract year from the link text, falling back to the URL.
            # NOTE(review): a year taken from the URL may be an upload-directory
            # date (e.g. /uploads/2024/02/) rather than the document's own year.
            year = self._extract_year_from_text(link_text)
            if year is None:
                year = self._extract_year_from_text(href)

            # Skip if unknown year and skip_unknown_year is True
            if year is None and self.skip_unknown_year:
                print(f"Skipping document with unknown year: {link_text}")
                continue

            # Skip if older than min_year
            if year is not None and year < self.min_year:
                print(f"Skipping document from {year} (older than {self.min_year}): {link_text}")
                continue

            # Classification looks at the link text only, so generic labels such
            # as "Télécharger" end up in "autres_documents" and are skipped.
            doc_type = self._get_document_type(link_text)

            if doc_type == "autres_documents":
                print(f"Skipping document of type 'autres_documents': {link_text}")
                continue

            documents.append({
                'url': full_url,
                'title': link_text,
                'type': doc_type,
                'year': year,
                'filename': self._filename_from_href(href),
                'source_page': page_url,
                'found_at': datetime.now().isoformat()
            })
            print(f"Found document: {link_text} ({doc_type}, {year if year else 'Year unknown'})")

    def _download_file(self, url, save_path, doc):
        """
        Stream url to save_path and record the outcome on doc.

        The data is written to "<save_path>.part" and renamed only after the
        transfer completes, so an interrupted download never leaves a truncated
        file that a later run would report as already existing.

        Returns:
            True when the file was downloaded, False otherwise.
        """
        partial_path = save_path + '.part'
        try:
            # The context manager releases the streamed connection in every case.
            with self.session.get(url, stream=True, verify=self.verify_ssl,
                                  timeout=self.timeout) as response:
                response.raise_for_status()

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(partial_path, save_path)

        except Exception as e:
            print(f"Error downloading {url}: {e}")
            doc['download_status'] = 'failed'
            doc['error'] = str(e)
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing to clean up (the partial file was never created).
                pass
            return False

        doc['download_status'] = 'success'
        doc['local_path'] = save_path
        doc['downloaded_at'] = datetime.now().isoformat()
        doc.pop('error', None)  # drop a stale error left by an earlier failed attempt
        return True

    def download_from_metadata(self, metadata=None):
        """
        Download documents based on the collected metadata.

        The document dictionaries are updated in place with 'download_status'
        and, depending on the outcome, 'local_path', 'downloaded_at' or 'error'.
        A report is written to the results file.

        Args:
            metadata: Optional list of document dictionaries as returned by
                collect_metadata(), or the dictionary stored in the metadata
                file (its 'documents' entry is used). If None, load from the
                JSON metadata file.

        Returns:
            Number of documents downloaded
        """
        try:
            # Load metadata if not provided
            if metadata is None:
                if not os.path.exists(self.metadata_file):
                    print(f"Metadata file {self.metadata_file} not found. Run collect_metadata() first.")
                    return 0

                with open(self.metadata_file, 'r', encoding='utf-8') as f:
                    documents = json.load(f)['documents']
            elif isinstance(metadata, dict):
                documents = metadata['documents']
            else:
                documents = metadata

            download_count = 0
            download_results = []
            downloaded_urls = set()  # Track downloaded URLs to avoid duplicates

            for doc in documents:
                url = doc['url']
                doc_type = doc['type']

                # Skip documents categorized as "autres_documents"
                if doc_type == "autres_documents":
                    print(f"Skipping document of type 'autres_documents': {doc['title']}")
                    continue

                # The name originates from a remote href; keep only its final
                # component so it cannot point outside the type sub-folder.
                filename = os.path.basename(doc['filename'] or '')

                # Skip if already downloaded this URL
                if url in downloaded_urls:
                    print(f"Skipping duplicate URL: {url}")
                    doc['download_status'] = 'skipped_duplicate'
                    download_results.append(doc)
                    continue

                downloaded_urls.add(url)

                if filename in ('', '.', '..'):
                    print(f"Error downloading {url}: no usable filename")
                    doc['download_status'] = 'failed'
                    doc['error'] = 'no usable filename'
                    download_results.append(doc)
                    continue

                # Create type subfolder if it doesn't exist
                subfolder = os.path.join(self.output_folder, doc_type)
                os.makedirs(subfolder, exist_ok=True)

                # Full path for saving
                # NOTE(review): two different URLs with the same basename and type
                # share this path; the second is reported as already existing.
                save_path = os.path.join(subfolder, filename)

                # Skip if file already exists
                if os.path.exists(save_path):
                    print(f"File already exists: {save_path}")
                    doc['download_status'] = 'already_exists'
                    doc['local_path'] = save_path
                    download_results.append(doc)
                    continue

                print(f"Downloading: {filename} to {save_path}")
                if self._download_file(url, save_path, doc):
                    download_count += 1

                download_results.append(doc)

                # Wait to be polite to the server
                time.sleep(random.uniform(1, 3))

            # Save updated metadata with download results
            with open(self.results_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'downloaded_at': datetime.now().isoformat(),
                    # Every document except those of type "autres_documents":
                    # successes, failures, duplicates and already-present files.
                    'total_documents': len(download_results),
                    'successful_downloads': download_count,
                    'documents': download_results
                }, f, ensure_ascii=False, indent=2)

            print(f"Downloaded {download_count} out of {len(download_results)} documents.")
            print(f"Download results saved to {self.results_file}")

            return download_count

        except Exception as e:
            print(f"Error downloading from metadata: {e}")
            return 0

    def run_full_process(self):
        """
        Run the full process: collect metadata and then download documents.

        Returns:
            Number of documents downloaded (0 when no metadata was collected).
        """
        metadata = self.collect_metadata()
        if metadata:
            return self.download_from_metadata(metadata)
        return 0


# Example usage
if __name__ == "__main__":
    import urllib3

    # Certificate verification is switched off below, so silence urllib3's
    # InsecureRequestWarning.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    collector_settings = dict(
        docs_url="http://jibaya.tn/docs/",
        min_year=2021,
        output_folder="documents_fiscaux",
        metadata_file="jibaya_metadata.metajson",
        verify_ssl=False,
        skip_unknown_year=True,  # documents without a detectable year are ignored
    )
    collector = JibayaMetadataCollector(**collector_settings)

    # One-shot run: collect the metadata, then download what was found.
    collector.run_full_process()

    # Two-step alternative (use instead of the call above):
    #   metadata = collector.collect_metadata()   # step 1: metadata only
    #   collector.download_from_metadata()        # step 2: download from the saved metadata file

Accessing docs page: http://jibaya.tn/docs/
Following potential document page: Direction Générale des Impôts - https://jibaya.tn/dgi/
Following potential document page: Direction Générale des Etudes et de la Législation Fiscales - https://jibaya.tn/dgelf/
Following potential document page: Documentation Fiscale - https://jibaya.tn/documentation/
Following potential document page: Documents à Télécharger - https://jibaya.tn/formulaires-a-telecharger/
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of type 'autres_documents': Télécharger
Skipping document of typ

In [4]:
# Print the download report (jibaya_download_results.json) produced by download_from_metadata().
# NOTE(review): the absolute path assumes the notebook runs with /kaggle/working as its
# working directory — adjust when running elsewhere.
!cat "/kaggle/working/jibaya_download_results.json"

{
  "downloaded_at": "2025-04-07T11:21:23.376797",
  "total_documents": 284,
  "successful_downloads": 281,
  "documents": [
    {
      "url": "http://jibaya.tn/wp-content/uploads/2023/12/Liasse_fiscale-CCT-BANQUES-1.pdf",
      "title": "Secteur des banques et des établissements financiers",
      "type": "lois_de_finances",
      "year": 2023,
      "filename": "Liasse_fiscale-CCT-BANQUES-1.pdf",
      "source_page": "https://jibaya.tn/blog/depot-dematerialise-de-la-liasse-fiscale/",
      "found_at": "2025-04-07T10:39:22.419110",
      "download_status": "success",
      "local_path": "documents_fiscaux/lois_de_finances/Liasse_fiscale-CCT-BANQUES-1.pdf",
      "downloaded_at": "2025-04-07T11:08:32.801226"
    },
    {
      "url": "https://jibaya.tn/wp-content/uploads/2024/02/RECUEIL-DES-TEXTES-RELATIFS-AUX-DROITS-ET-TAXES-NON-INCORPORES-DANS-LES-CODES-FISCAUX-2023_compressed.pdf",
      "title": "RECUEIL-DES-TEXTES-RELATIFS-AUX-DROITS-ET-TAXES-NON-INCORPORES-DANS-LES-CODES-FISCAUX