## CODE VALIDE


In [1]:
# Import the necessary libraries
import os
import subprocess
import requests
import json
import xml.etree.ElementTree as ET

In [2]:
# Define all functions

# Step 1: Fetch domains with their meanings
def fetch_domains(timeout=30):
    """Fetch the level-0 domains and their meanings from the domain reference API.

    Queries the archives-ouvertes.fr ``ref/domain`` endpoint (XML output,
    ``level_i:"0"``) and parses every ``label_s`` value shaped like
    ``"code = meaning"``. Labels that do not split into exactly two parts on
    ``" = "`` are ignored.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(domain_code, domain_meaning)`` tuples. The list is empty
        when the request fails, the status is not 200, or the body is not
        valid XML; the problem is printed in each case.
    """
    url = "https://api.archives-ouvertes.fr/ref/domain/?wt=xml&q=level_i:%220%22"

    # Without a timeout a stalled connection would block the notebook forever.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching domains: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching domains: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error fetching domains: {e}")
        return []

    domains = []
    for domain in root.findall(".//doc"):
        label = domain.find(".//str[@name='label_s']")
        # An empty <str/> element has text None; skip it instead of crashing.
        if label is None or not label.text:
            continue
        domain_info = label.text.split(" = ")
        if len(domain_info) == 2:
            domain_code, domain_meaning = domain_info
            domains.append((domain_code, domain_meaning))
    return domains


def fetch_test_data_curl_2021(domain_code):
    """Fetch up to 50 French article records submitted in 2021 for one domain.

    The search API response (JSON) is saved by ``curl`` to
    ``Test_pos/2021/<domain_code>/test_data_pos_ART_2021.json``. The query
    keeps documents of type ``ART`` in language ``fr`` whose French abstract
    is present and is neither empty nor the string ``"None"``.

    Args:
        domain_code: Value matched against ``level0_domain_s``; also used as
            the name of the output sub-directory.
    """
    # Create the output directory based on domain
    output_directory = f"Test_pos/2021/{domain_code}"
    os.makedirs(output_directory, exist_ok=True)
    output_file = f"{output_directory}/test_data_pos_ART_2021.json"

    # Construct the API URL
    url = (
        f'https://api.archives-ouvertes.fr/search/?q=level0_domain_s:"{domain_code}"'
        '&fq=docType_s:"ART"'
        '&fq=language_t:"fr"'
        '&fq=-fr_abstract_s:""'
        '&fq=-fr_abstract_s:"None"'
        '&fq=fr_abstract_s:*'
        # '&fq=fileMain_s:*'  (filter currently disabled)
        '&fq=submittedDateY_i:2021'
        '&fl=docType_t,level0_domain_s,docid,title_s,fr_abstract_s,fileMain_s'
        '&rows=50&wt=json'
    )

    # Pass the arguments as a list (no shell) so that quotes, spaces or shell
    # metacharacters in domain_code cannot alter or break the command.
    curl_command = ["curl", "-s", url, "-o", output_file]

    try:
        result = subprocess.run(curl_command, check=False)
    except OSError as e:
        # Raised e.g. when the curl executable is not installed.
        print(f"Could not run curl for {domain_code}: {e}")
        return

    # Only claim success when curl actually exited cleanly.
    if result.returncode != 0:
        print(f"curl failed for {domain_code} (exit code {result.returncode})")
        return
    print(f"Data saved to {output_file}")


def fetch_test_data_curl_2022(domain_code):
    """Fetch up to 50 French article records submitted in 2022 for one domain.

    The search API response (JSON) is saved by ``curl`` to
    ``Test_pos/2022/<domain_code>/test_data_pos_ART_2022.json``. The query
    keeps documents of type ``ART`` in language ``fr`` whose French abstract
    is present and is neither empty nor the string ``"None"``, and excludes
    records whose submission month is 12.

    Args:
        domain_code: Value matched against ``level0_domain_s``; also used as
            the name of the output sub-directory.
    """
    # Create the output directory based on domain
    output_directory = f"Test_pos/2022/{domain_code}"
    os.makedirs(output_directory, exist_ok=True)
    output_file = f"{output_directory}/test_data_pos_ART_2022.json"

    # Construct the API URL
    url = (
        f'https://api.archives-ouvertes.fr/search/?q=level0_domain_s:"{domain_code}"'
        '&fq=docType_s:"ART"'
        '&fq=language_t:"fr"'
        '&fq=-fr_abstract_s:""'
        '&fq=-fr_abstract_s:"None"'
        '&fq=fr_abstract_s:*'
        # '&fq=fileMain_s:*'  (filter currently disabled)
        '&fq=submittedDateY_i:2022'
        '&fq=-submittedDateM_i:12'
        '&fl=docType_t,level0_domain_s,docid,title_s,fr_abstract_s,fileMain_s'
        '&rows=50&wt=json'
    )

    # Pass the arguments as a list (no shell) so that quotes, spaces or shell
    # metacharacters in domain_code cannot alter or break the command.
    curl_command = ["curl", "-s", url, "-o", output_file]

    try:
        result = subprocess.run(curl_command, check=False)
    except OSError as e:
        # Raised e.g. when the curl executable is not installed.
        print(f"Could not run curl for {domain_code}: {e}")
        return

    # Only claim success when curl actually exited cleanly.
    if result.returncode != 0:
        print(f"curl failed for {domain_code} (exit code {result.returncode})")
        return
    print(f"Data saved to {output_file}")


def fetch_domain_test_data_curl_2021(domain_list):
    """Run the 2021 curl fetch once for every domain in ``domain_list``.

    Each item must be a two-element pair; the first element is the domain
    code and the second element is ignored.
    """
    for pair in domain_list:
        code, _meaning = pair
        fetch_test_data_curl_2021(code)

def fetch_domain_test_data_curl_2022(domain_list):
    """Run the 2022 curl fetch once for every domain in ``domain_list``.

    Each item must be a two-element pair; the first element is the domain
    code and the second element is ignored.
    """
    for pair in domain_list:
        code, _meaning = pair
        fetch_test_data_curl_2022(code)



In [3]:

# Retrieve the domains
# Both calls query the same endpoint; domains_1 feeds the 2021 fetch loop
# and domains_2 feeds the 2022 fetch loop.
domains_1 = fetch_domains()
domains_2 = fetch_domains() 


## SAVING AND CURLING DATA


In [None]:
# Fetch and save the 2021 records for every (code, meaning) pair in domains_1
fetch_domain_test_data_curl_2021(domains_1)

In [None]:
# Fetch and save the 2022 records for every (code, meaning) pair in domains_2
fetch_domain_test_data_curl_2022(domains_2)

### CLEAN AND FILTER RESULTS TO KEEP ONLY THOSE WITH A URL

In [4]:
import os
import json

def _select_docs_with_file(docs, max_docs):
    """Return the first ``max_docs`` docs that have a 'fileMain_s' key, one per docid."""
    seen_docids = set()
    selected = []
    for doc in docs:
        if max_docs is not None and len(selected) >= max_docs:
            break
        docid = doc.get('docid')
        if not docid or docid in seen_docids:
            continue
        # A docid only counts as seen once a doc with a file was kept for it.
        if 'fileMain_s' in doc:
            selected.append(doc)
            seen_docids.add(docid)
    return selected


def clean_and_filter_results(base_directory, output_directory, max_docs=20):
    """Copy search results, keeping only unique documents that have a file URL.

    Walks ``base_directory/<date>/<domain_code>/*.json``. For every JSON file
    it keeps the documents under ``response.docs`` that have a truthy
    ``docid`` not kept before and a ``fileMain_s`` key, up to ``max_docs``
    per file. ``response.docs`` and ``response.numFound`` are replaced by the
    filtered list and its length, and the result is written to
    ``output_directory/<date>/<domain_code>/<same filename>``.

    Files that cannot be read, contain no documents, or contain no document
    with ``fileMain_s`` are reported on stdout and skipped.

    Args:
        base_directory: Root directory holding the raw results.
        output_directory: Root directory that receives the filtered copies.
        max_docs: Maximum number of documents kept per file; ``None`` keeps
            every matching document.
    """
    # Iterate over years (e.g., 2021, 2022)
    for date in os.listdir(base_directory):
        date_path = os.path.join(base_directory, date)
        if not os.path.isdir(date_path):
            continue

        # Iterate over domain codes (e.g., shs, sdv, spi)
        for domain_code in os.listdir(date_path):
            domain_path = os.path.join(date_path, domain_code)
            if not os.path.isdir(domain_path):
                continue

            # Process each JSON file in the domain folder
            for filename in os.listdir(domain_path):
                if not filename.endswith('.json'):
                    continue

                file_path = os.path.join(domain_path, filename)

                # Read as UTF-8 explicitly: the platform default encoding
                # would garble accented text on some systems.
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
                    print(f"Error reading {filename}: {e}")
                    continue

                # Tolerate payloads that are not the expected
                # {"response": {"docs": [...]}} shape.
                response = data.get('response') if isinstance(data, dict) else None
                docs = response.get('docs') if isinstance(response, dict) else None
                if not docs:
                    print(f"No documents found in {filename}. Skipping.")
                    continue

                filtered_docs = _select_docs_with_file(docs, max_docs)
                if not filtered_docs:
                    print(f"No valid results with 'fileMain_s' in {filename}")
                    continue

                # Update and save the cleaned results
                response['docs'] = filtered_docs
                response['numFound'] = len(filtered_docs)

                output_folder = os.path.join(output_directory, date, domain_code)
                os.makedirs(output_folder, exist_ok=True)

                output_file = os.path.join(output_folder, filename)
                try:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(data, f, indent=4)
                    print(f"Updated and saved: {output_file}")
                except IOError as e:
                    print(f"Error writing to {output_file}: {e}")





In [None]:
# Set the base and output directories
# base_dir holds the raw curl results laid out as Test_pos/<year>/<domain>/;
# the filtered copies are written under output_dir with the same layout.
base_dir = 'Test_pos'
output_dir = 'Cleaned_Test_pos'
clean_and_filter_results(base_dir, output_dir)

### DOWNLOAD DOCUMENTS

In [5]:
import os
import requests

def download_document(url, save_path, timeout=60):
    """Download ``url`` and write the response body to ``save_path``.

    Failures are reported on stdout instead of being raised, so one bad
    document does not stop a batch of downloads.

    Args:
        url: Address of the document to fetch.
        save_path: Destination file path; it is opened in binary write mode.
        timeout: Seconds to wait for the server before giving up.
    """
    # Without a timeout a single unresponsive server would block the whole run.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download from {url}: {e}")
        return

    # Disk errors (missing directory, permissions, full disk) are reported
    # the same way as network errors.
    try:
        with open(save_path, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(f"Failed to save {save_path}: {e}")
        return
    print(f"Downloaded: {save_path}")

def download_documents(input_directory, download_directory):
    """Download the main file of every document listed in the JSON result files.

    Walks ``input_directory`` recursively, reads each ``*.json`` file, and for
    every entry of ``response.docs`` that has a ``fileMain_s`` URL saves the
    file as ``<download_directory>/<docid>.pdf`` (all files in one flat
    directory).

    Args:
        input_directory: Root directory containing the JSON result files.
        download_directory: Directory that receives the downloaded files; it
            is created if missing.
    """
    # Imported locally: this cell uses json but only imports os and requests.
    import json

    os.makedirs(download_directory, exist_ok=True)

    for root, _, files in os.walk(input_directory):
        for filename in files:
            if not filename.endswith('.json'):
                continue

            file_path = os.path.join(root, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError, OSError):
                print(f"Failed to read {filename}. Skipping.")
                continue

            docs = data.get('response', {}).get('docs', [])
            for doc in docs:
                docid = doc.get('docid')
                file_url = doc.get('fileMain_s')

                if not file_url:
                    continue
                # Without a docid every such file would be saved as "None.pdf"
                # and overwrite the previous one.
                if docid is None:
                    print(f"Document without 'docid' in {filename}. Skipping.")
                    continue

                save_path = os.path.join(download_directory, f"{docid}.pdf")
                download_document(file_url, save_path)





In [None]:
# Example usage
# Reads the filtered JSON files under input_dir and saves each document's
# main file as <docid>.pdf directly inside download_dir.
input_dir = 'Cleaned_Test_pos'
download_dir = 'Downloaded_Documents'
download_documents(input_dir, download_dir)

In [6]:
import os
import json
import requests

def download_document(url, save_path, timeout=60):
    """Download ``url`` and write the response body to ``save_path``.

    Failures are reported on stdout instead of being raised, so one bad
    document does not stop a batch of downloads.

    Args:
        url: Address of the document to fetch.
        save_path: Destination file path; it is opened in binary write mode.
        timeout: Seconds to wait for the server before giving up.
    """
    # Without a timeout a single unresponsive server would block the whole run.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download from {url}: {e}")
        return

    # Disk errors (missing directory, permissions, full disk) are reported
    # the same way as network errors.
    try:
        with open(save_path, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(f"Failed to save {save_path}: {e}")
        return
    print(f"Downloaded: {save_path}")

def download_documents(input_directory, download_directory):
    """Download each document's main file, mirroring the input directory layout.

    Walks ``input_directory`` recursively, reads each ``*.json`` file, and for
    every entry of ``response.docs`` that has a ``fileMain_s`` URL saves the
    file as ``<download_directory>/<sub-path>/<docid>.pdf``, where
    ``<sub-path>`` is the JSON file's directory relative to
    ``input_directory`` (``<year>/<domain_code>`` for the layout produced by
    the cleaning step).

    Args:
        input_directory: Root directory containing the JSON result files.
        download_directory: Root directory that receives the downloads.
    """
    for root, _, files in os.walk(input_directory):
        # Derive the mirrored output directory once per folder. Using the path
        # relative to input_directory (rather than the last two components of
        # root) also works for JSON files sitting directly in the input root.
        relative_dir = os.path.relpath(root, input_directory)
        output_path = os.path.normpath(os.path.join(download_directory, relative_dir))

        for filename in files:
            if not filename.endswith('.json'):
                continue

            file_path = os.path.join(root, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError, OSError):
                print(f"Failed to read {filename}. Skipping.")
                continue

            docs = data.get('response', {}).get('docs', [])
            for doc in docs:
                docid = doc.get('docid')
                file_url = doc.get('fileMain_s')

                if not file_url:
                    continue
                # Without a docid every such file would be saved as "None.pdf"
                # and overwrite the previous one.
                if docid is None:
                    print(f"Document without 'docid' in {filename}. Skipping.")
                    continue

                # Create the corresponding output directory only when there is
                # something to download into it.
                os.makedirs(output_path, exist_ok=True)

                save_path = os.path.join(output_path, f"{docid}.pdf")
                download_document(file_url, save_path)




In [None]:
# Example usage
# Reads the filtered JSON files under input_dir and saves each document's
# main file under download_dir/<year>/<domain_code>/<docid>.pdf.
input_dir = 'Cleaned_Test_pos'
download_dir = 'Downloaded_Documents'
download_documents(input_dir, download_dir)

### TEST NOUGAT 

In [7]:
!pip install -q pymupdf python-Levenshtein nltk

In [8]:
!pip install -q git+https://github.com/huggingface/transformers.git

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/huggingface/transformers.git[0m[32m [0m[32m/private/var/folders/0d/c55v27zd62j542ptl29_r8840000gq/T/[0m[32mpip-req-build-bcp5wzxd[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m [31m[5 lines of output][0m
  [31m   [0m error: RPC failed; curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)
  [31m   [0m error: 2146 bytes of body are still expected
  [31m   [0m fetch-pack: unexpected disconnect while reading sideband packet
  [31m   [0m fatal: early EOF
  [31m   [0m fatal: fetch-pack: invalid index-pack output
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mgit clone --[0m[32mfilter[

In [2]:
import os
import fitz  # PyMuPDF
import torch
from transformers import AutoProcessor, VisionEncoderDecoderModel
from huggingface_hub import snapshot_download
import re
from PIL import Image
import io

# Download the NOUGAT model locally
# The value returned by snapshot_download is used below as the local
# directory to load from.
local_dir = snapshot_download(repo_id="facebook/nougat-small")

# Load the model and processor from the local directory
processor = AutoProcessor.from_pretrained(local_dir)
model = VisionEncoderDecoderModel.from_pretrained(local_dir)
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

### EXTRACT CONTENT 

In [None]:
import os
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load NOUGAT model from Hugging Face
# NOTE(review): this rebinds the global `model`; an earlier cell bound it to a
# VisionEncoderDecoderModel loaded from "facebook/nougat-small".
# NOTE(review): that earlier cell loads Nougat with AutoProcessor and
# VisionEncoderDecoderModel — confirm that AutoModelForSeq2SeqLM can load
# "facebook/nougat-base" at all.
model_name = "facebook/nougat-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def extract_content_with_nougat(file_path):
    """Extract a PDF's text, run it through the loaded model, and drop abstract lines.

    The text of every page is read with pdfplumber, concatenated, cut into
    512-character chunks, and each chunk is tokenized, passed to
    ``model.generate`` and decoded with the module-level ``tokenizer``. The
    decoded chunks are joined with newlines.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The processed text, or ``""`` when any step raises (the error is
        printed).
    """
    # Imported locally: this cell does not import `re` itself, so the call
    # below would otherwise depend on another cell having been run first.
    import re

    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text are skipped.
        content = "".join(text + "\n" for text in page_texts if text)

        # Split content into smaller chunks for NOUGAT
        # (chunk_size counts characters, not tokens).
        # NOTE(review): the chunks fed to the model are plain text; confirm the
        # Nougat checkpoint accepts text input rather than page images.
        chunk_size = 512
        chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

        decoded_chunks = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs)
            decoded_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        extracted_text = "".join(decoded + "\n" for decoded in decoded_chunks)

        # Remove "Résumé" or "Abstract" sections
        # The pattern deletes, case-insensitively, everything from each
        # occurrence of either word to the end of that line (newline included).
        # NOTE(review): it removes only that line, not the following lines.
        extracted_text = re.sub(r"(?i)(Résumé|Abstract)[:\-]*[^\n]*(\n|$)", "", extracted_text)

        return extracted_text

    except Exception as e:
        # Best-effort: one unreadable PDF must not stop the batch.
        print(f"Failed to process {file_path}: {e}")
        return ""

def process_pdfs_with_nougat(input_directory, output_directory):
    """Process every PDF under ``input_directory`` and save the extracted text.

    Walks ``input_directory`` recursively, calls
    ``extract_content_with_nougat`` on each ``*.pdf`` file and, when it
    returns non-empty text, writes it to
    ``<output_directory>/<sub-path>/<pdf name without extension>_content.txt``,
    where ``<sub-path>`` is the PDF's directory relative to
    ``input_directory``.

    Args:
        input_directory: Root directory containing the PDF files.
        output_directory: Root directory that receives the text files.
    """
    for root, _, files in os.walk(input_directory):
        # Maintain directory structure. The path relative to input_directory
        # is used (instead of the last two components of root) so PDFs stored
        # directly in the input root do not raise IndexError.
        relative_dir = os.path.relpath(root, input_directory)
        output_path = os.path.normpath(os.path.join(output_directory, relative_dir))

        for filename in files:
            if not filename.endswith('.pdf'):
                continue

            file_path = os.path.join(root, filename)
            cleaned_content = extract_content_with_nougat(file_path)
            # An empty string means extraction failed or produced nothing.
            if not cleaned_content:
                continue

            os.makedirs(output_path, exist_ok=True)

            stem = os.path.splitext(filename)[0]
            output_file = os.path.join(output_path, f"{stem}_content.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)
            print(f"Processed and saved content to: {output_file}")

# Example usage
# NOTE: rebinds the notebook-level names input_dir and output_dir used by
# earlier cells.
input_dir = 'Downloaded_Documents'
output_dir = 'Processed_Documents'
process_pdfs_with_nougat(input_dir, output_dir)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load NOUGAT model from Hugging Face
# NOTE(review): this rebinds the global `model`; an earlier cell bound it to a
# VisionEncoderDecoderModel loaded from "facebook/nougat-small".
# NOTE(review): that earlier cell loads Nougat with AutoProcessor and
# VisionEncoderDecoderModel — confirm that AutoModelForSeq2SeqLM can load
# "facebook/nougat-base" at all.
model_name = "facebook/nougat-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def extract_content_with_nougat(file_path):
    """Extract a PDF's text, run it through the loaded model, and drop abstract lines.

    The text of every page is read with pdfplumber, concatenated, cut into
    512-character chunks, and each chunk is tokenized, passed to
    ``model.generate`` and decoded with the module-level ``tokenizer``. The
    decoded chunks are joined with newlines.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The processed text, or ``""`` when any step raises (the error is
        printed).
    """
    # Imported locally: this cell does not import `re` itself, so the call
    # below would otherwise depend on another cell having been run first.
    import re

    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text are skipped.
        content = "".join(text + "\n" for text in page_texts if text)

        # Split content into smaller chunks for NOUGAT
        # (chunk_size counts characters, not tokens).
        # NOTE(review): the chunks fed to the model are plain text; confirm the
        # Nougat checkpoint accepts text input rather than page images.
        chunk_size = 512
        chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

        decoded_chunks = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs)
            decoded_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        extracted_text = "".join(decoded + "\n" for decoded in decoded_chunks)

        # Remove "Résumé" or "Abstract" sections
        # The pattern deletes, case-insensitively, everything from each
        # occurrence of either word to the end of that line (newline included).
        # NOTE(review): it removes only that line, not the following lines.
        extracted_text = re.sub(r"(?i)(Résumé|Abstract)[:\-]*[^\n]*(\n|$)", "", extracted_text)

        return extracted_text

    except Exception as e:
        # Best-effort: one unreadable PDF must not stop the batch.
        print(f"Failed to process {file_path}: {e}")
        return ""

def process_pdfs_with_nougat(input_directory, output_directory):
    """Process every PDF under ``input_directory`` and save the extracted text.

    Walks ``input_directory`` recursively, calls
    ``extract_content_with_nougat`` on each ``*.pdf`` file and, when it
    returns non-empty text, writes it to
    ``<output_directory>/<sub-path>/<pdf name without extension>_content.txt``,
    where ``<sub-path>`` is the PDF's directory relative to
    ``input_directory``.

    Args:
        input_directory: Root directory containing the PDF files.
        output_directory: Root directory that receives the text files.
    """
    for root, _, files in os.walk(input_directory):
        # Maintain directory structure. The path relative to input_directory
        # is used (instead of the last two components of root) so PDFs stored
        # directly in the input root do not raise IndexError.
        relative_dir = os.path.relpath(root, input_directory)
        output_path = os.path.normpath(os.path.join(output_directory, relative_dir))

        for filename in files:
            if not filename.endswith('.pdf'):
                continue

            file_path = os.path.join(root, filename)
            cleaned_content = extract_content_with_nougat(file_path)
            # An empty string means extraction failed or produced nothing.
            if not cleaned_content:
                continue

            os.makedirs(output_path, exist_ok=True)

            stem = os.path.splitext(filename)[0]
            output_file = os.path.join(output_path, f"{stem}_content.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)
            print(f"Processed and saved content to: {output_file}")

# Example usage
# NOTE: rebinds the notebook-level names input_dir and output_dir used by
# earlier cells.
input_dir = 'Downloaded_Documents'
output_dir = 'Processed_Documents'
process_pdfs_with_nougat(input_dir, output_dir)


  from .autonotebook import tqdm as notebook_tqdm
