In [30]:
import requests

# Reference endpoint queried for the domains whose level_i equals "0" (XML output).
url = "https://api.archives-ouvertes.fr/ref/domain/?wt=xml&q=level_i:%220%22"

# Single GET request; the raw response object stays available as `response`.
response = requests.get(url)

# Report the status code on anything other than HTTP 200, else dump the body.
if response.status_code != 200:
    print(f"Error: {response.status_code}")
else:
    print(response.text)

<?xml version="1.0" encoding="UTF-8"?>
<response>

<result name="response" numFound="13" start="0" numFoundExact="true">
  <doc>
    <str name="docid">1</str>
    <str name="label_s">chim = Chimie</str></doc>
  <doc>
    <str name="docid">16</str>
    <str name="label_s">info = Informatique [cs]</str></doc>
  <doc>
    <str name="docid">70</str>
    <str name="label_s">math = Mathématiques [math]</str></doc>
  <doc>
    <str name="docid">103</str>
    <str name="label_s">nlin = Science non linéaire [physics]</str></doc>
  <doc>
    <str name="docid">109</str>
    <str name="label_s">phys = Physique [physics]</str></doc>
  <doc>
    <str name="docid">357</str>
    <str name="label_s">qfin = Économie et finance quantitative [q-fin]</str></doc>
  <doc>
    <str name="docid">162</str>
    <str name="label_s">scco = Sciences cognitives</str></doc>
  <doc>
    <str name="docid">167</str>
    <str name="label_s">sde = Sciences de l'environnement</str></doc>
  <doc>
    <str name="docid">171</

## CODE VALIDE


In [31]:
# Import the necessary libraries
import requests
import xml.etree.ElementTree as ET
import numpy as np

In [32]:
# Define all functions

# Step 1: Fetch domains with their meanings
def fetch_domains():
    """Fetch the level-0 domains and return them as ``(code, meaning)`` tuples.

    The reference endpoint is queried for XML; every ``<doc>`` element is
    inspected for a ``<str name="label_s">`` child whose text has the form
    ``"code = meaning"``.

    Returns:
        list[tuple[str, str]]: One ``(code, meaning)`` pair per usable label,
        in document order. The list is empty when the HTTP status is not 200
        (an error line is printed in that case).
    """
    url = "https://api.archives-ouvertes.fr/ref/domain/?wt=xml&q=level_i:%220%22"
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error fetching domains: {response.status_code}")
        return []

    domains = []
    root = ET.fromstring(response.content)
    for doc in root.findall(".//doc"):
        label = doc.find(".//str[@name='label_s']")
        # An empty <str name="label_s"/> element has text None; skip it
        # instead of failing on the string operation below.
        if label is None or not label.text:
            continue
        # Split on the first " = " only, so a meaning that itself contains
        # " = " is kept intact rather than discarded.
        domain_code, separator, domain_meaning = label.text.partition(" = ")
        if separator:
            domains.append((domain_code, domain_meaning))
    return domains


#define a function to fetch data for a certain domain and a specific doc type
def fetch_data_1(domain_code, doc_type):
    """Query the search endpoint for one domain and one document type.

    The submission year filter is fixed to the range ``[2021 TO 2024]`` and
    at most two rows are requested (``rows`` is ``"2"``).

    Args:
        domain_code: Value matched against ``level0_domain_s``.
        doc_type: Value matched against ``docType_s``.

    Returns:
        The raw response body (``response.content``) on HTTP 200, otherwise
        ``None`` after printing an error line.
    """
    base_url = "https://api.archives-ouvertes.fr/search/"
    query_params = {
        "q": f"level0_domain_s:\"{domain_code}\"",
        # Each list entry is sent as its own `fq` parameter. Every entry needs
        # a trailing comma: adjacent string literals are concatenated by
        # Python, which would merge two filters into a single malformed one.
        "fq": [
            "submittedDateY_i:[2021 TO 2024]",
            "fr_abstract_s:[\"\" TO *]",
            "docType_s:[\"\" TO *]",
            "primaryDomain_s:[\"\" TO *]",
            "language_t:\"fr\"",
            f"docType_s:\"{doc_type}\"",
            "-fr_abstract_s:\"None\"",
        ],
        "fl": "docType_t,level0_domain_s,docid,title_s,fr_abstract_s",
        "rows": "2",
        "wt": "xml"
    }
    response = requests.get(base_url, params=query_params)

    # Progress trace: which combination was just requested.
    print(domain_code, doc_type)

    if response.status_code == 200:
        return response.content

    print(f"Error fetching data for domain {domain_code}: {response.status_code}")
    return None


#define a function to fetch data for a certain domain and a specific doc type and date

def fetch_data(domain_code, doc_type, date):
    """Query the search endpoint for one domain, document type and year.

    At most two rows are requested (``rows`` is ``"2"``).

    Args:
        domain_code: Value matched against ``level0_domain_s``.
        doc_type: Value matched against ``docType_s``.
        date: Value interpolated into the ``submittedDateY_i`` filter.

    Returns:
        The raw response body (``response.content``) on HTTP 200, otherwise
        ``None`` after printing an error line.
    """
    base_url = "https://api.archives-ouvertes.fr/search/"
    query_params = {
        "q": f"level0_domain_s:\"{domain_code}\"",
        # Each list entry is sent as its own `fq` parameter. Every entry needs
        # a trailing comma: adjacent string literals are concatenated by
        # Python, which would merge two filters into a single malformed one.
        "fq": [
            f"submittedDateY_i:{date}",
            "fr_abstract_s:[\"\" TO *]",
            "docType_s:[\"\" TO *]",
            "primaryDomain_s:[\"\" TO *]",
            "language_t:\"fr\"",
            f"docType_s:\"{doc_type}\"",
            "-fr_abstract_s:\"None\"",
        ],
        "fl": "docType_t,level0_domain_s,docid,title_s,fr_abstract_s",
        "rows": "2",
        "wt": "xml"
    }
    response = requests.get(base_url, params=query_params)

    # Progress trace: which combination was just requested.
    print(domain_code, doc_type, date)

    if response.status_code == 200:
        return response.content

    print(f"Error fetching data for domain {domain_code}: {response.status_code}")
    return None
    
    
# define a function with a list of document types to fetch data for a certain domain list and certain year list

def fetch_domain_doctype_data(domain_list, doc_types, dates):
    """Call ``fetch_data`` for every domain / document type / year combination.

    Every entry of ``domain_list`` is visited, including the last one.

    Args:
        domain_list: Iterable of entries whose first item is the domain code
            (for example the ``(code, meaning)`` tuples from ``fetch_domains``).
        doc_types: Iterable of document type codes.
        dates: Iterable of years.

    Returns:
        None. The values returned by ``fetch_data`` are not collected.
    """
    for domain in domain_list:
        domain_code = domain[0]
        for doc_type in doc_types:
            for date in dates:
                fetch_data(domain_code, doc_type, date)


In [33]:
# Document type codes to query.
doc_types = [
    "THESE", "COUV", "MEM",
    "ART", "REPORT", "COMM",
    "POSTER", "OTHER", "OUV",
]

# (code, meaning) pairs returned by the reference endpoint.
domains = fetch_domains()

# Years to query, held in a NumPy array.
dates = np.array((2021, 2022, 2023, 2024))


In [None]:
# Smoke test: first domain and first document type over the fixed 2021-2024 range.
fetch_data_1(domains[0][0], doc_types[0])

In [None]:
# Same domain and document type, restricted to the first year in `dates`.
fetch_data(domains[0][0], doc_types[0], dates[0])

In [None]:
# Same domain and document type, restricted to the second year in `dates`.
fetch_data(domains[0][0], doc_types[0], dates[1])

In [None]:
# Same domain and document type, restricted to the third year in `dates`.
fetch_data(domains[0][0], doc_types[0], dates[2])

In [None]:
# Same domain and document type, restricted to the fourth year in `dates`.
fetch_data(domains[0][0], doc_types[0], dates[3])

In [None]:
# dates[-1] is the last element of `dates`; with the four-element array
# defined in this notebook it is the same element as dates[3].
fetch_data(domains[0][0], doc_types[0], dates[-1])

## TESTS


In [None]:
# Test call: first domain and first document type over the fixed 2021-2024 range.
fetch_data_1(domains[0][0], doc_types[0])

In [None]:
# Number of (code, meaning) pairs held in `domains`.
print(len(domains))

In [None]:
# Each element of `domains` is a (code, meaning) tuple, so despite the loop
# variable's name this prints the whole tuple, not only the code.
for domain_code in domains:
    print(domain_code)

In [None]:
# Iterate over every domain (including the last one) and print the search
# results grouped by document type.
base_url = "https://api.archives-ouvertes.fr/search/"

for domain_code, _meaning in domains:
    query_params = {
        "q": f"level0_domain_s:\"{domain_code}\"",
        # Each list entry is sent as its own `fq` parameter. Every entry needs
        # a trailing comma: adjacent string literals are concatenated by
        # Python, which would merge two filters into a single malformed one.
        "fq": [
            "submittedDateY_i:[2020 TO *]",
            "fr_abstract_s:[\"\" TO *]",
            "docType_s:[\"\" TO *]",
            "primaryDomain_s:[\"\" TO *]",
            "language_t:\"fr\"",
            "-fr_abstract_s:\"None\"",
            "docType_s:(\"THESE\" OR \"COUV\" OR \"MEM\" OR \"ART\" OR \"REPORT\" OR \"COMM\" OR \"POSTER\" OR \"OTHER\" OR \"OUV\")",
        ],
        "fl": "docType_t,level0_domain_s,docid,title_s,fr_abstract_s",
        "group": "true",
        "group.field": "docType_s",
        "indent": "true",
        "wt": "xml"
    }

    response = requests.get(base_url, params=query_params)

    # Print the response content
    if response.status_code == 200:
        print(response.text)
    else:
        print(f"Error: {response.status_code}")

    # Trace which domain the output above belongs to.
    print(domain_code)
        

In [None]:
######FAVORITE CODE PORTION FOR NOW --> GETTING WORKED ON##########

# Step 1: Fetch domains with their meanings
def fetch_domains():
    """Fetch the level-0 domains and return them as ``(code, meaning)`` tuples.

    Note: this cell redefines ``fetch_domains``, which is also defined
    earlier in the notebook; whichever definition ran last is the active one.

    The reference endpoint is queried for XML; every ``<doc>`` element is
    inspected for a ``<str name="label_s">`` child whose text has the form
    ``"code = meaning"``.

    Returns:
        list[tuple[str, str]]: One ``(code, meaning)`` pair per usable label,
        in document order. The list is empty when the HTTP status is not 200
        (an error line is printed in that case).
    """
    url = "https://api.archives-ouvertes.fr/ref/domain/?wt=xml&q=level_i:%220%22"
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error fetching domains: {response.status_code}")
        return []

    domains = []
    root = ET.fromstring(response.content)
    for doc in root.findall(".//doc"):
        label = doc.find(".//str[@name='label_s']")
        # An empty <str name="label_s"/> element has text None; skip it
        # instead of failing on the string operation below.
        if label is None or not label.text:
            continue
        # Split on the first " = " only, so a meaning that itself contains
        # " = " is kept intact rather than discarded.
        domain_code, separator, domain_meaning = label.text.partition(" = ")
        if separator:
            domains.append((domain_code, domain_meaning))
    return domains

# Retrieve the domains
domains = fetch_domains()

base_url = "https://api.archives-ouvertes.fr/search/"

# Index 12 selects the thirteenth entry of `domains`.
domain_code = domains[12][0]

query_params = {
    "q": f"level0_domain_s:\"{domain_code}\"",
    # Each list entry is sent as its own `fq` parameter. Every entry needs a
    # trailing comma: adjacent string literals are concatenated by Python,
    # which would merge two filters into a single malformed one.
    "fq": [
        "submittedDateY_i:[2020 TO *]",
        "fr_abstract_s:[\"\" TO *]",
        "docType_s:[\"\" TO *]",
        "primaryDomain_s:[\"\" TO *]",
        "language_t:\"fr\"",
        "-fr_abstract_s:\"None\"",
        "docType_s:(\"THESE\" OR \"COUV\" OR \"MEM\" OR \"ART\" OR \"REPORT\" OR \"COMM\" OR \"POSTER\" OR \"OTHER\" OR \"OUV\")",
    ],
    "fl": "docType_t,level0_domain_s,docid,title_s,fr_abstract_s",
    "group": "true",
    "group.field": "docType_s",
    "indent": "true",
    "wt": "xml"
    }

response = requests.get(base_url, params=query_params)

# Print the response content
if response.status_code == 200:
    print(response.text)
else:
    print(f"Error: {response.status_code}")

# Trace which domain the output above belongs to.
print(domain_code)

## SAVING AND CURLING DATA


In [64]:
import os
import subprocess


def fetch_data_curl(domain_code, doc_type, date):
    """Download one search result set with curl into ``<date>/<doc_type>/<domain_code>/results.json``.

    The output directory is created relative to the current working
    directory if it does not exist yet. Up to 100 rows are requested in JSON
    format.

    Args:
        domain_code: Value matched against ``level0_domain_s``; also the
            innermost directory name.
        doc_type: Value matched against ``docType_s``; also the middle
            directory name.
        date: Value matched against ``submittedDateY_i``; also the top-level
            directory name.

    Returns:
        None. A status line is printed: success only when curl exits with
        code 0, otherwise a failure message.
    """
    # Create the output directory based on year, doc type, and domain
    output_directory = f"{date}/{doc_type}/{domain_code}"
    os.makedirs(output_directory, exist_ok=True)
    output_file = f"{output_directory}/results.json"

    # Construct the API URL
    url = (
        f"https://api.archives-ouvertes.fr/search/?q=level0_domain_s:\"{domain_code}\""
        f"&fq=submittedDateY_i:{date}"
        f"&fq=docType_s:\"{doc_type}\""
        "&fq=language_t:\"fr\""
        "&fq=-fr_abstract_s:\"\""
        "&fq=-fr_abstract_s:\"None\""
        "&fq=fr_abstract_s:*"
        "&fl=docType_t,level0_domain_s,docid,title_s,fr_abstract_s"
        "&rows=100&wt=json"
    )

    # Pass the arguments as a list so no shell is involved: the URL and the
    # output path reach curl verbatim, whatever characters they contain.
    curl_command = ["curl", "-s", url, "-o", output_file]

    try:
        completed = subprocess.run(curl_command)
    except FileNotFoundError:
        # Without a shell, a missing curl binary surfaces as an exception;
        # report it and carry on rather than aborting a whole batch.
        print(f"curl executable not found; nothing saved to {output_file}")
        return

    if completed.returncode == 0:
        print(f"Data saved to {output_file}")
    else:
        print(f"curl exited with code {completed.returncode}; {output_file} may be missing or incomplete")

In [65]:
def fetch_domain_doctype_data_curl(domain_list, doc_types, dates):
    """Run ``fetch_data_curl`` once per domain / document type / year combination.

    Args:
        domain_list: Iterable of two-item entries; the first item is the
            domain code, the second is ignored.
        doc_types: Iterable of document type codes.
        dates: Iterable of years.

    Returns:
        None.
    """
    # Lazily enumerate the combinations: domains outermost, years innermost.
    combinations = (
        (code, kind, year)
        for code, _meaning in domain_list
        for kind in doc_types
        for year in dates
    )
    for code, kind, year in combinations:
        fetch_data_curl(code, kind, year)

In [66]:
# Download a results.json file for every domain / document type / year combination.
fetch_domain_doctype_data_curl(domains, doc_types, dates)

Data saved to 2021/THESE/chim/results.json
Data saved to 2022/THESE/chim/results.json
Data saved to 2023/THESE/chim/results.json
Data saved to 2024/THESE/chim/results.json
Data saved to 2021/COUV/chim/results.json
Data saved to 2022/COUV/chim/results.json
Data saved to 2023/COUV/chim/results.json
Data saved to 2024/COUV/chim/results.json
Data saved to 2021/MEM/chim/results.json
Data saved to 2022/MEM/chim/results.json
Data saved to 2023/MEM/chim/results.json
Data saved to 2024/MEM/chim/results.json
Data saved to 2021/ART/chim/results.json
Data saved to 2022/ART/chim/results.json
Data saved to 2023/ART/chim/results.json
Data saved to 2024/ART/chim/results.json
Data saved to 2021/REPORT/chim/results.json
Data saved to 2022/REPORT/chim/results.json
Data saved to 2023/REPORT/chim/results.json
Data saved to 2024/REPORT/chim/results.json
Data saved to 2021/COMM/chim/results.json
Data saved to 2022/COMM/chim/results.json
Data saved to 2023/COMM/chim/results.json
Data saved to 2024/COMM/chim/r