In [1]:
import pandas as pd
from googlesearch import search # pip install googlesearch-python
import requests
import urllib3
import json
import time
import random

In [2]:
file_path = "col_ies.xlsx"
data = pd.read_excel(file_path, header=None)
ies_list = data.iloc[:, 0].dropna().tolist()

len(ies_list), ies_list[0]

(326, 'UNIVERSIDAD NACIONAL DE COLOMBIA')

In [3]:
ies_list_test = ies_list[0:10]

In [4]:
ies_list_test

['UNIVERSIDAD NACIONAL DE COLOMBIA',
 'UNIVERSIDAD PEDAGOGICA NACIONAL',
 'UNIVERSIDAD PEDAGOGICA Y TECNOLOGICA DE COLOMBIA - UPTC',
 'UNIVERSIDAD DEL CAUCA',
 'UNIVERSIDAD TECNOLOGICA DE PEREIRA - UTP',
 'UNIVERSIDAD DE CALDAS',
 'UNIVERSIDAD DE CORDOBA',
 'UNIVERSIDAD SURCOLOMBIANA',
 'UNIVERSIDAD DE LA AMAZONIA',
 'UNIVERSIDAD MILITAR-NUEVA GRANADA']

In [5]:
import urllib3
from googlesearch import search
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse


found_dspace = []
not_found_dspace = set()

# Disable warnings for insecure requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def reposity_search(query, lang="es", num=5, pause=1):
    try:
        return search(query, lang=lang, num_results=num, sleep_interval=pause)
    except Exception as e:
        raise RuntimeError(f"Error performing search for query '{query}': {e}")

def reposity_urls(ies_list):
    # time.sleep(random.uniform(0, 1))
    results = []
    for i, ie in enumerate(ies_list):
        entry = {
            "institution": ie,
            "results": []}
        ie_query = f'{ie.lower().replace(" ", "+")}+dspace'
        try:
            entry["results"] = list(reposity_search(ie_query))
            if entry["results"]:
                print(f"{i} - {entry['institution'].title()} - urls candidates for DSPACE repository have been found.")
            results.append(entry)
        except Exception as e:
            print(f"Error while searching '{ie_query}': {e}")
    return results

def get_ror_id(name):
    url = f"https://api.ror.org/organizations?query={name}"
    
    response = requests.get(url, timeout=240)
    
    if response.status_code == 200:
        data = response.json()
        
        if data.get("items"):
            ror_id = data["items"][0]["id"]
            return ror_id
        else:
            #if for that org name not found
            return "Error: org id not found"
    else:
        return f"Error: {response.status_code}"

def has_oai(url, max_retries=3, timeout=180):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    suffixes = [
        "server/oai/request?verb=Identify",
        "oai/request?verb=Identify",
    ]
    for suffix in suffixes:
        retries = 0
        while retries < max_retries:
            try:
                request_url = url.rstrip("/") + "/" + suffix
                print(request_url)
                req = requests.get(request_url + suffix, verify=False, headers=headers, timeout=180)
                soup = BeautifulSoup(req.text, features="xml")
                namespace = soup.find("OAI-PMH")
                if req.status_code == 200 and namespace:
                    return True
            except:
                pass
            retries += 1
    return False

def has_oai_identifiers(url, max_retries=2, timeout=180):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    # Possible OAI-PMH suffixes to test
    suffixes = [
        "server/oai/request?verb=ListIdentifiers&metadataPrefix=oai_dc",
        "oai/request?verb=ListIdentifiers&metadataPrefix=oai_dc",
    ]
    for suffix in suffixes:
        # Construct the full URL, ensuring we don't duplicate slashes
        request_url = url.rstrip("/") + "/" + suffix
        retries = 0
        while retries < max_retries:
            try:
                # Send a GET request with specified headers and timeout
                response = requests.get(
                    request_url,
                    verify=False,
                    headers=headers,
                    timeout=180
                )
                if response.status_code == 200:
                    # Parse the XML response with BeautifulSoup
                    soup = BeautifulSoup(response.text, "xml")
                    namespace = soup.find("OAI-PMH")
                    # If <OAI-PMH> is found, we assume it's a valid OAI-PMH endpoint
                    if namespace is not None:
                        return suffix
            except requests.RequestException:
                pass
            retries += 1
    return None

def clean_url(url):
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}"

def dspace_validation(candidates_urls):
    print(f"\nStarting repository validation: \n")
    endpoints = {}
    for i, candidate in enumerate(candidates_urls):
        for raw_url in candidate["results"]:
            if ".co" not in raw_url or raw_url.count("/") >= 5:
                continue
            url = clean_url(raw_url)
            # Measure the time taken for the request
            start_time = time.time()
            if has_oai(url):
                end_time = time.time()
                oai_identifiers = has_oai_identifiers(url)
                response_time = round(end_time - start_time, 2)
                # Extract institution acronym from URL
                key = f'dspace_{url.split("//")[1].split(".")[1]}'
                if key in endpoints.keys():
                    continue
                endpoints[key] = {
                    "enabled": True,
                    "name": candidate['institution'],
                    "ror": get_ror_id(candidate['institution'].lower()),
                    "url": url + "oai/request",
                    "identifiers_url": url + oai_identifiers if oai_identifiers else "",
                    "request_timeout": response_time
                }
                found_dspace.append(candidate['institution'])
                print(f"{i} - {candidate['institution'].title()} - DSPACE repository have been found successfully.")
                not_found_dspace.discard(candidate['institution'])
                break
            else:
                print(f"{i} - {candidate['institution'].title()} - DSPACE repository not been found.")
                not_found_dspace.add(candidate['institution'])
        print("\n")

    return endpoints

def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)
        print(f"\nLog file successfully saved: {filename}")
    except Exception as e:
        print(f"Failed to save data: {filename}: {e}")

def print_config(name, url, idx_url, org_name, ror, enabled, file):
    file.write(f"endpoints['{name}'] = {{}}\n")
    file.write(f"endpoints['{name}']['enabled'] = {enabled}\n")
    file.write(f"endpoints['{name}']['name'] = '{org_name}'\n")
    file.write(f"endpoints['{name}']['ror'] = '{ror}'\n")
    file.write(f"endpoints['{name}']['url'] = '{url}'\n")
    # file.write(f"endpoints['{name}']['identifiers_url'] = '{idx_url}'\n")
    file.write(f"endpoints['{name}']['metadataPrefix'] = 'dim'\n")
    file.write(f"endpoints['{name}']['rate_limit'] = {{'calls': 10000, 'secs': 1}}\n")
    file.write(f"endpoints['{name}']['checkpoint'] = {{}}\n")
    file.write(f"endpoints['{name}']['checkpoint']['enabled'] = True\n")
    file.write(f"endpoints['{name}']['checkpoint']['selective'] = True\n")
    file.write(f"endpoints['{name}']['checkpoint']['days'] = 30\n")

def get_ror_id(name):
    url = f"https://api.ror.org/organizations?query={name}"
    response = requests.get(url, timeout=240)
    if response.status_code == 200:
        data = response.json()
        if data.get("items"):
            ror_id = data["items"][0]["id"]
            return ror_id
        else:
            #if for that org name not found
            return "Error: org id not found"
    else:
        return f"Error: {response.status_code}"

def main(ies_list):
    print(f"Starting repository search for {len(ies_list)} institutions: \n")
    candidates_urls = reposity_urls(ies_list)
    endpoints = dspace_validation(candidates_urls)
    return endpoints

In [7]:
# Run script
endpoints = main(ies_list)

save_to_json(endpoints, f"dspace_endpoints_{len(endpoints)}_ies.json")

# Open file for writing and truncate its content
config_file_name = "colombia_config.py"
with open(config_file_name, "w", encoding="utf-8") as file:
    file.truncate(0)  # Ensure the file is empty
    file.write("endpoints = {}\n")
    
    # Iterate over each endpoint and write its configuration
    for key, config in endpoints.items():
        print_config(
            name=key,
            url=config["url"],
            idx_url=config["identifiers_url"],
            org_name=config["name"],
            ror=config["ror"],
            enabled=config["enabled"],
            file=file
        )
    print(f"\nConfig file successfully saved: {config_file_name}")
    
filename = f"not_found_dspace_{len(not_found_dspace)}.json"
with open(filename, 'w', encoding='utf-8') as json_file:
    json.dump(list(not_found_dspace), json_file, ensure_ascii=False, indent=4)

Starting repository search for 326 institutions: 

0 - Universidad Nacional De Colombia - urls candidates for DSPACE repository have been found.
1 - Universidad Pedagogica Nacional - urls candidates for DSPACE repository have been found.
2 - Universidad Pedagogica Y Tecnologica De Colombia - Uptc - urls candidates for DSPACE repository have been found.
3 - Universidad Del Cauca - urls candidates for DSPACE repository have been found.
4 - Universidad Tecnologica De Pereira - Utp - urls candidates for DSPACE repository have been found.
5 - Universidad De Caldas - urls candidates for DSPACE repository have been found.
6 - Universidad De Cordoba - urls candidates for DSPACE repository have been found.
7 - Universidad Surcolombiana - urls candidates for DSPACE repository have been found.
8 - Universidad De La Amazonia - urls candidates for DSPACE repository have been found.
9 - Universidad Militar-Nueva Granada - urls candidates for DSPACE repository have been found.
10 - Universidad Tecnol

  soup = BeautifulSoup(req.text, features="xml")


https://bonga.unisimon.edu.co/server/oai/request?verb=Identify
https://bonga.unisimon.edu.co/server/oai/request?verb=Identify
https://bonga.unisimon.edu.co/oai/request?verb=Identify
https://bonga.unisimon.edu.co/oai/request?verb=Identify
https://bonga.unisimon.edu.co/oai/request?verb=Identify
96 - Instituto Universitario De Historia De Colombia - DSPACE repository not been found.


https://malaga.uis.edu.co/server/oai/request?verb=Identify
https://malaga.uis.edu.co/server/oai/request?verb=Identify
https://malaga.uis.edu.co/server/oai/request?verb=Identify
https://malaga.uis.edu.co/oai/request?verb=Identify
https://malaga.uis.edu.co/oai/request?verb=Identify
https://malaga.uis.edu.co/oai/request?verb=Identify
97 - Fundacion Universitaria De Garcia Rovira,Norte Y Gutierrez - DSPACE repository not been found.
https://noesis.uis.edu.co/server/oai/request?verb=Identify
https://manglar.uninorte.edu.co/server/oai/request?verb=Identify
https://manglar.uninorte.edu.co/server/oai/request?verb=Id

In [8]:
endpoints

{'dspace_unal': {'enabled': True,
  'name': 'UNIVERSIDAD NACIONAL DE COLOMBIA',
  'ror': 'https://ror.org/059yx9a68',
  'url': 'https://repositorio.unal.edu.cooai/request',
  'identifiers_url': 'https://repositorio.unal.edu.cooai/request?verb=ListIdentifiers&metadataPrefix=oai_dc',
  'request_timeout': 1.02},
 'dspace_pedagogica': {'enabled': True,
  'name': 'UNIVERSIDAD PEDAGOGICA NACIONAL',
  'ror': 'https://ror.org/023m5rq87',
  'url': 'http://repositorio.pedagogica.edu.cooai/request',
  'identifiers_url': 'http://repositorio.pedagogica.edu.cooai/request?verb=ListIdentifiers&metadataPrefix=oai_dc',
  'request_timeout': 0.42},
 'dspace_uptc': {'enabled': True,
  'name': 'UNIVERSIDAD PEDAGOGICA Y TECNOLOGICA DE COLOMBIA - UPTC',
  'ror': 'https://ror.org/04vdmbk59',
  'url': 'https://repositorio.uptc.edu.cooai/request',
  'identifiers_url': 'https://repositorio.uptc.edu.coserver/oai/request?verb=ListIdentifiers&metadataPrefix=oai_dc',
  'request_timeout': 0.38},
 'dspace_unicauca': {'