# Code

In [1]:
import json, re, requests, gzip
from tqdm import tqdm
from urlextract import URLExtract
# import bibtexparser

In [2]:
# Regex for DOI-like strings: the "10." prefix, a 4-9 digit registrant code,
# a slash, then a suffix. The suffix class lists uppercase letters only, so it
# matches lowercase suffixes only when applied with re.IGNORECASE.
doi_pattern = r'\b(10\.\d{4,9}\/[-._;()/:A-Z0-9]+)\b'
# Shared URLExtract instance, created once and reused for every README.
url_extractor = URLExtract()

def get_urls(content):
    """Return the URLs that the shared ``url_extractor`` finds in ``content``."""
    found_urls = url_extractor.find_urls(content)
    return found_urls

def get_dois(content, pattern=None):
    """Extract DOI strings from text, matching case-insensitively.

    Args:
        content: Text to scan.
        pattern: Optional regular expression given as a string (not a
            compiled pattern). Defaults to the module-level ``doi_pattern``.

    Returns:
        Whatever ``re.findall`` yields for the pattern; with the default
        pattern (one capturing group) that is a list of DOI strings in
        order of appearance, duplicates included.
    """
    if pattern is None:
        pattern = doi_pattern
    # The suffix character class in doi_pattern lists only A-Z, so without
    # IGNORECASE every DOI whose suffix contains a lowercase letter is missed.
    return re.findall(pattern, content, flags=re.IGNORECASE)

# def get_bibtex_entries(content):
#     """Extract Bibtex entries."""
#     try:
#         bib_data = bibtexparser.loads(content)
#         return bib_data.entries
#     except:
#         return []

def fetch_readme(url, TIMEOUT = 10):
    """Fetch README text over HTTP, retrying GitHub URLs on the ``main`` branch.

    Args:
        url: Full URL of the README to download.
        TIMEOUT: Per-request timeout in seconds (default 10).

    Returns:
        The response body as text when a request ends with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` error).
    """
    # Raw file URLs live on raw.githubusercontent.com, which does NOT contain
    # the substring 'github.com', so both host spellings must be recognised
    # for the branch fallback to ever trigger.
    github_hosts = ('github.com', 'githubusercontent.com')
    try:
        response = requests.get(url, timeout=TIMEOUT)
        if (response.status_code != 200
                and '/master/' in url
                and any(host in url for host in github_hosts)):
            # Try alternative URL for GitHub main branch
            url = url.replace('/master/', '/main/')
            response = requests.get(url, timeout=TIMEOUT)
        return response.text if response.status_code == 200 else None
    except requests.RequestException:
        print(f"Skipping {url} due to timeout or connection error.")
        return None

def process_entry(id, source_type, base_url, readme_path):
    """Retrieve one README, extract its links and DOIs, and package the result.

    Args:
        id: Raw entry from the input list. For ``source_type == "source"`` it
            must contain exactly one ';' with the repository reference after
            it; for other types it is appended to ``base_url`` as-is.
        source_type: Entry category; only the value "source" is handled
            specially.
        base_url: URL prefix for this category.
        readme_path: URL suffix that points at the README file.

    Returns:
        A dict with keys "id", "type", "url", "content", "links" and "dois",
        or ``None`` when the entry is blank or malformed, or when no README
        content could be fetched.
    """
    # Blank lines in the input list carry nothing to fetch; for "source"
    # entries they would also fail the ';' split below.
    if not id or not id.strip():
        return None

    if source_type == "source":
        # Expect "<prefix>;<repo reference>"; skip anything else instead of
        # letting an unpacking error abort the whole run.
        parts = id.split(';')
        if len(parts) != 2:
            print(f"Skipping malformed source entry: {id!r}")
            return None
        repo_ref = parts[1]
        # Drops the first 11 characters of the repo reference; presumably a
        # leading "github.com/" (11 chars) - TODO confirm against input file.
        full_url = f"{base_url}{repo_ref[11:]}{readme_path}"
    else:
        # Model and dataset ids are used verbatim in the URL.
        full_url = f"{base_url}{id}{readme_path}"

    content = fetch_readme(full_url)
    if not content:
        return None

    return {
        "id": id,
        "type": source_type,
        "url": full_url,
        # Newlines are flattened so each record stays on one output line.
        "content": content.replace("\n", " "),
        "links": get_urls(content),
        "dois": get_dois(content),
    }

In [3]:

# Per-category configuration: input id list, URL prefix, and README suffix.
source_files = {"model": "input/harshvar_model", "data": "input/harshvar_data", "source": "input/harshvar_source"}
base_urls = {
    "model": "https://huggingface.co/",
    "data": "https://huggingface.co/datasets/",
    "source": "https://raw.githubusercontent.com/"
}
readme_paths = {
    "model": "/raw/main/README.md",
    "data": "/raw/main/README.md",
    "source": "/master/README.md"
}

# The context manager guarantees the gzip stream is flushed and closed even
# when an input file is missing or an entry raises part-way through the run.
with gzip.open("output/harshvar.json.gz", 'wt', encoding="utf-8") as output_file:
    for source_type, input_file in source_files.items():
        # Explicit encoding so the id lists are read the same on every platform.
        with open(input_file, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                entry = process_entry(line, source_type, base_urls[source_type], readme_paths[source_type])
                if entry:
                    # One JSON object per line (JSON Lines).
                    json.dump(entry, output_file)
                    output_file.write("\n")
                    print(f"Processed {entry['id']} from {source_type}.")


Processed bartowski/dolphin-2.9.3-Yi-1.5-34B-32k-GGUF from model.
Processed mradermacher/Boundary-4x7b-MoE-i1-GGUF from model.
Processed Weyaxi/Einstein-v4-7B from model.
Processed EarthnDusk/May2023-Models from model.
Processed Vedx04/MetaMath-Mistral-7B-hendrycks from model.
Processed sudhanshu746/deepseek-7b-instruct-matho-finetune from model.
Processed leonn71/gte-Qwen2-1.5B-instruct-Q6_K-GGUF from model.
Processed AI4PD/ZymCTRL from model.
Processed KappaNeuro/director-darren-aronofsky-style from model.
Processed matheusgeda/testando from model.
Processed Theta-Lev/deepseek-math-7b-rl-Q8_0-GGUF from model.
Processed Zainiii/flanT5-738-lora-math from model.
Processed reyvan/bert_large_maths from model.
Processed jonas-luehrs/bert-base-cased-MLM-chemistry from model.
Processed torchgeo/resnet18_sentinel2_all_moco from model.
Processed Shaleen123/gemma2-9b-maths from model.
Processed maldv/electric-sheep-7b-alpha from model.
Processed TheBloke/OpenAssistant-Llama2-13B-Orca-8K-3319-GG

In [5]:
# Peek at the first three records of the compressed output
import json
import gzip
with gzip.open("output/harshvar.json.gz", 'rt', encoding="utf-8") as f:
    # range(3) comes first in zip, so at most three lines are read from f
    for _, record in zip(range(3), f):
        print(record)

{"id": "bartowski/dolphin-2.9.3-Yi-1.5-34B-32k-GGUF", "type": "model", "url": "https://huggingface.co/bartowski/dolphin-2.9.3-Yi-1.5-34B-32k-GGUF/raw/main/README.md", "content": "--- license: apache-2.0 base_model: 01-ai/Yi-1.5-34B-32k tags: - generated_from_trainer - axolotl datasets: - cognitivecomputations/Dolphin-2.9 - teknium/OpenHermes-2.5 - m-a-p/CodeFeedback-Filtered-Instruction - cognitivecomputations/dolphin-coder - cognitivecomputations/samantha-data - microsoft/orca-math-word-problems-200k - Locutusque/function-calling-chatml - internlm/Agent-FLAN quantized_by: bartowski pipeline_tag: text-generation ---  ## Llamacpp imatrix Quantizations of dolphin-2.9.3-Yi-1.5-34B-32k  Using <a href=\"https://github.com/ggerganov/llama.cpp/\">llama.cpp</a> release <a href=\"https://github.com/ggerganov/llama.cpp/releases/tag/b3197\">b3197</a> for quantization.  Original model: https://huggingface.co/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k  All quants made using imatrix option w