In [5]:
import requests
import pandas as pd
import os
import time

In [6]:

def get_projects_by_user(user_id, timeout=30):
    """
    Fetch all public projects (nodes) for an OSF user, following pagination.

    Parameters
    ----------
    user_id : str
        OSF user ID (like 9neka).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up. Passed
        straight to ``requests.get``.

    Returns
    -------
    list of dict
        The entries of every page's ``data`` array, concatenated in the
        order the pages were served.

    Raises
    ------
    requests.RequestException
        If a request fails, times out, or returns an error status.
    """
    next_url = f"https://api.osf.io/v2/users/{user_id}/nodes/"
    projects = []
    while next_url:
        # requests has no default timeout; without one a stalled connection
        # would block this loop indefinitely.
        response = requests.get(next_url, timeout=timeout)
        response.raise_for_status()
        page = response.json()
        projects.extend(page.get('data') or [])
        # The last page carries a null/absent 'next' link, which ends the loop.
        next_url = (page.get('links') or {}).get('next')
    return projects

# OSF account whose public projects are listed below.
user_id = "9neka"
projects = get_projects_by_user(user_id)

# Report the total first, then one bullet per project: "title (id)".
print(f"Found {len(projects)} public projects:")
for proj in projects:
    attrs = proj['attributes']
    print(f"- {attrs['title']} ({proj['id']})")


Found 67 public projects:
- Heat Watch Charlotte NC (86ume)
- Heat Watch Pierce County WA (x4sgn)
- Heat Watch Omaha (z7xw5)
- Heat Watch Asheville (mtxy5)
- Heat Watch Santiago (wx3r2)
- Heat Watch Toledo (h7jpb)
- Heat Watch Little Rock (vwsh9)
- Heat Watch Oklahoma City (e6qfa)
- Heat Watch Santa Fe NM (bhjfz)
- Heat Watch Iowa City &amp; Cedar Rapids (rbd3q)
- Heat Watch Brockton (rk75w)
- Heat Watch Chicago (6d7p2)
- Heat Watch Johnson County &amp; Wyandotte County (hxyrv)
- Heat Watch Scranton &amp; Wilkes Barre (8cnq4)
- Heat Watch MetroWest (Framingham MA) (hzqp3)
- Heat Watch Longmont (usp5x)
- Heat Watch Sedona (e5q7h)
- Heat Watch Salt Lake City (7k2u9)
- Heat Watch Rio de Janeiro (js28b)
- Heat Watch Freetown, Sierra Leone (8q7j3)
- Heat Watch San Francisco (ktr56)
- Heat Watch Nashville TN (msqv7)
- Heat Watch Knoxville TN (trn2y)
- Heat Watch Clark County NV (xsvjm)
- Heat Watch Columbus OH (n7t6x)
- Heat Watch Montgomery County (26ufw)
- Heat Watch Spokane (ef9wm)
- Heat

In [7]:
def get_project_metadata(project_id, timeout=30):
    """
    Fetch one OSF node and flatten its attributes into a single record.

    Parameters
    ----------
    project_id : str
        OSF node ID, used both in the API URL and in the returned record.
    timeout : float, optional
        Seconds to wait for the HTTP response. Passed to ``requests.get``.

    Returns
    -------
    dict
        Keys: ``project_id``, ``title``, ``description``, ``date_created``,
        ``date_modified``, ``public``, ``tags`` (comma-separated string),
        ``registration`` and ``project_url``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an error status.
    """
    url = f"https://api.osf.io/v2/nodes/{project_id}/"
    # Bound the wait so one unresponsive request cannot stall a whole harvest.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()['data']['attributes']
    return {
        "project_id": project_id,
        "title": data.get("title", ""),
        "description": data.get("description", ""),
        "date_created": data.get("date_created", ""),
        "date_modified": data.get("date_modified", ""),
        "public": data.get("public", False),
        # `or []` also covers an explicit null: the .get default alone only
        # applies when the key is missing, and join(None) raises TypeError.
        "tags": ", ".join(data.get("tags") or []),
        "registration": data.get("registration", False),
        "project_url": f"https://osf.io/{project_id}/"
    }

def get_osfstorage_file_links(project_id, timeout=30):
    """
    List every file in a node's OSF Storage, descending into folders.

    Listings are walked with an explicit stack of URLs. Each listing may be
    split over several pages; the ``links.next`` URL of every page is
    followed so that entries beyond the first page are not lost.

    Parameters
    ----------
    project_id : str
        OSF node ID whose ``osfstorage`` provider is listed.
    timeout : float, optional
        Seconds to wait for each HTTP response. Passed to ``requests.get``.

    Returns
    -------
    list of dict
        One dict per file with keys ``file_name``, ``file_path``,
        ``file_size`` and ``download_url``.

    Notes
    -----
    A listing that cannot be fetched is reported with ``print`` and skipped,
    so the result may be partial rather than the call failing outright.
    """
    file_list = []
    base_url = f"https://api.osf.io/v2/nodes/{project_id}/files/osfstorage/"
    stack = [base_url]

    while stack:
        current_url = stack.pop()
        try:
            resp = requests.get(current_url, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException as e:
            # Best effort: skip this listing but keep walking the rest.
            print(f"Error fetching files for {project_id}: {e}")
            continue

        payload = resp.json()
        for item in payload.get('data') or []:
            kind = item['attributes']['kind']
            if kind == 'file':
                file_list.append({
                    "file_name": item['attributes']['name'],
                    "file_path": item['attributes']['path'],
                    "file_size": item['attributes']['size'],
                    "download_url": item['links']['download']
                })
            elif kind == 'folder':
                stack.append(item['relationships']['files']['links']['related']['href'])

        # Queue the next page of this same listing. Pushed last so it is
        # popped first and the current folder is finished before descending.
        next_url = (payload.get('links') or {}).get('next')
        if next_url:
            stack.append(next_url)

    return file_list


In [8]:
# Build one record per file; projects without files get one placeholder record.
project_ids = [proj['id'] for proj in projects]  # IDs from the project listing fetched earlier
records = []

for pid in project_ids:
    try:
        meta = get_project_metadata(pid)
        files = get_osfstorage_file_links(pid)
        if files:
            # Repeat the project's metadata columns on each of its file rows.
            records.extend({**meta, **f} for f in files)
        else:
            # Keep file-less projects in the table, with blank file columns.
            records.append({
                **meta,
                "file_name": "",
                "file_path": "",
                "file_size": "",
                "download_url": "",
            })
    except Exception as e:
        print(f"Failed for project {pid}: {e}")
    time.sleep(0.3)  # Pause between projects so the API is not hammered


In [9]:
# Write the harvested records to a single CSV inside a local output folder.
output_dir = "osf_harvest"
output_csv = os.path.join(output_dir, "osf_metadata_files.csv")
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

df = pd.DataFrame(records)
df.to_csv(output_csv, index=False)  # omit the positional index column
print(f"Saved {len(df)} rows to {output_csv}")


Saved 393 rows to osf_harvest/osf_metadata_files.csv
