In [2]:
import datetime
import requests
from scholarly import scholarly
from tqdm import tqdm
import gender_guesser.detector as gender
import urllib.parse
import requests
from collections import Counter
import csv

In [3]:
# --- Configuration ---
YEARS_BACK = 5
gender_detector = gender.Detector()
OUTPUT_FILE = "C:/Users/d_j_w/OneDrive/David_Backup/ownCloud/gbci/results.csv"

# --- Helper functions ---
def get_openalex_author_id(author_name):
    base_url = "https://api.openalex.org/authors"
    encoded_name = urllib.parse.quote(author_name)
    filter_query = f"display_name.search:{encoded_name}"
    full_url = f"{base_url}?filter={filter_query}&per-page=25"

    print(f"\nüåê Querying OpenAlex with URL:\n{full_url}\n")

    response = requests.get(full_url)

    if response.status_code != 200:
        print(f"‚ùå Request failed with status {response.status_code}")
        return None

    data = response.json()
    results = data.get("results", [])

    if not results:
        print("‚ùå No results returned from OpenAlex.")
        return None

    print(f"üîç Candidates for '{author_name}':")
    for i, result in enumerate(results):
        display_name = result.get("display_name", "N/A")
        author_id = result.get("id", "N/A")
        works = result.get("works_count", "N/A")
        affiliation = result.get("last_known_institution", {}).get("display_name", "Unknown")
        print(f"{i+1}. {display_name} ‚Äî {author_id} | Works: {works} | Affiliation: {affiliation}")

    for result in results:
        if result["display_name"].strip().lower() == author_name.strip().lower():
            print(f"‚úÖ Exact match: {result['display_name']} ‚Äî {result['id']}")
            return result["id"]

    print(f"‚ö†Ô∏è No exact match for '{author_name}'. Using first candidate: {results[0]['display_name']} ‚Äî {results[0]['id']}")
    return results[0]["id"]

def get_recent_works(author_id, years=5, max_pages=1):
    import time
    from datetime import datetime
    import requests

    works = []
    per_page = 200  # Max OpenAlex allows
    current_year = datetime.now().year
    start_year = current_year - years
    page = 1

    while page <= max_pages:
        url = (
            f"https://api.openalex.org/works"
            f"?filter=author.id:{author_id},from_publication_date:{start_year}-01-01"
            f"&per-page={per_page}&page={page}"
        )
        print(f"üìÑ Fetching page {page} from OpenAlex...")
        response = requests.get(url)
        print(response)

        if response.status_code != 200:
            print(f"‚ùå Error fetching page {page}: {response.status_code}")
            break

        data = response.json()
        page_results = data.get("results", [])
        if not page_results:
            break

        works.extend(page_results)

        if "meta" in data and data["meta"].get("next_cursor") is None:
            break

        page += 1
        time.sleep(1)  # Respect rate limits

    print(f"‚úÖ Retrieved {len(works)} works in the last {years} years (max {max_pages} pages).")
    return works



def get_references(work):
    return work.get("referenced_works", [])

def get_author_pair(openalex_id):
    # Convert web URL to API URL if needed
    if openalex_id.startswith("https://openalex.org/"):
        openalex_id = openalex_id.replace("https://openalex.org/", "https://api.openalex.org/")

    r = requests.get(openalex_id)
    if r.status_code == 200:
        data = r.json()
        authorships = data.get("authorships", [])
        if authorships:
            first_author = authorships[0].get("author", {}).get("display_name")
            last_author = authorships[-1].get("author", {}).get("display_name")
            return first_author, last_author
    return None, None

def guess_gender(name):
    if not name:
        return None
    first = name.split()[0]
    return gender_detector.get_gender(first)

def simplify_gender(g):
    return 'W' if g in ['female', 'mostly_female'] else 'M' if g in ['male', 'mostly_male'] else None

def analyze_gcbi_openalex(author_name):
    print(f"\nüîç Analyzing: {author_name}")
    author_id = get_openalex_author_id(author_name)
    print(author_id)
    if not author_id:
        print(f"‚ùå Author not found: {author_name}")
        return None

    papers = get_recent_works(author_id)
    print(f"üìÑ Found {len(papers)} papers from last {YEARS_BACK} years.")

    all_refs = []
    for paper in tqdm(papers, desc="üîó Collecting references"):
        refs = get_references(paper)
        all_refs.extend(refs)

    pair_counts = Counter()

    for ref_id in tqdm(all_refs, desc="üë• Extracting author pairs"):
        first, last = get_author_pair(ref_id)
        if not first or not last:
            continue
        fg = simplify_gender(guess_gender(first))
        lg = simplify_gender(guess_gender(last))
        if fg and lg:
            key = fg + lg
            pair_counts[key] += 1

    total = sum(pair_counts.values())
    print(total)
    result = {
        "author": author_name,
        "total_refs": total,
        "WW": pair_counts.get("WW", 0),
        "WM": pair_counts.get("WM", 0),
        "MW": pair_counts.get("MW", 0),
        "MM": pair_counts.get("MM", 0),
    }
    return result

def save_results_to_csv(results, filename=OUTPUT_FILE):
    if not results:
        print("No data to save.")
        return

    with open(filename, mode="w", newline="", encoding="utf-8") as f:
        fieldnames = ["author", "total_refs", "WW", "WM", "MW", "MM"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
    print(f"\nüíæ Results saved to: {filename}")

In [None]:
# --- Main Execution ---
if __name__ == "__main__":
    academics = [
        "Caroline H. Williams‚ÄêGray",
        "Deborah Vickers"
    ]

    all_results = []
    for name in academics:
        result = analyze_gcbi_openalex(name)
        if result:
            all_results.append(result)

    save_results_to_csv(all_results)


üîç Analyzing: Caroline H. Williams‚ÄêGray

üåê Querying OpenAlex with URL:
https://api.openalex.org/authors?filter=display_name.search:Caroline%20H.%20Williams%E2%80%90Gray&per-page=25

üîç Candidates for 'Caroline H. Williams‚ÄêGray':
1. Caroline H. Williams‚ÄêGray ‚Äî https://openalex.org/A5005045661 | Works: 189 | Affiliation: Unknown
2. Williams-Gray Caroline H. ‚Äî https://openalex.org/A5108844086 | Works: 3 | Affiliation: Unknown
‚úÖ Exact match: Caroline H. Williams‚ÄêGray ‚Äî https://openalex.org/A5005045661
https://openalex.org/A5005045661
üìÑ Fetching page 1 from OpenAlex...
<Response [200]>
‚úÖ Retrieved 92 works in the last 5 years (max 1 pages).
üìÑ Found 92 papers from last 5 years.


üîó Collecting references: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 92/92 [00:00<00:00, 92094.50it/s]
üë• Extracting author pairs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5161/5161 [45:39<00:00,  1.88it/s]


3542

üîç Analyzing: Deborah Vickers

üåê Querying OpenAlex with URL:
https://api.openalex.org/authors?filter=display_name.search:Deborah%20Vickers&per-page=25

üîç Candidates for 'Deborah Vickers':
1. Deborah Vickers ‚Äî https://openalex.org/A5036578572 | Works: 126 | Affiliation: Unknown
2. Deborah Vickers ‚Äî https://openalex.org/A5040491509 | Works: 5 | Affiliation: Unknown
3. Deborah J. Vickers-Kirby ‚Äî https://openalex.org/A5004580747 | Works: 6 | Affiliation: Unknown
4. Deborah J Vickers ‚Äî https://openalex.org/A5053760644 | Works: 2 | Affiliation: Unknown
5. Deborah Vickers ‚Äî https://openalex.org/A5014262909 | Works: 2 | Affiliation: Unknown
6. Vickers Deborah ‚Äî https://openalex.org/A5053429137 | Works: 1 | Affiliation: Unknown
7. Deborah Ann Vickers ‚Äî https://openalex.org/A5012953594 | Works: 1 | Affiliation: Unknown
‚úÖ Exact match: Deborah Vickers ‚Äî https://openalex.org/A5036578572
https://openalex.org/A5036578572
üìÑ Fetching page 1 from OpenAlex...
<Response 

üîó Collecting references: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 55/55 [00:00<?, ?it/s]
üë• Extracting author pairs:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà         | 1521/1853 [14:38<03:30,  1.58it/s]

In [102]:
import requests


get_author_pair("https://openalex.org/W1596515083")

('Yoav Benjamini', 'Daniel Yekutieli')