In [None]:
import json
import pandas as pd
import requests
from itertools import compress

Set `file_refresh` in the cell below to `True` if the data/CDR and data/OHW files ever need refreshing.

In [None]:
# Switch read by the refresh cell that follows: that cell only re-downloads the
# search results from OpenAlex when this flag is True.
file_refresh: bool = False

In [None]:
# Pieces shared by every OpenAlex works search issued from this cell. The query
# selects id, display_name, authorships, referenced_works and open_access for
# works published from 2020-01-01 to 2023-08-01; the full-text search term is
# appended directly after the prefix and the page number fills the '{}'.
OPENALEX_WORKS_PREFIX = 'https://api.openalex.org/works?select=id,display_name,authorships,referenced_works,open_access&filter=from_publication_date:2020-01-01,to_publication_date:2023-08-01,fulltext.search:'
OPENALEX_PAGE_SUFFIX = '&page={}'


def _fetch_all_pages(url_template):
    """Download every result page of one OpenAlex query.

    url_template: query URL containing a single '{}' placeholder for the
        page number.
    Returns: list with the entries of each page's 'results', in page order.
    Raises: requests.HTTPError when a response carries a 4xx/5xx status.
    """
    all_results = []
    page = 1
    has_more_pages = True
    fewer_than_10k_results = True

    while has_more_pages and fewer_than_10k_results:
        print("on page", str(page))

        response = requests.get(url_template.format(page))
        # Surface HTTP failures as such instead of a KeyError on 'results'.
        response.raise_for_status()
        page_with_results = response.json()

        results = page_with_results['results']
        all_results.extend(results)

        # Stop when the page just read was not full, or when the next request
        # would go past 10,000 results.
        page += 1
        per_page = page_with_results['meta']['per_page']
        has_more_pages = len(results) == per_page
        fewer_than_10k_results = per_page * page <= 10000

    return all_results


def _refresh_search_files(search_terms):
    """Run each search and dump its combined results to a JSON file.

    search_terms: dict mapping output filename -> URL-encoded search term.
    Files are opened with the bare filename, i.e. relative to the current
    working directory.
    """
    for filename, term in search_terms.items():
        print("on filename", filename)
        url_template = OPENALEX_WORKS_PREFIX + term + OPENALEX_PAGE_SUFFIX
        all_results = _fetch_all_pages(url_template)
        with open(filename, 'w') as outf:
            json.dump(all_results, outf)


if file_refresh:

    # Refreshing CDR files

    # running this cell will output to the src folder. The files should be reviewed and moved to the data/CDR folder when done.
    cdr_search_dict = {
        'CDR_01.json':'carbon%20removal',
        'CDR_02.json':'carbon%20dioxide%20removal',
        'CDR_03.json':'carbon%20sequestration',
        'CDR_04.json':'carbon%20dioxide%20sequestration',
        'CDR_05.json':'carbon%20capture',
        'CDR_06.json':'carbon%20capture%20and%20sequestration',
        'CDR_07.json':'carbon%20capture%20and%20storage',
        'CDR_08.json':'carbon%20capture%20utilization%20and%20storage',
        'CDR_09.json':'CCS',
        'CDR_10.json':'CCUS',
        'CDR_11.json':'CDR',
        'CDR_12.json':'carbon%20mineralization',
        # every space must be written as %20 ('%di' is not a valid escape)
        'CDR_13.json':'carbon%20dioxide%20mineralization',
        'CDR_14.json':'carbon%20storage',
        'CDR_15.json':'direct%20air%20capture',
        'CDR_16.json':'direct%20air%20carbon%20capture',
    }
    _refresh_search_files(cdr_search_dict)

    # Refreshing OHW files

    # running this cell will output to the src folder. The files should be reviewed and moved to the data/OHW folder when done.

    # NOTE(review): the existing OHW_OH.json is loaded here but the result is
    # not used anywhere in this cell; kept so the name stays defined.
    with open('/workspaces/OHW_in_CDR/data/OHW/OHW_OH.json') as inf:
        OHW_OH_json = json.load(inf)

    ohw_search_dict = {
        'OHW_OH.json':'open%20hardware',
        'OHW_OSH.json':'open%20source%20hardware'
    }
    _refresh_search_files(ohw_search_dict)

## Combine files to make dataframes

In [None]:
# Read the sixteen CDR search-result files and keep, for every work, the four
# fields used later in the notebook.
file_nums = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']
all_works = []

for file_num in file_nums:

    pathstr = '/workspaces/OHW_in_CDR/data/CDR/CDR_' + file_num + '.json'
    # Context manager closes each file as soon as it has been parsed.
    with open(pathstr) as inf:
        lst = json.load(inf)

    for r in lst:
        all_works.append([r['id'], r['open_access']['is_oa'], r['referenced_works'], r['authorships']])

print(len(all_works))
# recorded count from an earlier run: 11798

# Columns are positional: 0 = id, 1 = is_oa, 2 = referenced_works, 3 = authorships.
df_works_CDR = pd.DataFrame(all_works)
df_works_CDR.head()

In [None]:
# Read the two OHW search-result files and keep, for every work, the four
# fields used later in the notebook.
file_nums = ['OH', 'OSH']
all_ohw_works = []

for file_num in file_nums:

    pathstr = '/workspaces/OHW_in_CDR/data/OHW/OHW_' + file_num + '.json'
    # Context manager closes each file as soon as it has been parsed.
    with open(pathstr) as inf:
        lst = json.load(inf)

    for r in lst:
        all_ohw_works.append([r['id'], r['open_access']['is_oa'], r['referenced_works'], r['authorships']])

print(len(all_ohw_works))
# recorded count from an earlier run: 2351

# Columns are positional: 0 = id, 1 = is_oa, 2 = referenced_works, 3 = authorships.
df_works_OHW = pd.DataFrame(all_ohw_works)
df_works_OHW.head()

### Separate into OA (open access) and NA (non-open access) objects

In [None]:
# Column 1 of each works frame carries the work's is_oa flag; rows whose flag
# equals True go to the OA frame and rows whose flag equals False to the NA frame.
cdr_is_oa = df_works_CDR[1]
df_CDR_OA = df_works_CDR[cdr_is_oa.eq(True)]
df_CDR_NA = df_works_CDR[cdr_is_oa.eq(False)]
print(len(df_CDR_OA), len(df_CDR_NA))
# recorded output from an earlier run: 5693 6105

ohw_is_oa = df_works_OHW[1]
df_OHW_OA = df_works_OHW[ohw_is_oa.eq(True)]
df_OHW_NA = df_works_OHW[ohw_is_oa.eq(False)]
print(len(df_OHW_OA), len(df_OHW_NA))
# recorded output from an earlier run: 1215 1136

## Open hardware cited in the full text of co-authors’ works (2nd connection to OHW)

This is to answer the question of whether open hardware works added value to the work being published by being a work published by one or more co-authors, indicating likely awareness among the authors of open hardware because one of the co-authors has published OHW-related research.

### Functions

In [None]:
def get_author_ids(df):
    """Flatten the author IDs stored in column 3 of ``df`` into one list.

    Every entry of column 3 is iterated as a sequence of authorship records
    and ``record['author']['id']`` is collected from each. IDs come back in
    row order; an author listed on several works appears once per listing.
    """
    return [
        authorship['author']['id']
        for authorships in df[3]
        for authorship in authorships
    ]

def get_author_objects(author_ids):
    """Fetch the OpenAlex author records for the given author IDs.

    One ``/authors`` query filtered on ``ids.openalex`` is issued per entry of
    ``author_ids`` (duplicates are queried again), and every page of every
    query is read.

    author_ids: iterable of ID strings appended to the query URL.
    Returns: list of the entries of each response's 'results', in request order.
    Raises: requests.HTTPError when a response carries a 4xx/5xx status.
    """
    prefix = 'https://api.openalex.org/authors?filter=ids.openalex:'
    suffix = '&page={}'
    author_objs = []

    for search_num, author_id in enumerate(author_ids):
        use_url = prefix + author_id + suffix
        print("on search", str(search_num))

        page = 1
        has_more_pages = True
        fewer_than_10k_results = True

        # loop through pages
        while has_more_pages and fewer_than_10k_results:

            response = requests.get(use_url.format(page))
            # Surface HTTP failures as such instead of a KeyError on 'results'.
            response.raise_for_status()
            page_with_results = response.json()

            results = page_with_results['results']
            author_objs.extend(results)

            # Stop when the page just read was not full, or when the next
            # request would go past 10,000 results.
            page += 1
            per_page = page_with_results['meta']['per_page']
            has_more_pages = len(results) == per_page
            fewer_than_10k_results = per_page * page <= 10000

    return author_objs
    
def get_citation_metrics(author_ids, author_objs):
    """Tabulate citation metrics for the author records whose ID is wanted.

    author_ids: collection of hashable author IDs to keep.
    author_objs: iterable of dicts with an 'id' key and a 'summary_stats' dict
        holding '2yr_mean_citedness', 'h_index' and 'i10_index'.
    Returns: DataFrame with positional columns 0 = id, 1 = 2yr_mean_citedness,
        2 = h_index, 3 = i10_index, one row per kept record. The four columns
        exist even when no record is kept, so column lookups still work.
    Side effect: prints the head of the resulting frame.
    """
    # A set makes each membership test constant-time instead of a list scan.
    wanted_ids = set(author_ids)

    saved_summary_stats = []
    for author_obj in author_objs:
        ss_obj = author_obj['summary_stats']
        summary_stats = [
            author_obj['id'],
            ss_obj['2yr_mean_citedness'],
            ss_obj['h_index'],
            ss_obj['i10_index'],
        ]
        if summary_stats[0] in wanted_ids:
            saved_summary_stats.append(summary_stats)

    # Explicit labels keep columns 0-3 present for an empty result.
    df_summary_stats = pd.DataFrame(saved_summary_stats, columns=[0, 1, 2, 3])
    print(df_summary_stats.head())

    return df_summary_stats

def get_ref_ids(df):
    """Flatten the entries of column 2 of ``df`` into one list.

    Each entry of column 2 is iterated and its items are collected in row
    order; an item that occurs in several rows is repeated.
    """
    return [ref_id for work_refs in df[2] for ref_id in work_refs]

def get_ids(df):
    """Return the values of column 0 of ``df`` as a plain list, in row order."""
    return list(df[0])


In [None]:
# for assessing author matches
# Author IDs of each subset, for assessing author matches.
cdr_oa_author_ids, cdr_na_author_ids, ohw_oa_author_ids, ohw_na_author_ids = (
    get_author_ids(subset) for subset in (df_CDR_OA, df_CDR_NA, df_OHW_OA, df_OHW_NA)
)

# Referenced-work IDs of the CDR subsets and work IDs of the OHW subsets,
# for assessing referenced work matches.
cdr_oa_ref_ids, cdr_na_ref_ids = (get_ref_ids(subset) for subset in (df_CDR_OA, df_CDR_NA))
ohw_oa_ids, ohw_na_ids = (get_ids(subset) for subset in (df_OHW_OA, df_OHW_NA))

# One line of sizes per pair.
for first, second in (
    (all_works, all_ohw_works),
    (cdr_oa_author_ids, cdr_na_author_ids),
    (ohw_oa_author_ids, ohw_na_author_ids),
    (ohw_oa_ids, ohw_na_ids),
):
    print(len(first), len(second))

# recorded output from an earlier run:
# 11798 2351
# 33151 28766
# 6695 4741
# 1215 1136

### Look for matches between the CDR author IDs and the OHW author IDs

In [None]:
# All OHW author IDs: open-access works first, then non-open-access works.
ohw_author_ids = ohw_oa_author_ids + ohw_na_author_ids

# Sets give constant-time membership tests; scanning the CDR lists once per
# OHW author is what made the list-based version slow.
cdr_oa_author_id_set = set(cdr_oa_author_ids)
cdr_na_author_id_set = set(cdr_na_author_ids)

# One flag per entry of ohw_author_ids: does that author also appear among the
# CDR authors of the given subset?
cdr_ohw_oa_author_matches = [el in cdr_oa_author_id_set for el in ohw_author_ids]
cdr_ohw_na_author_matches = [el in cdr_na_author_id_set for el in ohw_author_ids]

cdr_ohw_oa_author_match_count = sum(cdr_ohw_oa_author_matches)
cdr_ohw_na_author_match_count = sum(cdr_ohw_na_author_matches)
print(cdr_ohw_oa_author_match_count, cdr_ohw_na_author_match_count)

# recorded output from an earlier run: 1027 720 (16.7 s with list membership)

### For those matches, get the author ID to see later how their metrics performed

In [None]:
# The match flags are aligned with ohw_author_ids, so the matching IDs must be
# taken from ohw_author_ids as well (indexing the CDR lists with these
# positions would return unrelated authors).
cdr_oa_author_id_set = set(cdr_oa_author_ids)
cdr_ohw_oa_author_matches = [el in cdr_oa_author_id_set for el in ohw_author_ids]
cdr_ohw_oa_match_author_ids = list(compress(ohw_author_ids, cdr_ohw_oa_author_matches))
print(cdr_ohw_oa_match_author_ids)

cdr_na_author_id_set = set(cdr_na_author_ids)
cdr_ohw_na_author_matches = [el in cdr_na_author_id_set for el in ohw_author_ids]
cdr_ohw_na_match_author_ids = list(compress(ohw_author_ids, cdr_ohw_na_author_matches))
print(cdr_ohw_na_match_author_ids)

In [None]:
# Comparison group: CDR authors who do NOT appear among the OHW authors.
# The flags are computed over the CDR author lists so that they line up with
# the list the IDs are taken from.
ohw_author_id_set = set(ohw_author_ids)

cdr_ohw_oa_author_nonmatches = [el not in ohw_author_id_set for el in cdr_oa_author_ids]
cdr_ohw_oa_nonmatch_author_ids = list(compress(cdr_oa_author_ids, cdr_ohw_oa_author_nonmatches))
print(cdr_ohw_oa_nonmatch_author_ids)

cdr_ohw_na_author_nonmatches = [el not in ohw_author_id_set for el in cdr_na_author_ids]
cdr_ohw_na_nonmatch_author_ids = list(compress(cdr_na_author_ids, cdr_ohw_na_author_nonmatches))
print(cdr_ohw_na_nonmatch_author_ids)

In [None]:
# for assessing whether OHW is helpful for citation metrics
# Cap each non-match ID list at the length of the corresponding match ID list.
oa_match_total = len(cdr_ohw_oa_match_author_ids)
na_match_total = len(cdr_ohw_na_match_author_ids)
cdr_ohw_oa_nonmatch_author_ids_trunc = cdr_ohw_oa_nonmatch_author_ids[:oa_match_total]
cdr_ohw_na_nonmatch_author_ids_trunc = cdr_ohw_na_nonmatch_author_ids[:na_match_total]

In [None]:
# Download the OpenAlex author records for the matched author IDs
# (one request sequence per ID, so this cell is slow).
cdr_ohw_oa_author_objs = get_author_objects(cdr_ohw_oa_match_author_ids)
cdr_ohw_na_author_objs = get_author_objects(cdr_ohw_na_match_author_ids)
# recorded runtime from an earlier run: 7 m 8.9 s

In [None]:
# Download the OpenAlex author records for the truncated non-match ID lists
# (one request sequence per ID, so this cell is slow).
cdr_nohw_oa_author_objs = get_author_objects(cdr_ohw_oa_nonmatch_author_ids_trunc)
cdr_nohw_na_author_objs = get_author_objects(cdr_ohw_na_nonmatch_author_ids_trunc)
# recorded runtime from an earlier run: 7 m 8.7 s

In [None]:
# Build one metrics frame per group; each call also prints the head of its frame.
cdr_ohw_oa_metrics = get_citation_metrics(
    author_ids=cdr_ohw_oa_match_author_ids,
    author_objs=cdr_ohw_oa_author_objs,
)
cdr_ohw_na_metrics = get_citation_metrics(
    author_ids=cdr_ohw_na_match_author_ids,
    author_objs=cdr_ohw_na_author_objs,
)
# The non-match groups are filtered with the full (untruncated) ID lists.
cdr_nohw_oa_metrics = get_citation_metrics(
    author_ids=cdr_ohw_oa_nonmatch_author_ids,
    author_objs=cdr_nohw_oa_author_objs,
)
cdr_nohw_na_metrics = get_citation_metrics(
    author_ids=cdr_ohw_na_nonmatch_author_ids,
    author_objs=cdr_nohw_na_author_objs,
)

In [None]:
#ohw_oa_metrics = get_citation_metrics2(ohw_oa_author_ids, get_author_objects(ohw_oa_author_ids))
#ohw_na_metrics = get_citation_metrics2(ohw_na_author_ids, get_author_objects(ohw_na_author_ids))

### Get averages for citation metrics for each group

In [None]:
# Positional columns of the metrics frames built by get_citation_metrics:
# 1 = 2yr_mean_citedness (reported below as "IF"), 2 = h_index, 3 = i10_index.
IF_COL, H_COL, I10_COL = 1, 2, 3
metric_cols = (IF_COL, H_COL, I10_COL)

# Column means, one group at a time.
avg_if_ohw_oa, avg_h_ohw_oa, avg_i10_ohw_oa = (
    cdr_ohw_oa_metrics[col].mean() for col in metric_cols
)
avg_if_ohw_na, avg_h_ohw_na, avg_i10_ohw_na = (
    cdr_ohw_na_metrics[col].mean() for col in metric_cols
)
avg_if_nohw_oa, avg_h_nohw_oa, avg_i10_nohw_oa = (
    cdr_nohw_oa_metrics[col].mean() for col in metric_cols
)
avg_if_nohw_na, avg_h_nohw_na, avg_i10_nohw_na = (
    cdr_nohw_na_metrics[col].mean() for col in metric_cols
)

# Print results: one line per metric, groups in the order
# ohw_oa, ohw_na, nohw_oa, nohw_na.
print("IF", avg_if_ohw_oa, avg_if_ohw_na, avg_if_nohw_oa, avg_if_nohw_na)
print("H", avg_h_ohw_oa, avg_h_ohw_na, avg_h_nohw_oa, avg_h_nohw_na)
print("I10", avg_i10_ohw_oa, avg_i10_ohw_na, avg_i10_nohw_oa, avg_i10_nohw_na)

## Open hardware cited in the title or abstract of the referenced works (3rd connection to OHW)

This is to answer the question of whether one or more open hardware works added value to the work being published by being included as a referenced work.

In [None]:
# All OHW work IDs: open-access works first, then non-open-access works.
ohw_ids = ohw_oa_ids + ohw_na_ids

# Sets give constant-time membership tests; scanning the full reference lists
# once per OHW work is what made the list-based version slow.
cdr_oa_ref_id_set = set(cdr_oa_ref_ids)
cdr_na_ref_id_set = set(cdr_na_ref_ids)

# Number of OHW works that show up among the works referenced by each CDR subset.
cdr_ohw_oa_ref_match_count = sum(el in cdr_oa_ref_id_set for el in ohw_ids)
cdr_ohw_na_ref_match_count = sum(el in cdr_na_ref_id_set for el in ohw_ids)
print(cdr_ohw_oa_ref_match_count, cdr_ohw_na_ref_match_count)

# recorded notes from earlier runs:
# 9341 6183 14 minutes
# 14 4 26.4 s