In [1]:
import json
import pandas as pd
import requests

Run the cell below once if data/CDR files ever need refreshing.

In [13]:
# Running this cell writes one JSON file per search to the current working directory
# (the src folder). The files should be reviewed and moved to the data/CDR folder when done.
#
# Maps output filename -> percent-encoded search phrase (spaces are written as %20).
search_dict = {
    'CDR_01.json':'carbon%20removal',
    'CDR_02.json':'carbon%20dioxide%20removal',
    'CDR_03.json':'carbon%20sequestration',
    'CDR_04.json':'carbon%20dioxide%20sequestration',
    'CDR_05.json':'carbon%20capture',
    'CDR_06.json':'carbon%20capture%20and%20sequestration',
    'CDR_07.json':'carbon%20capture%20and%20storage',
    'CDR_08.json':'carbon%20capture%20utilization%20and%20storage',
    'CDR_09.json':'CCS',
    'CDR_10.json':'CCUS',
    'CDR_11.json':'CDR',
    'CDR_12.json':'carbon%20mineralization',
    # was 'carbon%dioxide%20mineralization': the first space was missing its '20'
    'CDR_13.json':'carbon%20dioxide%20mineralization',
    'CDR_14.json':'carbon%20storage',
    'CDR_15.json':'direct%20air%20capture',
    'CDR_16.json':'direct%20air%20carbon%20capture',
}

# Every search shares the same selected fields and publication-date window; only the
# full-text search phrase differs. '{}' in the suffix is filled with the page number later.
prefix = 'https://api.openalex.org/works?select=id,display_name,authorships,referenced_works,open_access&filter=from_publication_date:2020-01-01,to_publication_date:2023-08-01,fulltext.search:'
suffix = '&page={}'

filenames = list(search_dict.keys())
searches = [prefix + term + suffix for term in search_dict.values()]

for filename, use_url in zip(filenames, searches):

    print("on filename", filename)

    page = 1
    has_more_pages = True
    fewer_than_10k_results = True

    all_results = []

    # loop through pages
    while has_more_pages and fewer_than_10k_results:

        print("on page", str(page))

        # set page value and request page from OpenAlex
        url = use_url.format(page)
        response = requests.get(url)
        # Fail with a clear HTTP error instead of a confusing KeyError/JSON error below.
        response.raise_for_status()
        page_with_results = response.json()

        # collect this page's partial list of results
        results = page_with_results['results']
        all_results.extend(results)

        # next page
        page += 1

        # end loop when either there are no more results on the requested page
        # or the next request would exceed 10,000 results
        per_page = page_with_results['meta']['per_page']
        has_more_pages = len(results) == per_page
        fewer_than_10k_results = per_page * page <= 10000

    with open(filename, 'w') as outf:
        json.dump(all_results, outf)


on filename CDR_01.json
on page 1
on page 2
on page 3
on page 4
on page 5
on page 6
on page 7
on page 8
on page 9
on page 10
on page 11
on page 12
on page 13
on page 14
on page 15
on page 16
on page 17
on page 18
on page 19
on page 20
on page 21
on page 22
on page 23
on page 24
on page 25
on page 26
on page 27
on page 28
on page 29
on page 30
on page 31
on page 32
on page 33
on page 34
on page 35
on page 36
on page 37
on page 38
on page 39
on page 40
on page 41
on page 42
on page 43
on page 44
on page 45
on page 46
on page 47
on page 48
on page 49
on page 50
on page 51
on page 52
on page 53
on page 54
on page 55
on page 56
on page 57
on page 58
on page 59
on page 60
on page 61
on page 62
on page 63
on page 64
on page 65
on page 66
on page 67
on page 68
on page 69
on page 70
on page 71
on page 72
on page 73
on page 74
on page 75
on page 76
on page 77
on page 78
on page 79
on page 80
on page 81
on page 82
on page 83
on page 84
on page 85
on page 86
on page 87
on page 88
on page 89
on fil

In [34]:
# Load every reviewed CDR result file and keep, per work, only the fields used below:
# [id, is_oa flag, referenced_works, authorships].
# NOTE(review): records are appended as-is from every file, so a work returned by more than
# one search appears more than once in all_works — confirm that is intended.
file_nums = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']
all_works = []

for file_num in file_nums:

    pathstr = '/workspaces/OHW_in_CDR/data/CDR/CDR_' + file_num + '.json'
    # Context manager closes the file handle as soon as the JSON is parsed.
    with open(pathstr) as inf:
        lst = json.load(inf)

    for r in lst:
        works = [r['id'], r['open_access']['is_oa'], r['referenced_works'], r['authorships']]
        all_works.append(works)


In [35]:
# Number of work records collected into all_works (one entry per record loaded).
print(len(all_works))

# Output recorded from a previous run: 11798

11798


## Combine into a single CDR works list

In [36]:
# Tabulate the collected records. Columns are positional, following the order of each
# all_works entry: 0 = id, 1 = is_oa flag, 2 = referenced_works, 3 = authorships.
df_works_CDR = pd.DataFrame(data=all_works)
df_works_CDR.head(n=5)

Unnamed: 0,0,1,2,3
0,https://openalex.org/W2965283917,False,"[https://openalex.org/W167104987, https://open...","[{'author_position': 'first', 'author': {'id':..."
1,https://openalex.org/W2965654741,False,"[https://openalex.org/W1149082768, https://ope...","[{'author_position': 'first', 'author': {'id':..."
2,https://openalex.org/W2963817167,False,"[https://openalex.org/W1530222901, https://ope...","[{'author_position': 'first', 'author': {'id':..."
3,https://openalex.org/W2960882361,False,"[https://openalex.org/W207497656, https://open...","[{'author_position': 'first', 'author': {'id':..."
4,https://openalex.org/W2972174901,False,"[https://openalex.org/W432734680, https://open...","[{'author_position': 'first', 'author': {'id':..."


### Separate into OA (open access) and NA (non-open access) objects

In [37]:
# Column 1 holds the is_oa flag; split the CDR works on it.
df_CDR_OA = df_works_CDR.loc[df_works_CDR[1].eq(True)]
df_CDR_NA = df_works_CDR.loc[df_works_CDR[1].eq(False)]
print(df_CDR_OA.shape[0], df_CDR_NA.shape[0])

# Output recorded from a previous run: 5693 6105

5693 6105


## Obtaining the OHW list

Run the cell below once if data/OHW files ever need refreshing.

In [6]:
# Running this cell writes one JSON file per search to the current working directory
# (the src folder). The files should be reviewed and moved to the data/OHW folder when done.

# NOTE(review): OHW_OH_json is not used anywhere in this cell, and loading it means this
# refresh cell needs an existing data/OHW/OHW_OH.json before it can run — confirm whether
# it is still needed.
with open('/workspaces/OHW_in_CDR/data/OHW/OHW_OH.json') as inf:
    OHW_OH_json = json.load(inf)

# Maps output filename -> percent-encoded search phrase (spaces are written as %20).
search_dict = {
    'OHW_OH.json':'open%20hardware',
    'OHW_OSH.json':'open%20source%20hardware'
}

# Every search shares the same selected fields and publication-date window; only the
# full-text search phrase differs. '{}' in the suffix is filled with the page number later.
prefix = 'https://api.openalex.org/works?select=id,display_name,authorships,referenced_works,open_access&filter=from_publication_date:2020-01-01,to_publication_date:2023-08-01,fulltext.search:'
suffix = '&page={}'

filenames = list(search_dict.keys())
searches = [prefix + term + suffix for term in search_dict.values()]

for filename, use_url in zip(filenames, searches):

    print("on filename", filename)

    page = 1
    has_more_pages = True
    fewer_than_10k_results = True

    all_results = []

    # loop through pages
    while has_more_pages and fewer_than_10k_results:

        print("on page", str(page))

        # set page value and request page from OpenAlex
        url = use_url.format(page)
        response = requests.get(url)
        # Fail with a clear HTTP error instead of a confusing KeyError/JSON error below.
        response.raise_for_status()
        page_with_results = response.json()

        # collect this page's partial list of results
        results = page_with_results['results']
        all_results.extend(results)

        # next page
        page += 1

        # end loop when either there are no more results on the requested page
        # or the next request would exceed 10,000 results
        per_page = page_with_results['meta']['per_page']
        has_more_pages = len(results) == per_page
        fewer_than_10k_results = per_page * page <= 10000

    with open(filename, 'w') as outf:
        json.dump(all_results, outf)

on filename OHW_OH.json
on page 1
on page 2
on page 3
on page 4
on page 5
on page 6
on page 7
on page 8
on page 9
on page 10
on page 11
on page 12
on page 13
on page 14
on page 15
on page 16
on page 17
on page 18
on page 19
on page 20
on page 21
on page 22
on page 23
on page 24
on page 25
on page 26
on page 27
on page 28
on page 29
on page 30
on page 31
on page 32
on page 33
on page 34
on page 35
on page 36
on page 37
on page 38
on page 39
on page 40
on page 41
on page 42
on page 43
on page 44
on page 45
on page 46
on page 47
on page 48
on page 49
on page 50
on page 51
on page 52
on page 53
on page 54
on page 55
on page 56
on filename OHW_OSH.json
on page 1
on page 2
on page 3
on page 4
on page 5
on page 6
on page 7
on page 8
on page 9
on page 10
on page 11
on page 12
on page 13
on page 14
on page 15
on page 16
on page 17
on page 18
on page 19
on page 20
on page 21
on page 22
on page 23
on page 24
on page 25
on page 26
on page 27
on page 28
on page 29
on page 30
on page 31
on page 32
o

In [7]:
# Load both reviewed OHW result files and keep, per work, only the fields used below:
# [id, is_oa flag, referenced_works, authorships].
# NOTE(review): records are appended as-is from both files, so a work returned by both
# searches appears twice in all_ohw_works — confirm that is intended.
file_nums = ['OH', 'OSH']
all_ohw_works = []

for file_num in file_nums:

    pathstr = '/workspaces/OHW_in_CDR/data/OHW/OHW_' + file_num + '.json'
    # Context manager closes the file handle as soon as the JSON is parsed.
    with open(pathstr) as inf:
        lst = json.load(inf)

    for r in lst:
        works = [r['id'], r['open_access']['is_oa'], r['referenced_works'], r['authorships']]
        all_ohw_works.append(works)

print(len(all_ohw_works))

# Columns are positional: 0 = id, 1 = is_oa flag, 2 = referenced_works, 3 = authorships.
df_works_OHW = pd.DataFrame(all_ohw_works)
df_works_OHW.head()

# Count recorded from a previous run: 2351

2351


Unnamed: 0,0,1,2,3
0,https://openalex.org/W2962730651,True,"[https://openalex.org/W23953656, https://opena...","[{'author_position': 'first', 'author': {'id':..."
1,https://openalex.org/W3007172120,True,"[https://openalex.org/W625729589, https://open...","[{'author_position': 'first', 'author': {'id':..."
2,https://openalex.org/W2969697338,True,"[https://openalex.org/W1593082705, https://ope...","[{'author_position': 'first', 'author': {'id':..."
3,https://openalex.org/W2892013400,True,[],"[{'author_position': 'first', 'author': {'id':..."
4,https://openalex.org/W2922347011,False,"[https://openalex.org/W424832857, https://open...","[{'author_position': 'first', 'author': {'id':..."


### Separate into OA (open access) and NA (non-open access) objects

In [8]:
# Column 1 holds the is_oa flag; split the OHW works on it.
df_OHW_OA = df_works_OHW.loc[df_works_OHW[1].eq(True)]
df_OHW_NA = df_works_OHW.loc[df_works_OHW[1].eq(False)]
print(df_OHW_OA.shape[0], df_OHW_NA.shape[0])

# Output recorded from a previous run: 1215 1136

1215 1136


## Open hardware cited in the full text of co-authors’ works (2nd connection to OHW)

This is to answer the question of whether open hardware added value to the work being published through its authors: when one or more co-authors have themselves published OHW-related research, the authors were likely aware of open hardware.

In [45]:
def get_author_ids(df):
    """Return a flat list of author IDs found in column 3 of ``df``.

    Each entry of column 3 is iterated as a collection of authorship records,
    and ``record['author']['id']`` is collected from every record, in order.
    IDs are not de-duplicated.
    """
    return [
        authorship['author']['id']
        for authorships in df[3]
        for authorship in authorships
    ]

def get_citation_metrics(df):
    """Collect per-author citation statistics from column 3 of ``df``.

    Each entry of column 3 is iterated as a collection of authorship records.
    For every record the author's ``id`` and three values from the author's
    ``summary_stats`` are appended to parallel lists.

    Returns:
        A tuple ``(ids, impact_factors, h_indices, i10_indices)`` of four lists
        of equal length, where ``impact_factors`` holds ``2yr_mean_citedness``.

    Raises:
        KeyError: if an author record has no ``summary_stats`` (or lacks one of
            the three statistics).
    """
    # NOTE(review): the fetch cells request only id, display_name, authorships,
    # referenced_works and open_access — confirm the nested author objects
    # really include 'summary_stats', otherwise this raises KeyError.
    ids, impact_factors, h_indices, i10_indices = [], [], [], []
    for authorships in df[3]:
        for authorship in authorships:
            author = authorship['author']
            ids.append(author['id'])
            stats = author['summary_stats']
            impact_factors.append(stats['2yr_mean_citedness'])
            h_indices.append(stats['h_index'])
            i10_indices.append(stats['i10_index'])
    return (ids, impact_factors, h_indices, i10_indices)

def get_ref_ids(df):
    """Return a flat list of every referenced-work entry in column 2 of ``df``.

    Each entry of column 2 is iterated as a collection of references; all of
    them are concatenated in order, without de-duplication.
    """
    return [ref for work_refs in df[2] for ref in work_refs]

def get_ids(df):
    """Return the values of column 0 of ``df`` as a plain list, in order."""
    return list(df[0])


# for assessing author matches
cdr_oa_author_ids = get_author_ids(df_CDR_OA)
cdr_na_author_ids = get_author_ids(df_CDR_NA)
ohw_oa_author_ids = get_author_ids(df_OHW_OA)
ohw_na_author_ids = get_author_ids(df_OHW_NA)

# for assessing referenced work matches
cdr_oa_ref_ids = get_ref_ids(df_CDR_OA)
cdr_na_ref_ids = get_ref_ids(df_CDR_NA)
ohw_oa_ids = get_ids(df_OHW_OA)
ohw_na_ids = get_ids(df_OHW_NA)

# for assessing whether OHW is helpful for citation metrics
# each call returns a tuple of (ids, impact_factors, h_indices, i10_indices)
# These are stored under their own names. Assigning them to cdr_*_ref_ids / ohw_*_ids
# would replace the ID lists built just above with 4-tuples, which breaks the length
# report below and any later matching on those lists.
cdr_oa_citation_metrics = get_citation_metrics(df_CDR_OA)
cdr_na_citation_metrics = get_citation_metrics(df_CDR_NA)
ohw_oa_citation_metrics = get_citation_metrics(df_OHW_OA)
ohw_na_citation_metrics = get_citation_metrics(df_OHW_NA)

print(len(all_works), len(all_ohw_works))
print(len(cdr_oa_author_ids), len(cdr_na_author_ids))
print(len(ohw_oa_author_ids), len(ohw_na_author_ids))
print(len(ohw_oa_ids), len(ohw_na_ids))

# Output recorded from a previous run:
# 11798 2351
# 33151 28766
# 6695 4741
# 1215 1136


11798 2351
33151 28766
6695 4741
1215 1136


### Look for matches between the CDR author IDs and the OHW author IDs

In [39]:
# Pool the author IDs of both OHW subsets (open access first, then non-open access).
# Plain concatenation avoids a top-level `for id in ...` loop, which would rebind the
# builtin `id` for the rest of the session.
ohw_author_ids = ohw_oa_author_ids + ohw_na_author_ids

# Membership is tested against sets rather than lists: the counts are identical, but
# each lookup no longer scans the whole CDR author list.
cdr_oa_author_id_set = set(cdr_oa_author_ids)
cdr_na_author_id_set = set(cdr_na_author_ids)

# Each count is the number of OHW author entries (repeats included) whose ID also
# occurs among the CDR authors of the given subset.
cdr_ohw_oa_author_match_count = sum(el in cdr_oa_author_id_set for el in ohw_author_ids)
cdr_ohw_na_author_match_count = sum(el in cdr_na_author_id_set for el in ohw_author_ids)
print(cdr_ohw_oa_author_match_count, cdr_ohw_na_author_match_count)

# Output recorded from a previous run: 1027 720 (16.7 s with list-based membership)

1027 720


## Open hardware cited in the title or abstract of the referenced works (3rd connection to OHW)

This is to answer the question of whether one or more open hardware works added value to the work being published by being included as a referenced work.

In [46]:
# Pool the work IDs of both OHW subsets (open access first, then non-open access).
# Plain concatenation avoids a top-level `for id in ...` loop, which would rebind the
# builtin `id` for the rest of the session.
# Both inputs are expected to be flat lists of work IDs.
ohw_ids = list(ohw_oa_ids) + list(ohw_na_ids)

# Membership is tested against sets rather than lists: the counts are identical, but
# each lookup no longer scans the whole list of CDR references.
cdr_oa_ref_id_set = set(cdr_oa_ref_ids)
cdr_na_ref_id_set = set(cdr_na_ref_ids)

# Each count is the number of OHW work entries (repeats included) whose ID occurs
# among the referenced works of the given CDR subset.
cdr_ohw_oa_ref_match_count = sum(el in cdr_oa_ref_id_set for el in ohw_ids)
cdr_ohw_na_ref_match_count = sum(el in cdr_na_ref_id_set for el in ohw_ids)
print(cdr_ohw_oa_ref_match_count, cdr_ohw_na_ref_match_count)

# Outputs recorded from previous runs (with list-based membership):
# 9341 6183 14 minutes
# 14 4 26.4 s

14 4


: 