-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
biorxiv_03_process.py
51 lines (40 loc) · 1.46 KB
/
biorxiv_03_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import json
import os
import pandas as pd
from gender_guesser.detector import Detector
from tqdm import tqdm
from biorxiv_02_download_articles import ARTICLES_DIRECTORY, BIORXIV_DIRECTORY
def main():
    """Build ``articles.tsv`` from the downloaded article JSON files.

    Every ``*.json`` file in ``ARTICLES_DIRECTORY`` is loaded and the first
    entry of its ``collection`` list is turned into one row holding the DOI,
    title, first author, inferred first-author gender, license, category,
    posting date and the ``published`` field. The rows are sorted by the
    ``posted`` column and written as a tab-separated file into
    ``BIORXIV_DIRECTORY``. Finally a summary is printed of how many first
    authors received a gender label other than ``'unknown'``.

    Files whose ``collection`` is empty are reported and skipped. If no
    usable record is found at all, a message is printed and nothing is
    written.
    """
    detector = Detector(case_sensitive=False)
    rows = []
    for name in tqdm(os.listdir(ARTICLES_DIRECTORY)):
        if not name.endswith('.json'):
            continue
        with open(os.path.join(ARTICLES_DIRECTORY, name)) as file:
            payload = json.load(file)
        collection = payload['collection']
        if not collection:
            # tqdm.write keeps the message from garbling the progress bar.
            tqdm.write(f'Empty collection for {name}')
            continue
        # Only the first entry of the collection is used.
        article = collection[0]
        # Authors are ';'-separated; the first one is the first author.
        first_author = article['authors'].split(';')[0]
        rows.append(dict(
            id=article['doi'],
            title=article['title'],
            first_author_name=first_author,
            first_author_inferred_gender=fix_name(first_author, detector),
            license=article['license'],
            category=article['category'].strip(),
            posted=article['date'],
            peer_reviewed=article['published'],
        ))

    if not rows:
        # An empty frame has no 'posted' column to sort on, and the summary
        # ratio below would divide by zero.
        tqdm.write(f'No article records found in {ARTICLES_DIRECTORY}')
        return

    df = pd.DataFrame(rows).sort_values('posted')
    df.to_csv(os.path.join(BIORXIV_DIRECTORY, 'articles.tsv'), sep='\t', index=False)

    total = len(df.index)
    assigned = (df['first_author_inferred_gender'] != 'unknown').sum()
    tqdm.write(f'Authors with assigned genders: {assigned}/{total} ({assigned / total:.2%})')
def fix_name(s, detector):
    """Infer a gender label from the first token of an author name.

    Args:
        s: Author name as a single string.
        detector: Object exposing ``get_gender(name)``.

    Returns:
        ``'unknown'`` when the name contains a comma or has no
        non-whitespace token; otherwise whatever ``detector.get_gender``
        returns for the first whitespace-separated token of the name.
    """
    if ',' in s:
        # NOTE(review): presumably a "Last, First" or initials-style name
        # whose first token is not a given name -- confirm against the data.
        return 'unknown'
    # Split on any run of whitespace so leading spaces, tabs or doubled
    # spaces do not produce an empty or polluted first token.
    tokens = s.split()
    if not tokens:
        return 'unknown'
    return detector.get_gender(tokens[0])
# Run the processing step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()