In [16]:
import pandas as pd 
import numpy as np 

In [28]:
# Read each field's publication record and tag every row with the field it came from.
chemistry = pd.read_csv('Chemistry publication record.csv').assign(field="chemistry")
medicine = pd.read_csv('Medicine publication record.csv', encoding='latin1').assign(field="medicine")
physics = pd.read_csv('Physics publication record.csv', encoding='latin1').assign(field="physics")

# Stack the three tables into one; each table keeps its own row labels.
data = pd.concat([chemistry, medicine, physics])

In [48]:
# Save the combined table; the row index is written as the first CSV column (pandas default).
data.to_csv("nobel_basic.csv")

In [54]:
# Write one https://doi.org/ URL per distinct DOI to nobel_doi.txt.
# Rows without a DOI are dropped first; otherwise the missing value would be
# formatted into a bogus "https://doi.org/nan" line.
(
    data["DOI"]
    .dropna()
    .drop_duplicates()
    .apply(lambda x: f"https://doi.org/{x}")
    .to_csv("nobel_doi.txt", index=False)
)

In [122]:
# Load nobel_paper.csv; its "Affiliation" column feeds the institution counts below.
# NOTE(review): no cell visible here writes nobel_paper.csv — confirm how it is produced.
prize_paper = pd.read_csv("nobel_paper.csv")

In [123]:
# Number of prize_paper rows per affiliation, most frequent first.
prize_paper["Affiliation"].value_counts()

harvard university                                      32
stanford university                                     25
rockefeller university                                  21
max planck society                                      20
california institute of technology                      19
                                                        ..
imperial chemical industries                             1
new england biolabs                                      1
university of texas health science center at houston     1
university of gothenburg                                 1
university of leeds                                      1
Name: Affiliation, Length: 153, dtype: int64

In [124]:
# One row per institution (institution name as index) holding its number of
# rows in prize_paper. The count column is named explicitly because pandas >= 2.0
# names value_counts() results "count" while older versions reuse the source
# column name, and the following cell reads the column as "Affiliation".
institution_data = prize_paper["Affiliation"].value_counts().rename("Affiliation").to_frame()

In [125]:
# Total of the per-institution counts.
# Relies on the count column still being called "Affiliation"; a later cell renames it to "times".
institution_data["Affiliation"].sum()

536

In [None]:
def get_location(institutions=None):
    """Geocode institution names with Nominatim and write the hits to ``geo_data.json``.

    Args:
        institutions: Iterable of institution names to look up. Defaults to
            the index of the notebook-level ``institution_data`` frame.

    Each name is tried up to three times when the geocoder times out. Names
    that resolve are stored as dicts with the keys ``affiliation``,
    ``latitude``, ``longitude``, ``city`` and ``country``. Names that do not
    resolve, or that raise any other error, are reported with ``print`` and
    left out of the file.
    """
    from geopy.geocoders import Nominatim
    from geopy.exc import GeocoderTimedOut
    import json
    import time
    from tqdm import tqdm

    if institutions is None:
        institutions = institution_data.index

    geolocator = Nominatim(user_agent="nobel_viz", timeout=10)  # 10 s per request
    retries = 3

    geo_data = []
    for inst in tqdm(institutions, desc="Geocoding Institutions", unit="institution"):
        for attempt in range(retries):
            try:
                # addressdetails=True asks Nominatim for the "address" breakdown;
                # by default the response carries no "address" entry, which left
                # city and country empty for every institution.
                location = geolocator.geocode(inst, addressdetails=True)
                if location:
                    address = location.raw.get("address", {})
                    geo_data.append({
                        "affiliation": inst,
                        "latitude": location.latitude,
                        "longitude": location.longitude,
                        # Smaller places are reported as "town" or "village" rather than "city".
                        "city": address.get("city") or address.get("town") or address.get("village") or "",
                        "country": address.get("country", "")
                    })
                else:
                    print(f"No location found for {inst}")
                break  # Lookup finished (hit or miss), move to next institution
            except GeocoderTimedOut as e:
                print(f"Timeout for {inst}, attempt {attempt + 1}/{retries}: {e}")
                if attempt < retries - 1:
                    time.sleep(2)  # Wait before retrying
                else:
                    print(f"Failed to geocode {inst} after {retries} attempts")
            except Exception as e:
                # Best effort: report the problem and carry on with the next name.
                print(f"Error for {inst}: {e}")
                break
        # Pause after every institution. Inside the retry loop this sleep was
        # skipped by the `break` statements, so successful lookups were never throttled.
        time.sleep(1)  # Respect rate limit

    with open("geo_data.json", "w", encoding="utf-8") as f:
        json.dump(geo_data, f, indent=2)

# get_location()

Geocoding Institutions:  19%|█▉        | 29/153 [00:30<02:01,  1.02institution/s]

No location found for carnegie institution for science


Geocoding Institutions:  31%|███       | 47/153 [00:48<01:46,  1.00s/institution]

No location found for catholic university of leuven


Geocoding Institutions:  41%|████      | 63/153 [01:04<01:37,  1.09s/institution]

No location found for french institute of health and medical research


Geocoding Institutions:  59%|█████▉    | 90/153 [01:35<01:12,  1.15s/institution]

No location found for atomic energy of canada limited


Geocoding Institutions:  61%|██████▏   | 94/153 [01:39<01:05,  1.12s/institution]

No location found for french academy of sciences


Geocoding Institutions:  76%|███████▋  | 117/153 [02:02<00:39,  1.10s/institution]

No location found for german cancer research center


Geocoding Institutions:  78%|███████▊  | 119/153 [02:04<00:35,  1.03s/institution]

No location found for lister institute of preventive medicine


Geocoding Institutions:  88%|████████▊ | 134/153 [02:19<00:21,  1.12s/institution]

No location found for agency for science technology and research


Geocoding Institutions:  91%|█████████ | 139/153 [02:24<00:15,  1.09s/institution]

No location found for walter and eliza hall institute of medical research


Geocoding Institutions:  96%|█████████▌| 147/153 [02:32<00:06,  1.01s/institution]

No location found for basel institute for immunology


Geocoding Institutions:  99%|█████████▊| 151/153 [02:36<00:02,  1.18s/institution]

No location found for university of texas health science center at houston


Geocoding Institutions: 100%|██████████| 153/153 [02:38<00:00,  1.03s/institution]


In [75]:
# Rename the single count column to "times" (changes institution_data in place), then display it.
institution_data.columns = ["times"]
institution_data

Unnamed: 0,times
harvard university,32
stanford university,25
rockefeller university,21
max planck society,20
california institute of technology,19
...,...
imperial chemical industries,1
new england biolabs,1
university of texas health science center at houston,1
university of gothenburg,1


In [None]:
def get_times():
    """Attach each institution's count to its geocoded record and save ``times.json``.

    Reads ``geo_data.json``, adds a ``times`` field to every record by looking
    the record's ``affiliation`` up in the notebook-level ``institution_data``
    frame (0 when the affiliation is not in the frame), and writes the
    enriched records to ``times.json``.
    """
    import json

    # affiliation name -> value of the "times" column
    counts_by_affiliation = {
        name: count for name, count in institution_data['times'].items()
    }

    with open('geo_data.json', 'r', encoding='utf-8') as src:
        records = json.load(src)

    # Merge the counts into the geocoded records; unknown affiliations get 0.
    for record in records:
        record['times'] = counts_by_affiliation.get(record['affiliation'], 0)

    # Save the result as times.json.
    with open('times.json', 'w', encoding='utf-8') as dst:
        json.dump(records, dst, indent=2, ensure_ascii=False)

    print("Integrated DataFrame times data into times.json")
    
# get_times()

In [129]:
# Number of rows in data per laureate name, most frequent first.
data["Laureate name"].value_counts()

olah, g            1853
brown, h           1630
corey, e           1401
schally, av        1313
barton, d          1278
                   ... 
victor grignard       1
von euler, h          1
heisenberg, w         1
victor f. hess        1
loewi, o              1
Name: Laureate name, Length: 543, dtype: int64

In [130]:
# Number of rows in data per affiliation, most frequent first.
data["Affiliation"].value_counts()

harvard university                      4773
max planck society                      3950
stanford university                     3196
california institute of technology      2646
university of southern california       2137
                                        ... 
the catholic university of america         1
west virginia university                   1
public health laboratory                   1
mount sinai hospital                       1
city of hope national medical center       1
Name: Affiliation, Length: 453, dtype: int64

In [131]:
# Display the combined table.
data

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field
0,20001,"stoddart, j",2016,a molecular shuttle,1991.0,1.976039e+09,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,chemistry
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993.0,1.963538e+09,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,chemistry
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981.0,1.963552e+09,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,chemistry
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005.0,2.095637e+09,10.1002/smll.200400070,small,northwestern university,NO,chemistry
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974.0,2.095679e+09,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,chemistry
...,...,...,...,...,...,...,...,...,...,...,...
21499,10193,"zeeman, p",1902,The Effect of Magnetisation on the Nature of L...,1897.0,,,,,YES,physics
21500,10193,"zeeman, p",1902,The Influence of a Magnetic Field on Radiation...,1896.0,,,,,YES,physics
21501,10193,"zeeman, p",1902,zur hyperfeinstruktur des wismuts,1930.0,2.058955e+09,10.1007/BF01397521,european physical journal,,NO,physics
21502,10193,"zeeman, p",1902,on the spectrum of ionised potassium in connex...,1924.0,2.013742e+09,10.1038/114352a0,nature,university of amsterdam,NO,physics


In [157]:

# Drop rows without a publication year; they cannot be ordered chronologically.
df = data.dropna(subset=['Pub year'])

# One output record per (laureate, field, prize year); "times" is that group's paper count.
group_keys = ['Laureate name', 'field', 'Prize year']
scientist_data = df.groupby(group_keys).size().reset_index(name='times')

# Number every group's papers 1..n in publication order. The sort is stable so
# papers from the same year keep their original row order, and the numbering
# uses the same keys as "times" so a rank can never exceed its record's count.
# The index is reset because the concatenated table repeats row labels.
ordered = df.sort_values('Pub year', kind='stable').reset_index(drop=True)
paper_rank = ordered.groupby(group_keys).cumcount() + 1

# prize_paper_ranks: positions of the prize-winning papers within each group.
prize_rows = ordered.assign(paper_rank=paper_rank)[ordered['Is prize-winning paper'] == 'YES']
ranks_by_group = prize_rows.groupby(group_keys)['paper_rank'].agg(list).to_dict()

# Groups without a prize-winning paper get an empty list.
scientist_data['prize_paper_ranks'] = [
    ranks_by_group.get(key, [])
    for key in zip(*(scientist_data[col] for col in group_keys))
]

# Save as scientist_data.json.
scientist_data.to_json('scientist_data.json', orient='records', force_ascii=False)
print("scientist_data.json 已生成！")

scientist_data.json 已生成！


In [133]:
# Count papers per laureate, field and publication year.
timeline_data = (
    data
    .groupby(['Laureate name', 'field', 'Pub year'])
    .size()
    .reset_index(name='times')
)

# Save as scientist_timeline.json.
timeline_data.to_json(
    'scientist_timeline.json',
    orient='records',
    force_ascii=False,
)

In [160]:
# Spot check: rows flagged as prize-winning for one laureate, ordered by publication year (at most 20 rows).
data[(data["Is prize-winning paper"] == "YES") & (data["Laureate name"] == "schekman, r")].sort_values("Pub year").head(20)

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field
2165,30009,"schekman, r",2013,secretion and cell surface growth are blocked ...,1979.0,1966447000.0,10.1073/pnas.76.4.1858,proceedings of the national academy of science...,,YES,medicine
2164,30009,"schekman, r",2013,identification of 23 complementation groups re...,1980.0,2059775000.0,10.1016/0092-8674(80)90128-2,cell,,YES,medicine


In [145]:
# Display the combined table again.
data

Unnamed: 0,Laureate ID,Laureate name,Prize year,Title,Pub year,Paper ID,DOI,Journal,Affiliation,Is prize-winning paper,field
0,20001,"stoddart, j",2016,a molecular shuttle,1991.0,1.976039e+09,10.1021/ja00013a096,journal of the american chemical society,northwestern university,YES,chemistry
1,20001,"stoddart, j",2016,chemical synthesis of nanostructures,1993.0,1.963538e+09,10.1557/PROC-330-57,mrs proceedings,northwestern university,NO,chemistry
2,20001,"stoddart, j",2016,formation and x ray crystal structure of pt h2...,1981.0,1.963552e+09,10.1039/C39810000851,journal of the chemical society chemical commu...,northwestern university,NO,chemistry
3,20001,"stoddart, j",2016,single walled carbon nanotubes under the influ...,2005.0,2.095637e+09,10.1002/smll.200400070,small,northwestern university,NO,chemistry
4,20001,"stoddart, j",2016,synthesis of medium heterocyclic rings from 6 ...,1974.0,2.095679e+09,10.1016/S0008-6215(00)82105-9,carbohydrate research,northwestern university,NO,chemistry
...,...,...,...,...,...,...,...,...,...,...,...
21499,10193,"zeeman, p",1902,The Effect of Magnetisation on the Nature of L...,1897.0,,,,,YES,physics
21500,10193,"zeeman, p",1902,The Influence of a Magnetic Field on Radiation...,1896.0,,,,,YES,physics
21501,10193,"zeeman, p",1902,zur hyperfeinstruktur des wismuts,1930.0,2.058955e+09,10.1007/BF01397521,european physical journal,,NO,physics
21502,10193,"zeeman, p",1902,on the spectrum of ionised potassium in connex...,1924.0,2.013742e+09,10.1038/114352a0,nature,university of amsterdam,NO,physics
