In [3]:
import numpy as np
import pandas as pd
import requests

from construct_dataset import get_publisher_metadata_parallel

In [4]:
df = pd.read_excel("data/aluminum/AL_CMU_benchmark.xlsx")
df.shape

(330, 67)

In [5]:
df = df[df["Pfeiffer_no"].notnull()]
df

Unnamed: 0,Ref,double-checked,name,AA,temper,size,dir,YS [MPa],UTS [Mpa],elong [%],...,Sr,V,Yb,Other properties / data,Microstructure [None / OM / SEM / EBSD / TEM / APT],doi,Pfeiffer_no,series,who added,Notes
0,Kolahgar2016,True,1.5,1100,H18,1.5,,180.2,191.3,4.8,...,,0.01,,,,10.1007/s11661-016-3375-0,4,1000,,
1,Kolahgar2016,True,2,1100,H18,2,,177.2,188.3,5.6,...,,0.01,,,,10.1007/s11661-016-3375-0,5,1000,,
2,Kolahgar2016,True,2.5,1100,H18,2.5,,165.3,175.3,6.2,...,,0.01,,,,10.1007/s11661-016-3375-0,6,1000,,
3,Abdulstaar2013,True,As-received,1050,,,,20.0,72.0,120.0,...,,0.01,,,EBSD,10.1016/j.msea.2012.12.046,7,1000,,
4,Fu2014,True,2A97,,T4,,,,443.5,18.5,...,,,,,"OM, XRD",10.1016/j.msea.2014.08.038,34,2000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Khan2017,True,AA7475-T761,7475,T761,,,430.0,468.0,13.0,...,,,,,,10.1016/j.jallcom.2016.11.389,1264,7000,,
326,Chakherlou2009,True,AL 7075-T6,7075,T6,,,503.0,600.0,,...,,,,,,10.1016/j.engfailanal.2008.03.002,1269,7000,,
327,Singh2004,True,0.22Sc,8090,T8,50mm,LT,397.0,460.0,5.4,...,,,,,,10.1016/j.scriptamat.2003.12.001,1274,8000,,
328,Singh2004,True,0.11Sc,8090,T8,50mm,LT,399.0,482.0,7.7,...,,,,,,10.1016/j.scriptamat.2003.12.001,1275,8000,,


In [6]:
len(df[df["double-checked"]==1.0]["doi"].unique())

154

In [7]:
meta_df = get_publisher_metadata_parallel(df["doi"].unique())

100%|██████████| 154/154 [00:00<00:00, 3135547.65it/s]


In [8]:
meta_df["publisher"].value_counts()

publisher
Elsevier BV                                132
Springer Science and Business Media LLC     22
Name: count, dtype: int64

In [9]:
joint_al_df = df.merge(meta_df, on="doi")

In [10]:
joint_al_df.to_csv("data/aluminum/al_data.csv", index=False)

In [11]:
joint_al_df[joint_al_df["publisher"] == "Springer Science and Business Media LLC"].shape

(35, 73)

In [12]:
joint_al_df.notnull().sum()

Ref                    328
double-checked         330
name                   323
AA                     226
temper                 251
                      ... 
journal                330
publisher              330
included_in_dataset    330
pdf                    330
xml                    330
Length: 73, dtype: int64

In [13]:
joint_al_df["Hardness location"].fillna("missing").value_counts()

Hardness location
missing    240
table       53
plot        29
text         8
Name: count, dtype: int64

In [14]:
joint_al_df["Has HT [True / False]"].value_counts()

Has HT [True / False]
True     213
False    117
Name: count, dtype: int64

In [15]:
joint_al_df["Has comp [True / False / nominal]"].value_counts()

Has comp [True / False / nominal]
True       256
nominal     36
False       35
Nominal      3
Name: count, dtype: int64

In [16]:
meta_df = pd.read_csv("data/aluminum/publisher_metadata.csv")

In [17]:
rng = np.random.default_rng(8675309)

xml_train_dois = rng.choice(meta_df[meta_df["xml"]]["doi"].unique(), 3)
pdf_train_dois = rng.choice(meta_df[meta_df["pdf"]]["doi"].unique(), 3)

In [18]:
xml_train_dois

array(['10.1016/j.scriptamat.2004.07.020',
       '10.1016/j.engfailanal.2010.08.007',
       '10.1016/j.jallcom.2013.08.214'], dtype=object)

In [19]:
pdf_train_dois

array(['10.1007/s11661-010-0395-z', '10.1007/s11661-008-9739-3',
       '10.1007/s11837-016-1896-z'], dtype=object)

In [20]:
meta_df["doi"].unique()

array(['10.1007/s11661-016-3375-0', '10.1016/j.msea.2012.12.046',
       '10.1016/j.msea.2014.08.038', '10.1016/j.jmatprotec.2017.04.024',
       '10.1007/s11661-013-1778-8', '10.1007/s11661-016-3807-x',
       '10.1016/j.msea.2008.03.017', '10.1016/j.matlet.2013.08.093',
       '10.1016/j.jmatprotec.2008.04.020', '10.1007/s11663-016-0611-3',
       '10.1016/j.msea.2016.12.087', '10.1007/s11661-010-0377-1',
       '10.1007/s10853-009-3735-x', '10.1016/j.matdes.2015.08.003',
       '10.1007/s11661-014-2207-3', '10.1016/j.matdes.2015.06.044',
       '10.1016/j.msea.2009.04.049', '10.1007/s11661-008-9739-3',
       '10.1016/j.msea.2011.08.015', '10.1016/j.msea.2012.07.064',
       '10.1016/j.msea.2010.12.045', '10.1016/j.scriptamat.2011.12.011',
       '10.1016/j.jmatprotec.2007.08.070',
       '10.1016/j.scriptamat.2006.04.013', '10.1016/j.matdes.2007.11.005',
       '10.1016/j.msea.2006.04.087', '10.1007/s11661-016-3332-y',
       '10.1016/j.matdes.2016.08.004', '10.1016/s0921-5093(00)0

In [23]:
(meta_df["pdf"] | meta_df["xml"]).sum()

np.int64(152)

In [24]:
meta_df["pdf"].sum()

np.int64(22)

In [25]:
meta_df["xml"].sum()

np.int64(151)

In [26]:
(meta_df["pdf"] & meta_df["xml"]).sum()

np.int64(21)

In [27]:
joint_al_df.shape

(330, 73)

In [29]:
from data.aluminum.constraints import columns_to_predict

joint_al_df[columns_to_predict].notnull().sum().sum()

np.int64(3806)

In [31]:
(joint_al_df[columns_to_predict].shape[0] * joint_al_df[columns_to_predict].shape[1])

12210