In [31]:
import numpy as np
import pandas as pd

from construct_dataset import get_publisher_metadata_parallel, INCLUDED_PUBLISHERS
from data.zeolite.constraints import train_set_dois


## Initial EDA

Starting with EDA of the original dataset, and getting publisher statistics for it.

In [32]:
df = pd.read_csv("data/zeolite/ge_synthesis_data.csv", encoding="latin")

In [3]:
df["From?"].value_counts()

From?
Supp     795
Table    603
Text     240
Name: count, dtype: int64

In [4]:
publisher_meta = get_publisher_metadata_parallel(df[df["From?"].isin(["Table", "Text"])]["doi"].unique())

100%|██████████| 104/104 [00:00<00:00, 2536090.79it/s]


In [5]:
publisher_meta["publisher"].value_counts()

publisher
American Chemical Society (ACS)            29
Elsevier BV                                29
Royal Society of Chemistry (RSC)           20
Wiley                                      16
Springer Science and Business Media LLC    10
Name: count, dtype: int64

In [6]:

joined_df = df.merge(publisher_meta, on="doi")
joined_df

Unnamed: 0.1,Unnamed: 0,doi,Si,Ge,Al,OH,H2O,HF,SDA,B,...,FD1,MR1,FD2,MR2,article_type,journal,publisher,included_in_dataset,pdf,xml
0,0,10.1021/acs.cgd.8b00078,1,1.499000,0.0,0.000,19.99200,1.250,1.250,0.0,...,,,,,journal_article,Crystal Growth & Design,American Chemical Society (ACS),False,False,False
1,1,10.1021/acs.cgd.8b00078,1,0.667000,0.0,0.000,13.33600,0.834,0.834,0.0,...,18.0,10.0,,,journal_article,Crystal Growth & Design,American Chemical Society (ACS),False,False,False
2,2,10.1021/acs.cgd.8b00078,1,0.333000,0.0,0.000,10.66400,0.666,0.666,0.0,...,18.0,10.0,,,journal_article,Crystal Growth & Design,American Chemical Society (ACS),False,False,False
3,3,10.1021/acs.cgd.8b00078,1,0.250000,0.0,0.000,10.00000,0.625,0.625,0.0,...,18.0,10.0,,,journal_article,Crystal Growth & Design,American Chemical Society (ACS),False,False,False
4,4,10.1021/acs.cgd.8b00078,1,0.111000,0.0,0.000,8.88800,0.555,0.555,0.0,...,19.7,10.0,,,journal_article,Crystal Growth & Design,American Chemical Society (ACS),False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028,1633,10.1039/c6dt04688f,1,50.000000,0.0,0.750,153.00000,0.750,0.750,0.0,...,15.1,12.0,16.7,10.0,journal_article,Dalton Transactions,Royal Society of Chemistry (RSC),False,False,False
1029,1634,10.1039/c6dt04688f,1,50.000000,0.0,0.750,153.00000,0.750,0.750,0.0,...,15.1,12.0,16.7,10.0,journal_article,Dalton Transactions,Royal Society of Chemistry (RSC),False,False,False
1030,1635,10.1039/c6dt04688f,1,50.000000,0.0,0.750,153.00000,0.750,0.750,0.0,...,,,,,journal_article,Dalton Transactions,Royal Society of Chemistry (RSC),False,False,False
1031,1636,10.1039/c7ta02344h,1,0.500000,0.0,0.758,5.30303,0.000,0.379,0.0,...,16.6,12.0,,,journal_article,Journal of Materials Chemistry A,Royal Society of Chemistry (RSC),False,False,False


In [7]:
joined_df[(joined_df["publisher"].isin(INCLUDED_PUBLISHERS)) & joined_df["From?"].isin(["Table", "Text"])]

Unnamed: 0.1,Unnamed: 0,doi,Si,Ge,Al,OH,H2O,HF,SDA,B,...,FD1,MR1,FD2,MR2,article_type,journal,publisher,included_in_dataset,pdf,xml
45,45,10.1016/j.micromeso.2017.03.033,1,0.05,0.020000,0.125,0.000000,0.00,0.250,0.0,...,,,,,journal_article,Microporous and Mesoporous Materials,Elsevier BV,True,False,False
46,46,10.1016/j.micromeso.2017.03.033,1,0.05,0.020000,0.125,0.000000,0.16,0.250,0.0,...,17.4,10.0,17.1,10.0,journal_article,Microporous and Mesoporous Materials,Elsevier BV,True,False,False
47,47,10.1016/j.micromeso.2017.03.033,1,0.05,0.020000,0.125,0.000000,0.32,0.250,0.0,...,17.4,10.0,17.1,10.0,journal_article,Microporous and Mesoporous Materials,Elsevier BV,True,False,False
48,48,10.1016/j.micromeso.2017.03.033,1,0.05,0.020000,0.125,0.000000,0.56,0.250,0.0,...,17.4,10.0,,,journal_article,Microporous and Mesoporous Materials,Elsevier BV,True,False,False
49,49,10.1016/j.micromeso.2017.03.033,1,0.05,0.020000,0.125,0.000000,1.28,0.250,0.0,...,17.4,10.0,,,journal_article,Microporous and Mesoporous Materials,Elsevier BV,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,1535,10.1038/nature07957,1,1.00,0.000000,1.000,6.000000,1.00,0.500,0.0,...,10.5,30.0,,,journal_article,Nature,Springer Science and Business Media LLC,True,False,False
931,1536,10.1038/nature07957,1,1.00,0.020000,1.000,6.000000,1.00,0.500,0.0,...,10.5,30.0,,,journal_article,Nature,Springer Science and Business Media LLC,True,False,False
932,1537,10.1038/nchem.2761,1,0.50,0.000000,0.500,47.500000,0.00,0.500,0.0,...,15.6,14.0,,,journal_article,Nature Chemistry,Springer Science and Business Media LLC,True,False,False
933,1538,10.1038/nmat921,1,0.50,0.000000,0.756,5.303030,0.00,0.379,0.0,...,16.6,12.0,,,journal_article,Nature Materials,Springer Science and Business Media LLC,True,False,False


From the columns, deciding which should be predicted

In [8]:
joined_df.columns

Index(['Unnamed: 0', 'doi', 'Si', 'Ge', 'Al', 'OH', 'H2O', 'HF', 'SDA', 'B',
       'Time', 'Temp', 'SDA Type', 'SMILES', 'SDA_Vol', 'SDA_SA', 'SDA_KFI',
       'From?', 'Extracted', 'Zeo1', 'Zeo2', 'Dense1', 'Dense2', 'Am', 'Other',
       'ITQ', 'FD1', 'MR1', 'FD2', 'MR2', 'article_type', 'journal',
       'publisher', 'included_in_dataset', 'pdf', 'xml'],
      dtype='object')

In [9]:
meta_columns = ["doi", "From?", "article_type", "journal", "publisher"]
columns_to_predict = ['Si', 'Ge', 'Al', 'OH', 'H2O', 'HF', 'SDA', 'B', 'Time', 'Temp', 'SDA Type', "Extracted"]

In [10]:
final_df = joined_df[(joined_df["publisher"].isin(INCLUDED_PUBLISHERS)) & joined_df["From?"].isin(["Table", "Text"])][meta_columns + columns_to_predict]
final_df

Unnamed: 0,doi,From?,article_type,journal,publisher,Si,Ge,Al,OH,H2O,HF,SDA,B,Time,Temp,SDA Type,Extracted
45,10.1016/j.micromeso.2017.03.033,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.05,0.020000,0.125,0.000000,0.00,0.250,0.0,72.0,179.85,hexamethonium,Amorphous
46,10.1016/j.micromeso.2017.03.033,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.05,0.020000,0.125,0.000000,0.16,0.250,0.0,72.0,179.85,hexamethonium,ITQ-13+EU-1+amorphous
47,10.1016/j.micromeso.2017.03.033,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.05,0.020000,0.125,0.000000,0.32,0.250,0.0,72.0,179.85,hexamethonium,ITQ-13+EU-1
48,10.1016/j.micromeso.2017.03.033,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.05,0.020000,0.125,0.000000,0.56,0.250,0.0,72.0,179.85,hexamethonium,ITQ-13
49,10.1016/j.micromeso.2017.03.033,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.05,0.020000,0.125,0.000000,1.28,0.250,0.0,72.0,179.85,hexamethonium,ITQ-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,10.1038/nature07957,Text,journal_article,Nature,Springer Science and Business Media LLC,1,1.00,0.000000,1.000,6.000000,1.00,0.500,0.0,24.0,175.00,diquaternary ammonium,ITQ-37
931,10.1038/nature07957,Text,journal_article,Nature,Springer Science and Business Media LLC,1,1.00,0.020000,1.000,6.000000,1.00,0.500,0.0,24.0,175.00,diquaternary ammonium,ITQ-37
932,10.1038/nchem.2761,Text,journal_article,Nature Chemistry,Springer Science and Business Media LLC,1,0.50,0.000000,0.500,47.500000,0.00,0.500,0.0,312.0,175.00,"(6R,10S)-6,10-dimethyl-5-azoniaspiro[4,5]decan...",UTL
933,10.1038/nmat921,Text,journal_article,Nature Materials,Springer Science and Business Media LLC,1,0.50,0.000000,0.756,5.303030,0.00,0.379,0.0,168.0,175.00,1-methyl-4-phenylpyridinium hydroxide,ITQ-22


## Descriptive statistics for the dataset/sampling for training sets

In [11]:
final_df["publisher"].value_counts()

publisher
Elsevier BV                                350
Wiley                                       32
Springer Science and Business Media LLC     32
Name: count, dtype: int64

In [12]:
final_df["From?"].value_counts()

From?
Table    358
Text      56
Name: count, dtype: int64

In [13]:
len(final_df["doi"].unique())

55

In [14]:
existing_train_dois = ["10.1039/c7dt03751a", "10.1039/c5ce02312b", "10.1016/j.micromeso.2006.10.023"]
for doi in existing_train_dois:
    if doi in set(final_df["doi"].values):
        print(f"{doi} can be used")
    else:
        print(f"{doi} is missing from scraped set!")

10.1039/c7dt03751a is missing from scraped set!
10.1039/c5ce02312b is missing from scraped set!
10.1016/j.micromeso.2006.10.023 can be used


In [15]:
meta_df = pd.read_csv("data/zeolite/publisher_metadata.csv")

In [16]:
rng = np.random.default_rng(8675309)

xml_train_dois = rng.choice(meta_df[meta_df["xml"]]["doi"].unique(), 2)
pdf_train_dois = rng.choice(meta_df[meta_df["pdf"]]["doi"].unique(), 2)

print(xml_train_dois)
print(pdf_train_dois)

['10.1016/j.solidstatesciences.2007.08.002' '10.1007/s10934-015-0051-5']
['10.1002/anie.200461911' '10.1007/s11244-013-0170-7']


In [19]:
final_df[final_df["doi"].isin(train_set_dois)]

Unnamed: 0,doi,From?,article_type,journal,publisher,Si,Ge,Al,OH,H2O,HF,SDA,B,Time,Temp,SDA Type,Extracted
404,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,0.0,0.0,0.0,0.0,672.0,179.85,,SOD
405,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,0.2,0.0,0.0,0.0,504.0,179.85,,SOD
406,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,0.5,0.0,0.0,0.0,504.0,179.85,,SOD
407,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,1.0,0.0,0.0,0.0,504.0,179.85,,SOD
408,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,2.0,0.0,0.0,0.0,24.0,179.85,,Quartz
409,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.0,0.0,1.5,3.5,0.0,0.0,0.0,24.0,179.85,,Quartz
410,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.25,0.0,1.875,0.0,0.0,0.0,0.0,336.0,179.85,,SOD
411,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.493,0.0,2.239,0.0,0.0,0.0,0.0,168.0,179.85,,SOD
412,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,1.0,0.0,3.0,0.0,0.0,0.0,0.0,168.0,179.85,,SOD
413,10.1016/j.micromeso.2006.10.023,Table,journal_article,Microporous and Mesoporous Materials,Elsevier BV,1,0.493,0.0,2.239,5.224,0.0,0.0,0.0,24.0,179.85,,Quartz


In [20]:
meta_df[["pdf", "xml"]].sum()

pdf    22
xml    39
dtype: int64

In [22]:
(meta_df["pdf"] & meta_df["xml"]).sum()

np.int64(10)

In [25]:
final_df.shape

(414, 17)

In [26]:
from data.zeolite.constraints import columns_to_predict

In [29]:
final_df[columns_to_predict].notnull().sum().sum()

np.int64(4950)

In [30]:
414 * 12

4968