# Get and Clean the Data
***

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from random import randint
import re
from urllib.request import urlopen
import time

data_dir = "./data/"

## Scrape from PeriodicTable.com

In [None]:
# Scrape every isotope page linked from the periodictable.com "Known
# Isotopes" index into one raw table (all values kept as the page's text),
# then save it into data_dir, where the cleaning step reads it from.

# get the isotopes link page; the response is closed once it has been parsed
url = "https://periodictable.com/Properties/A/KnownIsotopes.html"
with urlopen(url) as html:
    soup = BeautifulSoup(html,'lxml')

columns = ["element","half_life","protons","neutrons","boson","spin","parity","atomic_weight",
           "abundance","mass_excess","binding_energy","magnetic_moment","quadrupole_moment"]

# one list per isotope, in `columns` order; the DataFrame is built once at the end
rows = []

# text of the most recent element link; isotope links that follow belong to it
element = ""

# loop through isotope links (only the first 507 anchors of the index table are used)
for a in soup.find("table", width=748).find_all("a")[:507]:
    # element link: its text does not start with a digit
    if not a.text[0].isdigit():
        element = a.text
        time.sleep(1)
    # isotope link: its text starts with a digit
    else:
        # be polite to the server: random 2-5 second pause before each request
        time.sleep(randint(2,5))

        # fetch and parse the isotope page (href minus its first 6 characters
        # is appended to the site root); kept in its own name so the index
        # page's soup is not overwritten
        url = "https://periodictable.com/" + a["href"][6:]
        with urlopen(url) as html:
            iso_soup = BeautifulSoup(html,'lxml')

        # get data table (second table with width 726 on the page)
        table = iso_soup.find_all("table", width=726)[1]

        # get half life
        half_life = table.find_all("td", valign="top", width=130)[1].text

        # parse column 2
        col2 = table.find_all("td", valign="top", width=200)
        # text before the first comma (later mapped from "Boson"/"Fermion")
        fb = col2[0].text.split(',')[0]

        # 2nd and 3rd whitespace-separated tokens, each minus its last character
        pn = col2[0].text.split()
        prot = pn[1][:-1]
        neut = pn[2][:-1]

        # 2nd and 4th tokens of the next cell
        sp = col2[1].text.split()
        spin = sp[1]
        parity = sp[3]

        # parse column 3
        col3 = table.find_all("td", align="left")

        # drop leading cells until one starts with a digit or is exactly "N/A"
        while not col3[0].text[0].isdigit() and col3[0].text != "N/A":
            del col3[0]

        weight = col3[0].text
        abund = col3[1].text
        massex = col3[2].text
        binden = col3[3].text
        mgnmom = col3[4].text
        quadmom = col3[5].text

        # add row
        rows.append([element,half_life,prot,neut,fb,spin,parity,weight,abund,massex,binden,mgnmom,quadmom])

df = pd.DataFrame(rows, columns=columns)

# save raw data next to the other data files (the cleaning step reads
# "./data/isotopes_raw.csv")
df.to_csv(data_dir + "isotopes_raw.csv", index=False)

## Clean the Data

In [None]:
# convert a half-life string such as "12.33 y" to a number of years
def convertHalfLife(x):
    """Return the half life `x` expressed in years.

    `x` is either the string "Stable" (returned as 1e24), a missing value
    (returned unchanged), or a string "<number> <unit>" where the number may
    be written as "<mantissa>×10<exponent>" and the unit is one of
    ns, µs, ms, s, m, h, d, y.
    """
    if x == "Stable":
        return 1e24

    if pd.isnull(x):
        return x

    # how many of each unit fit in one 365-day year
    hours_per_year = 365 * 24
    minutes_per_year = hours_per_year * 60
    seconds_per_year = minutes_per_year * 60

    # multiplier that turns a value in the given unit into years
    to_years = {
        "ns": 1 / (seconds_per_year * 1e9),
        "µs": 1 / (seconds_per_year * 1e6),
        "ms": 1 / (seconds_per_year * 1e3),
        "s": 1 / seconds_per_year,
        "m": 1 / minutes_per_year,
        "h": 1 / hours_per_year,
        "d": 1 / 365,
        "y": 1,
    }

    # tokens[0] is the number, tokens[1] the unit
    tokens = x.split()

    # "<mantissa>×10<exponent>" -> mantissa plus the part after '×'
    mantissa, *power = tokens[0].split('×')
    if power:
        # skip the leading "10" of the power to get the exponent digits
        magnitude = float(mantissa) * 10**float(power[0][2:])
    else:
        magnitude = float(mantissa)

    return magnitude * to_years[tokens[1]]
    
# convert half life to binary: 1 if x is at least thresh, else 0
def halflifeToBinary(x,thresh):
    """Return 1 when `x` is infinite or `float(x) >= thresh`, otherwise 0."""
    return int(x == np.inf or float(x) >= thresh)
    
def spinToFloat(x):
    """Parse a spin string ("2" or a fraction like "3/2") into a float.

    "Uncertain" and "?" are returned as NaN.
    """
    if x in ("Uncertain", "?"):
        return np.nan

    numerator, *denominator = x.split('/')
    if not denominator:
        # no slash: the whole string is the number
        return float(x)
    return float(numerator) / float(denominator[0])
        
def parityToInt(x):
    """Return `int(x)`, or NaN when `x` is "Uncertain" or "?"."""
    unknown = x == "Uncertain" or x == "?"
    return np.nan if unknown else int(x)
    
def abundanceToFloat(x):
    """Convert an abundance string to a fraction between 0 and 1.

    "None" gives 0, the literal "1×102%" gives 1, and any other value is
    read as "<number>%" and divided by 100.
    """
    if x == "None":
        return 0
    if x == "1×102%":
        return 1

    # drop the trailing '%' and rescale the percentage to a fraction
    percent = float(x[:-1])
    return percent / 100
    
def magneticToFloat(x):
    """Return `x` without its last two characters as a float.

    Missing values and "Uncertain" are returned as NaN.
    """
    missing = pd.isnull(x) or x == "Uncertain"
    if missing:
        return np.nan

    # the last two characters are stripped before parsing the number
    return float(x[:-2])
    
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
# load the raw scrape
df = pd.read_csv("./data/isotopes_raw.csv")

# keep only rows that have a half life, then renumber the index
df = df[df["half_life"].notnull()].reset_index(drop=True)

# Convert every column in one pass. The keyword arguments are applied in
# order, so the derived columns at the end see the already-converted values.
df = df.assign(
    # half-life string -> number via convertHalfLife (its factor table is in years)
    half_life=lambda d: d["half_life"].apply(convertHalfLife),

    # nucleon counts to numeric
    protons=lambda d: pd.to_numeric(d["protons"]),
    neutrons=lambda d: pd.to_numeric(d["neutrons"]),

    # "Boson"/"Fermion" -> 1/0
    boson=lambda d: d["boson"].map({"Boson": 1, "Fermion": 0}),

    # text fields to numbers
    spin=lambda d: d["spin"].apply(spinToFloat),
    parity=lambda d: d["parity"].apply(parityToInt),
    abundance=lambda d: d["abundance"].apply(abundanceToFloat),

    # in MeV: drop the 3-character suffix and parse the rest
    mass_excess=lambda d: d["mass_excess"].map(lambda s: float(s[:-3])),
    binding_energy=lambda d: d["binding_energy"].map(lambda s: float(s[:-3])),

    # in μ
    magnetic_moment=lambda d: d["magnetic_moment"].apply(magneticToFloat),

    # binary flags: 0 when the moment is missing (or, for magnetic, zero), else 1
    magnetic_bi=lambda d: np.where(
        d["magnetic_moment"].isnull() | d["magnetic_moment"].eq(0), 0, 1),
    quadrupole_bi=lambda d: np.where(d["quadrupole_moment"].isnull(), 0, 1),

    # binary target variables: half life at least 1e24, 1/365 and 1 respectively
    stable=lambda d: d["half_life"].apply(halflifeToBinary, thresh=1e24),
    stable_day=lambda d: d["half_life"].apply(halflifeToBinary, thresh=1/365),
    stable_year=lambda d: d["half_life"].apply(halflifeToBinary, thresh=1),
)

# save the cleaned table
df.to_csv("./data/isotopes_clean.csv", index=False)

## Nuclear Moment Data

In [None]:
import tabula

# page numbers handed to tabula (range end is exclusive: 10-151 and 15-169)
# NOTE(review): pages2001 is not used in this cell; presumably it belongs to
# a 2001 edition of the table. Confirm before removing.
pages2001 = list(range(10,152))
pages2014 = list(range(15,170))

# extract the tables found on the selected pages of the 2014 PDF and write
# them to a CSV file in data_dir
tabula.convert_into(data_dir + "nuclear_moments2014.pdf",
                    data_dir + "nuclear_moments2014.csv",
                    output_format="csv", pages=pages2014)

In [2]:
# load the CSV written from nuclear_moments2014.pdf
df = pd.read_csv(data_dir + "nuclear_moments2014.csv")

# drop rows in which every column is empty, then renumber the index
df = df.dropna(how="all").reset_index(drop=True)

# carry the last seen Nucleus / Ex / T1/2 values down into rows where they
# are blank; .ffill() replaces the deprecated fillna(method="ffill")
df[["Nucleus","Ex","T1/2"]] = df[["Nucleus","Ex","T1/2"]].ffill()
df

  df[["Nucleus","Ex","T1/2"]] = df[["Nucleus","Ex","T1/2"]].fillna(method="ffill")


Unnamed: 0,Nucleus,Ex,T1/2,I,μ(nm),Q(b),Unnamed: 6,[Ref. Std.],Method,NSR Keynumber,Journal Reference
0,0 n 1,0,10.6 m,1/2+,-1.9130427(5) d,,,,"N,R",2000Mo36,RMP 72 351 (00)
1,1 H 1,0,stable,1/2+,+2.79284734(3) d,,,,"M/N,R",2000Mo36,RMP 72 351 (00)
2,antiproton,0,-,1/2+,-2.7862(83),,,,HFS,2011Fr10,HFI 199 337 (11)
3,1 H 2,0,stable,1+,+0.857438228(9) d,,,[1H],"N,R",2000Mo36,RMP 72 351 (00)
4,1 H 2,0,stable,,+0.857438240(12) d,,,[1H],N,2005KA25,Can.J.Phys. 83 405 (05)
...,...,...,...,...,...,...,...,...,...,...,...
5559,99 Es 253,0,20.4 d,7/2+,+4.10(7),,,,AB/D,1975Go05,PR A11 499 (75)
5560,99 Es 253,0,20.4 d,,,6.7(8),R,,AB,1975Go05,PR A11 499 (75)
5561,99 Es 254,0,276 d,(7+),4.4(4),,,[253Es],NO,2009Se09,PR C79 064322 (09)
5562,99 Es 254,78,39.3 h,2+,2.90(7),,,[253Es],AB,1975Go05,PR A11 499 (75)


## Data

In [3]:
# preview the first 20 rows of the nuclear moments table
df.head(20)

Unnamed: 0,Nucleus,Ex,T1/2,I,μ(nm),Q(b),Unnamed: 6,[Ref. Std.],Method,NSR Keynumber,Journal Reference
0,0 n 1,0,10.6 m,1/2+,-1.9130427(5) d,,,,"N,R",2000Mo36,RMP 72 351 (00)
1,1 H 1,0,stable,1/2+,+2.79284734(3) d,,,,"M/N,R",2000Mo36,RMP 72 351 (00)
2,antiproton,0,-,1/2+,-2.7862(83),,,,HFS,2011Fr10,HFI 199 337 (11)
3,1 H 2,0,stable,1+,+0.857438228(9) d,,,[1H],"N,R",2000Mo36,RMP 72 351 (00)
4,1 H 2,0,stable,,+0.857438240(12) d,,,[1H],N,2005KA25,Can.J.Phys. 83 405 (05)
5,1 H 2,0,stable,,,+0.00286(2),R,,"MB,R",1979Bi14,PR A20 381 (79)
6,1 H 2,0,stable,,,0.0028(2),,,CIAN,1985Ka05,NP A435 502 (85)
7,1 H 3,0,12.33 y,1/2+,+2.97896244(4),,,[1H],"N,R",1977Ne16,ZETF 72 1659 (77)
8,2 He 3,0,stable,1/2+,-2.12749772(3),,,[1H],"N,R",2000Mo36,RMP 72 351 (00)
9,3 Li 6,0,stable,1+,+0.8220473(6),,,,AB/D,1974Be50,ZP 270 173 (74)


In [None]:
# print pandas' summary of the DataFrame (columns, non-null counts, dtypes)
df.info()