This first notebook focuses on scraping the web data (using BeautifulSoup) and formatting it as CSV files for later use.

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the article page and parse it so its data tables can be extracted below.
url = "http://iopscience.iop.org/article/10.1088/1367-2630/18/1/013003/meta"
resp = requests.get(url, timeout=30)  # bound the wait instead of hanging indefinitely
resp.raise_for_status()  # stop on a 4xx/5xx response rather than parsing an error page
html = resp.text
soup = BeautifulSoup(html, "lxml")

In [3]:
# Pull the first <table> element out of the parsed page.
table_1 = soup.find_all('table')[0]

# read_html returns a list of frames; concat collapses it into one DataFrame.
# The markup is wrapped in StringIO because passing literal HTML is deprecated
# in recent pandas releases.
df_1 = pd.concat(pd.read_html(StringIO(str(table_1))))

# Give the three unlabeled header cells meaningful names.
df_1 = df_1.rename(columns={"Unnamed: 0": "Molecule", "Unnamed: 1": "U Level", "Unnamed: 2": "Calc Type"})

# Copies the type of molecule to all the other entries: the first half of the
# rows is labelled MPc and the second half F16MPc.
# NOTE(review): assumes both molecules occupy equally sized, contiguous halves
# of the table -- confirm against the article.
# The whole column is replaced (rather than written through .iloc) so the
# assignment works whatever dtype the scraped column happened to have.
df_1[df_1.columns[0]] = np.where(np.arange(len(df_1)) < len(df_1) // 2, 'MPc', 'F16MPc')

# Drops the 'U = ' prefix in favor of the integer U value for later.
# Rows whose U cell is empty inherit the value of the row above (forward fill).
u_level = df_1.iloc[:, 1].ffill()
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the prefix in place and break to_numeric.
u_level = u_level.str.replace('U\xa0=\xa0*', '', regex=True)
# Assign by column label so the column takes the numeric dtype; writing through
# .iloc[:, 1] sets values in place on recent pandas and keeps object dtype.
df_1[df_1.columns[1]] = pd.to_numeric(u_level)

# Write to CSV
df_1.to_csv('magnetic_moment_unformatted.csv')

df_1

Unnamed: 0,Molecule,U Level,Calc Type,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ag
0,MPc,0,total,0.8,2.9,2.1,2.3,5.3,5.9,1.6,1.5,2.2,0.4,1.6
1,MPc,0,metal,0.1,0.8,2.9,4.0,4.6,4.1,1.3,0.1,0.5,0.0,0.2
2,MPc,4,total,0.8,3.9,3.2,3.7,5.3,5.9,4.6,3.2,1.8,0.4,1.9
3,MPc,4,metal,0.0,1.1,3.0,4.2,4.8,4.2,2.7,1.6,0.5,0.0,0.2
4,MPc,8,total,0.8,3.8,3.2,4.1,5.3,5.8,4.4,2.9,1.6,0.4,2.1
5,MPc,8,metal,0.0,1.1,3.0,4.2,4.9,4.6,2.8,1.7,0.5,0.0,0.2
6,F16MPc,0,total,0.8,1.6,1.2,4.0,4.7,4.1,1.0,0.0,1.1,0.0,0.7
7,F16MPc,0,metal,0.0,1.5,2.4,4.0,4.7,4.1,1.1,0.0,0.5,0.0,0.2
8,F16MPc,4,total,0.8,1.5,3.2,4.0,4.9,3.4,2.9,1.9,1.1,0.0,0.8
9,F16MPc,4,metal,0.0,1.5,2.6,4.1,5.0,4.2,2.6,1.5,0.5,0.0,0.3


In [None]:
# Second data table, but won't end up using in this analysis.
table_2 = soup.find_all('table')[1]

# StringIO wrapper: passing literal HTML to read_html is deprecated in recent pandas.
df_2 = pd.concat(pd.read_html(StringIO(str(table_2))))
df_2 = df_2.rename(columns={"Unnamed: 0": "Molecule", "Unnamed: 1": "U Level"})

# First half of the rows is labelled MPc, second half F16MPc.
# NOTE(review): assumes both molecules occupy equally sized halves -- confirm.
df_2[df_2.columns[0]] = np.where(np.arange(len(df_2)) < len(df_2) // 2, 'MPc', 'F16MPc')

# Strip the 'U = ' prefix and convert to a number. regex=True is explicit because
# pandas >= 2.0 defaults to literal matching; the column is assigned by label so
# it actually takes the numeric dtype instead of staying object.
df_2[df_2.columns[1]] = pd.to_numeric(
    df_2.iloc[:, 1].str.replace('U\xa0=\xa0*', '', regex=True)
)

df_2

As discussed in the report, I'm feature engineering the electron occupancy (the d-electron count and spin state of each metal) rather than leaving the metals as an ordinal class.

In [4]:
# The actual spin state of each metal, assuming a +2 oxidation state and
# filling according to the energy levels given by a D4h symmetric molecule.
d_spin = dict(
    Sc=0, Ti=0.5, V=0, Cr=1, Mn=1.5, Fe=1.0,
    Co=0.5, Ni=0, Cu=0.5, Zn=0, Ag=0.5,
)

In [5]:
# Number of electrons in the d orbital for each metal.
# Note this does not account for oxidation state.
# NOTE(review): Cr is entered as 5 while Cu and Ag are entered as 9 -- confirm
# that all entries follow the same electron-counting convention.
d_occupancy = dict(
    Sc=1, Ti=2, V=3, Cr=5, Mn=5, Fe=6,
    Co=7, Ni=8, Cu=9, Zn=10, Ag=9,
)

In [14]:
# Scratch cell: reset the accumulators and build the placeholder column that the
# reshaping cell further down scales per metal.
magnetic_data = pd.DataFrame()
df = []

# One row of 1s per row of df_1. The frame does not depend on the metal, so it
# is built once instead of being rebuilt identically for every key of d_occupancy.
df_d_electrons = pd.DataFrame({'Total D Electrons': [1] * len(df_1)})


In [15]:
# Display the placeholder frame built in the previous cell (every entry is still 1).
df_d_electrons

Unnamed: 0,Total D Electrons
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [None]:
# Need to reformat the data table so the magnetic moments are the target value:
# the per-metal columns of df_1 are stacked into a single "Magnetic Moment"
# column, and every row is tagged with that metal's d-electron count and spin state.
metals = list(d_occupancy)

# Fail with a clear message if the two lookup tables disagree, instead of
# silently producing NaN spin states.
missing_spin = [metal for metal in metals if metal not in d_spin]
if missing_spin:
    raise KeyError("d_spin has no entry for: {}".format(missing_spin))

# melt stacks the metal columns in the order given by `metals`, repeating the
# first three columns (Molecule, U Level, Calc Type) for each metal. This
# replaces growing the frame with concat inside a loop.
magnetic_data = df_1.melt(
    id_vars=list(df_1.columns[:3]),
    value_vars=metals,
    var_name="Metal",
    value_name="Magnetic Moment",
)

# Look up the engineered features for each row's metal.
magnetic_data["Total D Electrons"] = magnetic_data["Metal"].map(d_occupancy)
magnetic_data["D Spin State"] = magnetic_data["Metal"].map(d_spin)

# The metal label itself is not kept as a feature; it is only needed for the lookups.
magnetic_data = magnetic_data.drop(columns="Metal")

In [None]:
# Save the reshaped table with the class labels left as plain strings
# (the one-hot encoded version is written in the next cell).
# to_csv is called with its defaults, so the row index is written as the first column.
magnetic_data.to_csv('magnetic_moment_formatted.csv')

# %store persists the frame in IPython's storage so another notebook can
# restore it with `%store -r magnetic_data`.
%store magnetic_data
magnetic_data

In [None]:
# One Hot Encoded for Regression Analysis Later
magnetic_data_ohe = pd.get_dummies(magnetic_data)
magnetic_data_ohe.to_csv('magnetic_moment_ohe_formatted.csv')

%store magnetic_data_ohe
magnetic_data_ohe