# Obtaining DOS data from DOSCAR

This notebook contains programs which parse the DOSCAR file from VASP's output line-by-line to extract both the total electronic density of states (DOS) and the orbital-projected density of states (PDOS). Additionally, the POSCAR file is also parsed in order to extract the element types and the number of each element in the calculation. An example DOS plot is provided for the carbon dioxide solid phase V at high pressure.

In [1]:
import re
import pandas as pd
import numpy as np
import time
import plotly.express as px
import plotly.graph_objects as go

In [2]:
"""
Function: _parse_line
    Do a regular expression search on a line of text in a file against all defined regexes and
    return the key and match result of the first matching regex
----------------
Parameters
    ------------
    line : str
        Single line that is read in from a file of interest
    rx_dict : dictionary
        Dictionary that defines a regular expression which will be searched for
        to provide a key and match.

"""

def _parse_line(line, rx_dict):
    

    # Search the line for an occurence of the regular expression defined by 'rx_dict'
    for key, rx in rx_dict.items():
        
        match = rx.search(line)
        
        if match:
            return key, match
    
    # if there are no matches
    return None, None



In [None]:
"""
Define dictionary of regular expression patterns to fine atomic species, and number of atoms of each.

This dictionary is used as a regex matcher in the 'get_ionnumber' function below.

In the POSCAR file format, two different lines contain the atom species and number of each species.
"""

atomic_dict = {'species': re.compile(r'([A-Z])\s+([A-Z])?\s+([A-Z])?'),
            'numbers': re.compile(r'(\d+)\s+(\d+)?\s+(\d+)')}

In [3]:
"""
Function: get_ionnumber
    Reads POSCAR file and returns the number of ions of each atomic species in the file.

Parameters
    ----------
    filepath : str
        Filepath for file_object to be parsed. **Must be POSCAR file or have POSCAR format**

Returns
    ----------
    atoms: list
        List of atoms in the system designated by their atomic symbol - i.e. ['C', 'O',...]
    atom_numbers: list
        The corresponding number of each atomic specie found in the 'atoms' list.
"""

def get_ionnumber(filepath):
    
    # Boolean variables that become True when the atomic species and number of atoms of each species
    # are obtained from the POSCAR file.
    species_grabbed = False
    numbers_grabbed = False
    with open(filepath, 'r') as file_object:
        
        # Skip first line so that the 'atomic_dict' regex does not return unwanted expressions.
        line = file_object.readline()
        line = file_object.readline()
        
        while numbers_grabbed == False:
            
            key, match = _parse_line(line, atomic_dict)
            
            # Store element/atom types.
            if key == 'species':
                atoms = match.group(0)
                atoms = atoms.split()
                #atom_species.append(match.group(0))
                species_grabbed = True
                #print(match)
            
            # Next line after atom type is number of each type. Store number of each atom type.
            if species_grabbed == True:
                line = file_object.readline()
                key, match = _parse_line(line,atomic_dict)
                if key == 'numbers':
                    print(match)
                    atom_numbers = match.group(0)
                    atom_numbers = atom_numbers.split()
                    atom_numbers = list(map(int, atom_numbers))
                    numbers_grabbed = True
            
            line = file_object.readline()
    
    
    return atoms, atom_numbers

In [None]:
"""
Define dictionary of regular expressions and relevant global variables. 

This dictionary is intended for parsing DOS data from the DOSCAR output file.
"""

# Regex dictionary for the total DOS data.
#'calc_info' identifies separate groups for EMAX, EMIN, NEDOS, and EFERMI
#'DOS_info' identifies separate groups for energy, DOS, and integrated DOS (IDOS)
tot_dict = {
    'calc_info': re.compile(r'(\d+\.\d+)\s+(-?\d+\.\d+)\s+(\d+)\s+(-?\d+\.\d+)\s+(1\.0+)'),
    'DOS_info': re.compile(r'(-?\d+\.\d+)\s+(\d+\.\d+E[+-]\d{2})\s+(\d+\.\d+E[+-]\d{2})')
}

# Regex dictionary for the orbital-projected DOS data.
#'pDOS_info' identifies separate groups for each lm-decomposed orbital (s, py, pz, etc.)
part_dict = {
    'pDOS_info': re.compile(r'(-?\d+\.\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)\s+(\d+\.\d+E[+-]\d+)')
    
}

In [16]:
"""
Function: parse_DOSCAR
    This function parses through a DOSCAR file obtained from a VASP calculation, and returns the total DOS and the 
    orbital-projected DOS data.

Parameters
    ----------
    filepath : str
        Filepath for file_object to be parsed

Returns
    ----------
    tot_data: pandas DataFrame
        Dataframe containing the total DOS data: energy, DOS, and integrated (IDOS)
        
    ion_pdos: pandas DataFrame
        Dataframe containing the summed orbital-projected DOS data: 
        energy, and DOS projected onto each lm-decomposed orbital (s,py,pz,px,dxy,dyz,dz2-r2,dxz,dx2-y2,ptot)
        where ptot is the sum of contributions of py,pz,px.
        
    info_data: list
        List of size 4 containing the calculation information:
        maximum and minimum energy in DOS calculation (EMAX and EMIN),
        number of data points NEDOS in the energy range set by EMAX and EMIN
        and fermi Energy EFERMI.

"""
def parse_DOSCAR(filepath, tot_dict, part_dict):
    
    import numpy as np
    import pandas as pd
    
    # Initialize empty lists to store data
    tot_data = []
    calc_info_data = []
    pdos_data = []
    
    # Open DOS file with 'filepath' variable,
    # read file line-by-line to match against defined regular expressions.
    # Store and return matched data total DOS data into a dataframe 'tot_data', and the 
    # energy boundaries, fermi energy of system into list 'calc_info_data'.
    #
    with open(filepath, 'r') as file_object:
        
        line = file_object.readline()
        
        NEDOS_count = 0.
        NEDOS = -1.
        
        while line:
            
            # Total density of states data parsing
            while NEDOS_count != NEDOS:
                
                key, match = _parse_line(line, tot_dict)
                
                #print(NEDOS_count)
                if key == 'calc_info':
                
                    EMAX = float(match.group(1))
                    EMIN = float(match.group(2))
                    NEDOS = float(match.group(3))
                    EFERMI = float(match.group(4))
                
                    calc_info_data = [EMAX, EMIN, NEDOS, EFERMI]
                    
                if key == 'DOS_info':
                    #print(match.group(0))
                    
                    # Store matches into dictionary for easy transformation into DataFrame.
                    tot_row = {'energy': match.group(1),
                           'DOS': match.group(2),
                           'IDOS': match.group(3)
                        }
                    tot_data.append(tot_row)
                    NEDOS_count += 1.
            
                line = file_object.readline()
                
            #line = file_object.readline()
            
            # Projected density of states data parsing
             
            """
            Sums up contribution from each ion to the orbital-projected DOS.
            """
            # Call get_ionnumber to get ion symbols and number of each.
            anames, anumbers = get_ionnumber('POSCAR_CO2_10242022')
            
            # Store total number of ions in system in variable 'TOTNIONS'.
            TOTNIONS = 0
            for element in anumbers:
                TOTNIONS += element
            
            # Initialize empty dictionary which will contain ion type and PDOS data for each ion type
            ion_pdos = {}
            
            for i in range(0,len(anumbers)):
                
                # Initialize counting variable for the PDOS of each ion.
                ion_count = 1
                NIONS = anumbers[i]
                
                while ion_count < NIONS+1:

                    # Reset NEDOS_count variable to 0 for each ion.
                    NEDOS_count = 0

                    # Reset temporary PDOS list to refill with data for each ion.
                    temp_pdos = []

                    # Iterate over the full specified energy range NEDOS for each ion.
                    while NEDOS_count != NEDOS:

                        # Search for matches to PDOS regular expression specified in 'part_dict' variable.
                        key, match = _parse_line(line, part_dict)

                        if key == 'pDOS_info':

                            # Store matches into dictionary for easy transformation into DataFrame.
                            temp_pdos_row = {'energy': match.group(1),
                                       's': match.group(2),
                                       'py': match.group(3),
                                       'pz': match.group(4),
                                       'px': match.group(5),
                                       'dxy': match.group(6),
                                       'dyz': match.group(7),
                                       'dz2_r2': match.group(8),
                                       'dxz': match.group(9),
                                       'dx2_y2': match.group(10),
                                }
                            temp_pdos.append(temp_pdos_row)

                            # After matching and storing PDOS data from line, increment NEDOS counter.
                            NEDOS_count += 1.

                        line = file_object.readline()

                    # After reading the PDOS from each ion, store into a DataFrame 'pdos_data'
                    # If on the first ion count, create a DataFrame, otherwise append the existing one.
                    if ion_count == 1:
                        pdos_data = pd.DataFrame(temp_pdos, dtype=float)
                    else:
                        temp_pdos = pd.DataFrame(temp_pdos, dtype=float)
                        pdos_data = pdos_data + temp_pdos
                        pdos_data.energy = temp_pdos.energy

                    # Increment ion counter
                    ion_count += 1
                
                # At end of loop for one species, store the PDOS DataFrame for the species in dictionary element
                # 'ion_pdos'
                # Subtract the fermi energy from the energy data to shift to 0 when plotting
                pdos_data.energy = pdos_data.energy - EFERMI
                pdos_data['ptot'] = pdos_data.py+pdos_data.pz+pdos_data.px
                ion_pdos[anames[i]] = pdos_data
            
    tot_data = pd.DataFrame(tot_data, dtype=float)
    tot_data.energy = tot_data.energy - EFERMI
    
    """Commented out portion would give TOTAL PDOS for all ions without separating by atomic species."""
    #pdos_data.energy = pdos_data.energy - EFERMI
    #pdos_data['ptot'] = pdos_data.py+pdos_data.pz+pdos_data.px
    """"""""""""
    # 'calc_info_data' contains EMAX, EMIN, NEDOS, EFERMI information.
    calc_info_data = np.array(calc_info_data)
    
    return tot_data, ion_pdos, calc_info_data



In [19]:
"""
Run functions

"""

if __name__ == '__main__':
    start = time.time()
    filepath = 'DOSCAR_CO2_10242022'
    TDOS_data, PDOS_data, INFO_data = parse_DOSCAR(filepath, tot_dict, part_dict)
    end = time.time()
    print('Time for calculation: ', end - start, 'seconds\n')

<re.Match object; span=(3, 11), match='C    O \n'>
<re.Match object; span=(5, 12), match='4     8'>
Time for calculation:  0.26360344886779785 seconds



In [20]:
# Full DOS as well as p and s orbital projections

fig = go.Figure()

"""
Total DOS from all atomic species in system
"""
fig.add_trace(go.Scatter(
    x=TDOS_data.energy,
    y=TDOS_data.DOS,
    name='total',
    marker=dict(color='#4E5672')
))


"""
p and s orbital PDOS for C ions
"""
fig.add_trace(go.Scatter(
    x=PDOS_data['C'].energy,
    y=PDOS_data['C'].ptot,
    name='C p-states',
    #marker=dict(color='#4E5672')
))

fig.add_trace(go.Scatter(
    x=PDOS_data['C'].energy,
    y=PDOS_data['C'].s,
    name='C s-states',
    #marker=dict(color='#4E5672')
))

"""
p and s orbital PDOS for O ions
"""
fig.add_trace(go.Scatter(
    x=PDOS_data['O'].energy,
    y=PDOS_data['O'].ptot,
    name='O p-states',
    #marker=dict(color='#4E5672')
))

fig.add_trace(go.Scatter(
    x=PDOS_data['O'].energy,
    y=PDOS_data['O'].s,
    name='O s-states',
    #marker=dict(color='#4E5672')
))

fig.update_layout(
    template='ggplot2',
    title=r"DOS of CO<sub>2</sub>-V at 40 GPa",
    xaxis_title="Energy (eV)",
    yaxis_title="DOS (1/eV)",
    title_x=0.5,
    title_y=0.9,
    font=dict(
        family="Garamond",
        size=18,
    )
)

fig.update_layout(legend=dict(
    yanchor="top",
    y=1.0,
    xanchor="right",
    x=0.99,
    bgcolor='rgba(0,0,0,0)'
))

fig.show()