In [None]:
from mpcontribs.client import Client
import gzip, json, os
import numpy as np
from pandas import DataFrame
from collections import defaultdict
from tqdm.notebook import tqdm
from unflatten import unflatten
from pathlib import Path

# MPContribs project identifier; used below as the `project` of every
# contribution and as the key when looking up existing identifiers.
name = 'carrier_transport'

### Configure project

In [None]:
# Free-text project description; passed as `description` in the
# (commented-out) project update at the end of this cell.
description = """
Ab-initio electronic transport database for inorganic materials. Complex multivariable BoltzTraP
simulation data is condensed down into tabular form of two main motifs: average eigenvalues at set
moderate carrier concentrations and temperatures, and optimal values among all carrier concentrations
and temperatures within certain ranges. Here are reported the average of the eigenvalues of conductivity
effective mass (mₑᶜᵒⁿᵈ), the Seebeck coefficient (S), the conductivity (σ), the electronic thermal
conductivity (κₑ), and the Power Factor (PF) at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K
for n- and p-type. Also, the maximum values for S, σ, PF, and the minimum value for κₑ chosen among the
temperatures [100, 1300] K, the doping levels [10¹⁶, 10²¹] cm⁻³, and doping types are reported. The
properties that depend on the relaxation time are reported divided by the constant value 10⁻¹⁴. The
average of the eigenvalues for all the properties at all the temperatures, doping levels, and doping
types are reported in the tables for each entry.
"""

# Human-readable explanation of every data column, keyed by column name.
# Long texts use implicit string concatenation inside parentheses: a backslash
# continuation *inside* a string literal would embed the indentation of the
# following source line into the text.
legend = {
    "task": "Materials project task_id",
    "functional": (
        "Type of DFT functional "
        "(GGA: generalized gradient approximation, GGA+U: GGA + U approximation)"
    ),
    "metal": "If True, crystal is a metal",
    'ΔE': 'Band gap in eV',
    'V' : "Unit cell volume, in cubic angstrom",
    'mₑᶜ': 'Eigenvalues (ε₁, ε₂, ε₃) of the conductivity effective mass and their average (ε̄)',
    'S': 'Average eigenvalue of the Seebeck coefficient',
    'σ' : 'Average eigenvalue of the conductivity',
    'κₑ' : 'Average eigenvalue of the electrical thermal conductivity',
    'PF': 'Average eigenvalue of the Power Factor',
    # NOTE(review): the contribution loop takes the minimum for n-type Seebeck
    # values; this text still says "maximum" for both doping types - confirm wording.
    'Sᵉ': (
        'Value (v), temperature (T), and doping level (c) at the '
        'maximum of the average eigenvalue of the Seebeck coefficient'
    ),
    'σᵉ': (
        'Value (v), temperature (T), and doping level (c) at the '
        'maximum of the average eigenvalue of the conductivity'
    ),
    # κₑ is reported at its *minimum* (see the description above and the
    # extreme-value selection in the contribution loop), not its maximum.
    'κₑᵉ': (
        'Value (v), temperature (T), and doping level (c) at the '
        'minimum of the average eigenvalue of the electrical thermal conductivity'
    ),
    'PFᵉ': (
        'Value (v), temperature (T), and doping level (c) at the '
        'maximum of the average eigenvalue of the Power Factor'
    ),
}

# Publications / datasets the project is based on.
references = [
    {"label": "SData", "url": "https://doi.org/10.1038/sdata.2017.85"},
    {"label": "Dryad", "url": "https://doi.org/10.5061/dryad.gn001"}
]

# Uncomment to push description, legend and references to the project:
# with Client() as client:
#     client.projects.update_entry(pk=name, project={"other": None}).result() # ensure order
#     client.projects.update_entry(pk=name, project={
#         'description': description, 'other': legend, "references": references
#     }).result()
#     client.get_project(name).display()

In [None]:
# Labels for the three eigenvalues and their average (last entry).
eigs_keys = ['ε₁', 'ε₂', 'ε₃', 'ε̄']

# Display unit per property.
prop_defs = {
    'mₑᶜ': "mₑ",
    'S': "µV/K",
    'σ': "1/fΩ/m/s",
    'κₑ': "GW/K/m/s",
    'PF': "GW/K²/m/s"
}

# Units of the temperature (T) and doping level (c) stored with each extreme value.
ext_defs = {"T": "K", "c": "µm⁻³"}

# Ordered mapping of column name -> unit (None for unit-less columns).
columns = {"task": None, "functional": None, "metal": None, "ΔE": "eV", "V": "Å³"}

doping_types = ("p", "n")

# 1) Per-property columns for both doping types; the effective mass gets one
#    column per eigenvalue label, every other property a single column.
for prop_name, prop_unit in prop_defs.items():
    is_mass = prop_name.startswith("mₑ")
    for carrier in doping_types:
        prefix = f"{prop_name}.{carrier}"
        if is_mass:
            columns.update({f"{prefix}.{eig}": prop_unit for eig in eigs_keys})
        else:
            columns[prefix] = prop_unit

# 2) Extreme-value columns (value, temperature, doping level) for every
#    property except the effective mass.
for prop_name, prop_unit in prop_defs.items():
    if not prop_name.startswith("mₑ"):
        for carrier in doping_types:
            prefix = f"{prop_name}ᵉ.{carrier}"
            columns[f"{prefix}.v"] = prop_unit
            columns.update({
                f"{prefix}.{quantity}": quantity_unit
                for quantity, quantity_unit in ext_defs.items()
            })

columns["tables"] = None

# with Client() as client:
#     client.init_columns(name, columns)

### Prepare contributions

In [None]:
# Directory holding the gzipped JSON input files. It defaults to the original
# cluster location and can be redirected to a local copy without editing the
# notebook, e.g.
#   export CARRIER_TRANSPORT_INPUT_DIR=/path/to/mpcontribs-data/transport_coarse
input_dir = os.environ.get(
    "CARRIER_TRANSPORT_INPUT_DIR",
    '/project/projectdirs/matgen/fricci/transport_data/coarse'
)

# Maps each property key of the input files to its column name and the unit
# the raw values are given in. The insertion order matters: the contribution
# loop needs 'seebeck_doping' to be processed before 'cond_doping' in order to
# derive the power factor.
props_map = { # original units
    'cond_eff_mass': {"name": 'mₑᶜ', "unit": "mₑ"},
    'seebeck_doping': {"name": 'S', "unit": "µV/K"},
    'cond_doping': {"name": 'σ', "unit": "1/Ω/m/s"},
    'kappa_doping': {"name": 'κₑ', "unit": "W/K/m/s"},
}

In [None]:
# Collect the regular files located directly inside `input_dir`.
with os.scandir(input_dir) as dir_entries:
    files = [entry for entry in dir_entries if entry.is_file()]

len(files)

In [None]:
# Build one contribution (flat data columns + temperature/doping tables) per
# input file whose identifier is not yet part of the project.
contributions = []
total = len(files)
columns_name = "doping level [cm⁻³]"
title_prefix = "Temperature- and Doping-Level-Dependence"
pf_unit = "µW/cm/K²/s"  # unit of the power factor derived from S and σ below

titles = {
    'S': "Seebeck Coefficient",
    'σ': "Conductivity",
    'κₑ': "Electrical Thermal Conductivity",
    'PF': "Power Factor"
}

# Identifiers already present in the project; their files are skipped below.
with Client() as client:
    identifiers = client.get_all_ids(dict(project=name)).get(name, {}).get("identifiers", [])

print("#contribs:", len(identifiers))

for obj in tqdm(files):
    # identifier = text after the last underscore of the file name's stem
    # (everything before the first dot)
    identifier = obj.name.split('.', 1)[0].rsplit('_', 1)[-1]
    valid = bool(identifier.startswith('mp-') or identifier.startswith('mvc-'))

    if not valid:
        print(identifier, 'not valid')
        continue

    if identifier in identifiers:
        continue

    # the file handle is only needed for parsing
    with gzip.open(obj.path, 'rb') as input_file:
        data = json.loads(input_file.read())

    # prefer GGA+U results whenever the file provides them
    task_type = 'GGA+U' if 'GGA+U' in data['gap'] else 'GGA'
    gap = data['gap'][task_type]

    cdata = {
        "task": data['task_id'][task_type],
        "functional": task_type,
        "metal": 'Yes' if gap < 0.1 else 'No',
        "ΔE": f"{gap} eV",
        "V": f"{data['volume']} Å³"
    }

    tables = []

    for doping_type in ['p', 'n']:
        # Squared Seebeck tensors of the *current* doping type, one entry per
        # (temperature, doping level) in row-major order. The list must be
        # reset here: the conductivity pass indexes it from 0, so a list shared
        # across doping types would pair n-type conductivities with p-type
        # Seebeck tensors.
        S2arr = []
        # Squared Seebeck tensor at 300 K / 1e18 cm⁻³; reset so that a value
        # from a previous doping type or file is never reused.
        S2 = None

        for key, v in props_map.items():
            prop = data[task_type][key].get(doping_type, {})
            d = prop.get('300', {}).get('1e+18', {})
            unit = v["unit"]

            # --- reference values at 300 K and 1e18 cm⁻³ ---
            if d:
                # the effective mass entry is a plain list of eigenvalues,
                # the other properties are dicts with 'eigs' and 'tensor'
                eigs = d if isinstance(d, list) else d['eigs']
                k = f"{v['name']}.{doping_type}"
                value = f"{np.mean(eigs)} {unit}"

                if key == 'cond_eff_mass':
                    cdata[k] = {eigs_keys[-1]: value}
                    for neig, eig in enumerate(eigs):
                        cdata[k][eigs_keys[neig]] = f"{eig} {unit}"
                else:
                    cdata[k] = value
                    if key == 'seebeck_doping':
                        S2 = np.dot(d['tensor'], d['tensor'])
                    elif key == 'cond_doping' and S2 is not None:
                        # PF = mean eigenvalue of S²·σ; with S in µV/K and σ
                        # in 1/Ω/m/s the factor 1e-8 yields µW/cm/K²/s
                        pf = np.mean(np.linalg.eigh(np.dot(S2, d['tensor']))[0]) * 1e-8
                        cdata[f"PF.{doping_type}"] = f"{pf} {pf_unit}"

            # --- temperature/doping tables and extreme values ---
            if key == "cond_eff_mass":
                continue

            temps = sorted(map(int, prop.keys()))
            if not temps:
                # no data for this doping type: nothing to tabulate
                continue

            # doping levels are taken from the lowest temperature
            dopings = sorted(map(float, prop[str(temps[0])].keys()))
            cols = ['T [K]'] + [f'{doping:.0e}'.replace("+", "") for doping in dopings]

            # the power factor needs a Seebeck tensor for every grid point
            with_pf = False
            if key == 'cond_doping':
                with_pf = len(S2arr) == len(temps) * len(dopings)
                if not with_pf:
                    print(identifier, doping_type, 'Seebeck data incomplete, skipping PF table')

            prop_averages, pf_averages = [], []

            for it, temp in enumerate(temps):
                row = [temp]
                row_pf = [temp]

                for idop, doping in enumerate(dopings):
                    d = prop[str(temp)][f'{doping:.0e}']
                    row.append(np.mean(d["eigs"]))
                    tensor = d['tensor']

                    if key == 'seebeck_doping':
                        S2arr.append(np.dot(tensor, tensor))
                    elif with_pf:
                        S2idx = it * len(dopings) + idop
                        pf = np.mean(np.linalg.eigh(
                            np.dot(S2arr[S2idx], tensor)
                        )[0]) * 1e-8
                        row_pf.append(pf)

                prop_averages.append(row)
                pf_averages.append(row_pf)

            # the conductivity pass yields two tables: σ itself and the PF
            df_data = [np.array(prop_averages)]
            if with_pf:
                df_data.append(np.array(pf_averages))

            for ii, np_prop_averages in enumerate(df_data):
                nm = "PF" if ii else v["name"]
                u = pf_unit if ii else unit

                df = DataFrame(np_prop_averages, columns=cols)
                df.set_index("T [K]", inplace=True)
                df.columns.name = columns_name # legend name
                df.attrs["name"] = f'{nm}({doping_type})'  # -> used as title by default
                df.attrs["title"] = f'{title_prefix} of {doping_type}-type {titles[nm]}'
                df.attrs["labels"] = {
                    "value": f'{nm}({doping_type}) [{u}]',  # y-axis label
                    #"variable": columns_name # alternative for df.columns.name
                }
                tables.append(df)

                # drop the temperature column before searching the extreme value
                arr_prop_avg = np_prop_averages[:, 1:]
                # minimum for κₑ and for n-type Seebeck values, maximum otherwise
                use_min = key == 'kappa_doping' or (
                    key == 'seebeck_doping' and doping_type == 'n'
                )
                max_v = np.min(arr_prop_avg) if use_min else np.max(arr_prop_avg)

                # first (temperature, doping) grid position of the extreme value
                arg_max = np.argwhere(arr_prop_avg == max_v)[0]
                elabel = f'{nm}ᵉ'
                cdata[f'{elabel}.{doping_type}'] = unflatten({
                    'v': f"{max_v} {u}",
                    'T': f"{temps[arg_max[0]]} K",
                    'c': f"{dopings[arg_max[1]]} cm⁻³"
                })

    contrib = {'project': name, 'identifier': identifier, 'is_public': True}
    contrib["data"] = unflatten(cdata)
    contrib["tables"] = tables
    contributions.append(contrib)

len(contributions)

In [None]:
def _json_default(obj):
    """Fallback encoder for objects the `json` module cannot serialize.

    DataFrames (the entries of each contribution's "tables" list) are stored
    as their split-oriented JSON representation (columns / index / data) plus
    their `attrs`. Any other unsupported type raises `TypeError`, as
    `json.dump` would do on its own.
    """
    if isinstance(obj, DataFrame):
        table = json.loads(obj.to_json(orient="split"))
        table["attrs"] = obj.attrs
        return table
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Persist the prepared contributions so the submit/update cells can be run
# without re-parsing the input files.
with open("carrier_transport_p-type-update.json", "w") as f:
    json.dump(contributions, f, default=_json_default)

### Submit/Update contributions

In [None]:
import json

# Read the previously saved contributions back from disk.
with open("carrier_transport_p-type-update.json") as saved:
    contributions = json.loads(saved.read())

len(contributions)

In [None]:
from mpcontribs.client import Client

name = "carrier_transport"

with Client() as client:
    query = {"project": name, "data__functional__exact": ""}  # data.functional not set after rename type -> functional
    # Fall back to an empty mapping when the project key is absent from the
    # result, so the count below and the lookup in the next cell keep working
    # (same pattern as the identifier lookup in the preparation cell).
    ids_map = client.get_all_ids(query, fmt="map").get(name, {})

len(ids_map)  # = number of contributions to be updated

In [None]:
# Build the update payloads: one dict per contribution that already exists in
# the project, carrying its contribution ID plus only the keys to be updated
# ("functional" and the extreme-value columns, the latter restricted to p-type).
submit = []

for contrib in contributions:
    pk = ids_map.get(contrib["identifier"], {}).get("id")
    if not pk:
        continue  # nothing to update for this identifier

    reduced = {}
    for field, content in contrib["data"].items():
        if field != "functional" and "ᵉ" not in field:
            continue

        if isinstance(content, dict):
            # keep only the p-type branch of nested entries
            reduced[field] = {
                doping: entry for doping, entry in content.items() if doping == "p"
            }
        else:
            reduced[field] = content

    submit.append({"data": reduced, "id": pk})

len(submit)

In [None]:
# Send the prepared update payloads in `submit` to the project.
# The two commented-out calls are kept for a full rebuild (delete all
# contributions and re-initialize the columns) instead of an update.
with Client() as client:
    #client.delete_contributions(name)
    #client.init_columns(name, columns)
    client.submit_contributions(submit, ignore_dupes=True)

### Query contributions

In [None]:
from mpcontribs.client import Client

# Filter path of the averaged p-type conductivity effective mass; the dotted
# variant of the same path is used to select the field in the response.
mass_avg_path = "data__mₑᶜ__p__ε̄__value"

# Example query: GGA+U entries flagged as metal with an average p-type
# effective mass of at least 1000, sorted by that mass in descending order.
query = {
    "project": "carrier_transport",
#     "formula_contains": "ZnS",
#    "identifier__in": ["mp-10695", "mp-760381"], # ZnS, CuS
    "data__functional__exact": "GGA+U",
    "data__metal__contains": "Y",
    f"{mass_avg_path}__gte": 1000,
    "_order_by": mass_avg_path,
    "order": "desc",
    "_fields": ["id", "identifier", "formula", mass_avg_path.replace("__", ".")],
}

with Client() as client:
    result = client.contributions.get_entries(**query).result()

result

### Generate snapshots / downloads [optional]

In [None]:
from mpcontribs.client import Client

# Long-lived client shared by the following cells; its session is closed
# explicitly in the last cell of this section.
client = Client()

In [None]:
# Select the contributions to download.
query = {
    "project": "carrier_transport",
    "formula__contains": "Zn",
}

# The totals are requested before "format" is added to the query dict, so the
# order of the next statements matters.
print(client.get_totals(query=query))
query["format"] = "json" # "csv" or "json"
client.download_contributions(query) #, include=["tables"])

In [None]:
# Close the session of the client created at the start of this section.
client.session.close()

### (Re-)build notebooks [optional]