In [4]:
from collections import defaultdict
from bson.objectid import ObjectId
from datetime import datetime
from pymongo import MongoClient
from tqdm import tqdm
from colabfit.tools.database import MongoDatabase

# client = MongoDatabase(database_name="mp", uri="mongodb://localhost:27017/")
client = MongoClient("mongodb://localhost:27017/")
db = client["mp"]
from pprint import pprint


In [33]:
def get_soft_meth(
    batch_size,
    last,
    typ,
):
    piped = db.property_instances.aggregate(
        [
            {"$sort": {"hash": 1}},
            # Include only PI objects after the last object from previous batch
            {"$match": {"hash": {"$gt": last}, "type": typ}},
            {"$limit": batch_size},
            # Split on each MD-id and join MDs and DOs
            {"$unwind": "$relationships.metadata"},
            {
                "$lookup": {
                    "from": "metadata",
                    "localField": "relationships.metadata",
                    "foreignField": "colabfit-id",
                    "as": "md_data",
                }
            },
            {
                "$lookup": {
                    "from": "data_objects",
                    "localField": "md_data.colabfit-id",
                    "foreignField": "colabfit-id",
                    "as": "data_object",
                }
            },
            # Match only returned docs that don't point to OC20
            {
                "$match": {
                    "data_object.relationships.datasets": {"$ne": ["DS_ifdjgm9le1fd_0"]}
                }
            },
            # Regroup objects based on original hash (PI hash)
            {
                "$group": {
                    "_id": "$hash",
                    "md_ids": {"$push": "$relationships.metadata"},
                    "datasets": {"$push": "$data_object.relationships.datasets"},
                    "do_ids": {"$push": "$relationships.data_objects"},
                    "method": {"$push": "$md_data.method.source-value"},
                    "software": {"$push": "$md_data.software.source-value"},
                }
            },
            # _id should be hash at this point
            {
                "$project": {
                    "_id": "$_id",
                    "md_ids": {"$size": "$md_ids"},
                    "datasets": "$datasets",
                    "do_ids": {"$size": "$do_ids"},
                    "method": "$method",
                    "software": "$software",
                }
            },
        ]
    )
    return piped

In [34]:
last = "0"
p = get_soft_meth(1000, last, "free-energy")

In [35]:
list(p)

[{'_id': '1252750982692086691',
  'md_ids': 1,
  'datasets': [[]],
  'do_ids': [['DO_768234264842078372']],
  'method': [['GGA Structure Optimization, LAMMPS']],
  'software': [['VASP']]},
 {'_id': '1086273480105862448',
  'md_ids': 1,
  'datasets': [[]],
  'do_ids': [['DO_554626726064232351']],
  'method': [['GGA Structure Optimization, LAMMPS']],
  'software': [['VASP']]},
 {'_id': '1013786439798831165',
  'md_ids': 1,
  'datasets': [[]],
  'do_ids': [['DO_1422523639936126874']],
  'method': [['GGA Structure Optimization, LAMMPS']],
  'software': [['VASP']]},
 {'_id': '1054716185670268287',
  'md_ids': 1,
  'datasets': [[]],
  'do_ids': [['DO_779412146730342494']],
  'method': [['GGA Structure Optimization, LAMMPS']],
  'software': [['VASP']]},
 {'_id': '1253493723669628376',
  'md_ids': 1,
  'datasets': [[]],
  'do_ids': [['DO_1694411106619904100']],
  'method': [['GGA Structure Optimization, LAMMPS']],
  'software': [['VASP']]},
 {'_id': '110211094187660785',
  'md_ids': 1,
  'data

In [11]:
def update_ms(ms_data, soft_dict, meth_dict):
    last_hash = None
    for data in ms_data:
        last_hash = data.get("hash")
        print(last_hash)

        if last_hash is None:
            return last_hash
        meth = data["method"][0]
        soft = data["software"][0]
        do_len = data["do_ids"]
        md_len = data["md_ids"]
        # If the number of md-ids == num of do-ids
        if do_len == md_len:
            if len(meth) == 0:
                meth_dict["None"] += do_len
            else:
                if len(meth) == 1:
                    meth_dict[meth[0]] += do_len
                elif len(meth) == do_len:
                    for m in meth:
                        meth_dict[m] += 1
                else:
                    meth_dict["unequal_meth_domd"] += do_len
            if len(soft) == 0:
                soft_dict["None"] += do_len
            else:
                if len(soft) == 1:
                    soft_dict[soft[0]] += do_len
                elif len(soft) == do_len:
                    for s in soft:
                        soft_dict[s] += 1
                else:
                    soft_dict["unequal_soft_domd"] += do_len
        else:
            soft_dict["unequal_do_md"] += 1
            meth_dict["unequal_do_md"] += 1

    return last_hash


In [12]:
methods = defaultdict(int)
software = defaultdict(int)

In [13]:
n_pis = db.property_instances.estimated_document_count()
b_size = 10000
n_batches = n_pis // b_size
if n_batches < 1:
    n_batches = 1


In [14]:
last = "0"

for batch in range(n_batches):
    # skip = batch * b_size
    data = get_soft_meth(b_size, last, "free-energy")
    last = update_ms(data, software, methods)
    print(last)
    if last is None:
        break

['1075917006321976688']
['1417275517245933863']
['1986628197588195455']
['2022106477913482911']
['278636612102821626']
['973200486531098149']
['1160603534470347619']
['797026163269147389']
['1104951930874766996']
['1240304056749503987']
['2091879208757161828']
['131900034687355423']
['4417128906537099']
['534741730087470499']
['2155060061625907114']
['1192607884081737963']
['1951715734823108028']
['1485613257524443710']
['14895715349293371']
['2171371341582731669']
['580508354599052558']
['1779219279956953483']
['2298104059707138378']
['2113708708589733517']
['2129053942717381765']
['2073004511865150386']
['882320489198272698']
['314396155819452418']
['118919718680048742']
['1556257339856590657']
['116521624995837949']
['919222509287730947']
['98026444805532972']
['613259802014306711']
['2043996084685216833']
['2257497529764232296']
['1358134361844754882']
['3108632162804655']
['503112666233395979']
['886880970735094627']
['2096608528939816544']
['280346193828259398']
['906010251100191

In [15]:
methods

defaultdict(int,
            {'GGA Structure Optimization': 3923,
             'GGA+U Structure Optimization': 2640,
             'R2SCAN Structure Optimization': 513,
             'SCAN Structure Optimization': 213,
             'PBESol Structure Optimization': 389})

In [8]:
software

defaultdict(int, {'VASP': 7678})

In [None]:
pe = []
# for batch in range(n_batches):
#     skip = batch * b_size
# print(skip)
data = db.property_instances.aggregate(
    [
        {"$match": {"type": "potential-energy"}},
        # {"$limit": 1},
        {"$count": "totalDocuments"},
    ]
)
# update_ms(data, software, methods)
print(list(data))

In [None]:
with open("software_no_oc.txt", "a") as f:
    f.write(str(software))
with open("methods_no_oc.txt", "a") as f:
    f.write(str(methods))

In [None]:
def main():
    methods = defaultdict(int)
    software = defaultdict(int)
    n_pis = db.property_instances.estimated_document_count()
    b_size = 1000000
    n_batches = n_pis // b_size
    # remain = n_pis % b_size
    for batch in range(n_batches):
        # skip = batch * b_size
        print(skip)
        data = get_soft_meth(b_size, skip, typ)
        update_ms(data, software, methods)

In [None]:
with open("pub_dois_2023-6-8.txt", "r") as f:
    dois = f.readlines()


In [27]:
with open("stats_files/methods2_no_oc_potential-energy_calc.txt", "r") as f:
    methods = eval(f.read())


In [29]:
with open("stats_files/methods_no_oc_potential-energy_l.txt", "w") as f:
    f.writelines(sorted([f"{key}, {val}\n" for key, val in methods.items()]))


In [26]:
with open("stats_files/software2_no_oc_potential-energy_calc.txt", "r") as f:
    software = eval(f.read())
with open("stats_files/software_no_oc_potential-energy_l.txt", "w") as f:
    f.writelines(sorted([f"{key}, {val}\n" for key, val in software.items()]))


In [30]:
sum(software.values())

72772871

In [31]:
sum(methods.values())

72772871

In [34]:
with open("methods2_no_oc_free_energy_.txt", "r") as f:
    data = eval(f.read())
with open("stats_files/methods2_no_oc_free-energy_calc.txt", "w") as f:
    f.writelines(sorted([f"{key}, {val}\n" for key, val in data.items()]))

In [35]:
with open("software2_no_oc_free_energy_.txt", "r") as f:
    data = eval(f.read())
with open("stats_files/software2_no_oc_free-energy_calc.txt", "w") as f:
    f.writelines(sorted([f"{key}, {val}\n" for key, val in data.items()]))