In [1]:
import numpy as np
import os
import pandas as pd

import random

In [2]:
## Load the preprocessed feature tables from the working directory.
dielect, kvrh, gvrh, perov = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dielectric_feature.csv",
        "./kvrh_features.csv",
        "./gvrh_features.csv",
        "./perov_features.csv",
    )
)

## Second group of tables, loaded from the `mat_*` feature files.
mat_dielect, mat_elast = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./mat_dielectric_feature.csv",
        "./mat_elast_feature.csv",
    )
)

In [3]:
# Inspect the loaded dielectric feature table.
dielect

Unnamed: 0,composition,target,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,21
0,KS,1.752064,45.500000,35.581650,362.445000,8.500000,3.500000,154.000000,1.700000,1.500000,...,3.500000,0.500000,1.000000,0.000000,0.0,1.500000,49.446771,1.101000,0.000000,149.500000
1,K3VO4,1.652859,35.000000,42.132537,1224.548750,4.875000,3.750000,160.875000,1.552500,1.625000,...,3.625000,0.375000,0.250000,3.500000,0.0,4.125000,35.058125,0.000000,0.000000,201.875000
2,Rb2ZrO3,1.867858,58.833333,52.652333,788.810000,9.500000,3.500000,128.000000,2.300000,1.833333,...,4.500000,0.166667,1.000000,2.666667,0.0,3.833333,27.404583,0.000000,0.000000,108.833333
3,MnOF,2.676887,77.333333,29.978616,542.433333,13.333333,2.666667,87.333333,2.990000,2.000000,...,6.666667,0.000000,1.000000,1.666667,0.0,2.666667,9.766695,0.656667,0.000103,81.333333
4,Li2CoSiO4,1.793232,64.500000,31.643599,1349.061250,11.375000,3.000000,111.250000,1.972500,1.875000,...,5.125000,0.125000,2.250000,0.750000,0.0,3.125000,15.993542,0.386500,0.387118,192.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,Cr2O5,2.458294,76.142857,26.284171,662.000000,13.142857,2.571429,86.857143,2.931429,1.714286,...,6.000000,0.285714,1.428571,1.428571,0.0,3.142857,9.700714,0.000000,0.000000,74.000000
3988,Ca2FeWO6,2.136837,48.200000,61.506540,1684.580000,7.400000,4.000000,137.200000,1.878000,2.000000,...,7.800000,0.000000,0.200000,3.000000,0.0,3.200000,16.507500,0.000000,1.266398,206.500000
3989,La4MnS6O,2.690619,69.250000,40.559921,665.513333,11.916667,3.083333,105.333333,2.543333,2.000000,...,6.083333,0.000000,1.166667,2.416667,0.0,3.583333,13.272060,0.183500,0.000103,100.333333
3990,BaAg2GeSe4,2.811494,55.750000,107.215850,1080.640000,9.750000,5.000000,156.250000,1.757500,1.500000,...,9.750000,0.500000,0.750000,0.000000,0.0,1.250000,30.178125,0.147750,0.000000,199.625000


In [4]:
def ySelection(df, train_size=4500, ood_size=500, train_frac=0.9):
    '''
    Split a dataset into a training set and an out-of-distribution (OOD) set
    according to the target value.

    Rows are sorted by ``target`` in ascending order. The lowest-target rows
    form the training set and the highest-target rows form the OOD set.
    ##########
    Arguments:
    df: loaded preprocessing data; must contain a ``target`` column
    train_size: number of lowest-target rows kept for training when ``df`` has
        more than ``train_size + ood_size`` rows (default 4500)
    ood_size: number of highest-target rows held out as OOD in that same case
        (default 500)
    train_frac: fraction of rows (rounded down) used for training when ``df``
        has at most ``train_size + ood_size`` rows; the rest is the OOD set
        (default 0.9)
    ##########
    Returns:
    (df_train, df_ood): two DataFrames that keep the original row index
    '''
    # A stable sort keeps rows with equal targets in their input order, so the
    # train/OOD boundary is reproducible even when targets are tied.
    df_sorted = df.sort_values(by=['target'], kind='mergesort')  ## from small to large
    n_rows = len(df_sorted)

    if n_rows > train_size + ood_size:
        # Large dataset: fixed-size subsets. Rows ranked between the two
        # slices are intentionally left out of both subsets.
        df_train = df_sorted.iloc[:train_size]
        # Computed from the front so that ood_size == 0 yields an empty frame
        # (a bare `[-0:]` slice would return every row).
        df_ood = df_sorted.iloc[n_rows - ood_size:]
    else:
        # Small dataset: proportional split that uses every row.
        n_train = int(n_rows * train_frac)
        df_train = df_sorted.iloc[:n_train]
        df_ood = df_sorted.iloc[n_train:]

    print(f"length of training:{len(df_train)}; length of ood: {len(df_ood)}")
    return df_train, df_ood

In [5]:
# Split the dielectric table by target: lowest values for training, highest held out as OOD.
dielect_train, dielect_ood = ySelection(dielect)

length of training:3592; length of ood: 400


In [6]:
# Split the kvrh table by target: lowest values for training, highest held out as OOD.
kvrh_train, kvrh_ood = ySelection(kvrh)

length of training:4500; length of ood: 500


In [7]:
# Split the gvrh table by target: lowest values for training, highest held out as OOD.
gvrh_train, gvrh_ood = ySelection(gvrh)

length of training:4500; length of ood: 500


In [8]:
# Split the perov table by target: lowest values for training, highest held out as OOD.
perov_train, perov_ood = ySelection(perov)

length of training:4500; length of ood: 500


In [9]:
# Split the mat_dielect table by target: lowest values for training, highest held out as OOD.
mat_dielect_train, mat_dielect_ood = ySelection(mat_dielect)

length of training:867; length of ood: 97


In [10]:
# Split the mat_elast table by target: lowest values for training, highest held out as OOD.
mat_elast_train, mat_elast_ood = ySelection(mat_elast)

length of training:978; length of ood: 109


In [11]:
def saveCSV(df_train, df_ood, name, out_dir="./propertySelection/"):
    '''
    Save a training / OOD split as two CSV files.

    Each output keeps only three columns: ``material_id`` (the row index of
    the input frame), ``composition`` and ``target``, with rows ordered by
    ``material_id``. The files are written to::

        <out_dir>/<name>/<name>.csv       (training set)
        <out_dir>/<name>/<name>_ood.csv   (OOD set)
    ##########
    Arguments:
    df_train: training DataFrame with ``composition`` and ``target`` columns
    df_ood: OOD DataFrame with ``composition`` and ``target`` columns
    name: dataset name, used for both the sub-directory and the file names
    out_dir: root directory for the outputs (default "./propertySelection/")
    '''
    def _as_id_table(df):
        # Name the index before resetting it so the id column is always called
        # `material_id`, whether or not the incoming index already had a name.
        return (
            df[["composition", "target"]]
            .rename_axis("material_id")
            .reset_index()
            .sort_values(by=["material_id"])
        )

    # Build both tables first so that a malformed input fails before anything
    # is written to disk.
    outputs = [
        (f"{name}.csv", _as_id_table(df_train)),
        (f"{name}_ood.csv", _as_id_table(df_ood)),
    ]

    target_dir = os.path.join(out_dir, name)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(target_dir, exist_ok=True)

    for file_name, table in outputs:
        table.to_csv(os.path.join(target_dir, file_name), index=False)

In [12]:
# Write the dielectric train/OOD splits under ./propertySelection/dielect/.
saveCSV(dielect_train, dielect_ood, "dielect")

In [13]:
# Write the kvrh train/OOD splits under ./propertySelection/kvrh/.
saveCSV(kvrh_train, kvrh_ood, "kvrh")

In [14]:
# Write the gvrh train/OOD splits under ./propertySelection/gvrh/.
saveCSV(gvrh_train, gvrh_ood, "gvrh")

In [15]:
# Write the perov train/OOD splits under ./propertySelection/perov/.
saveCSV(perov_train, perov_ood, "perov")

In [16]:
# Write the mat_dielect train/OOD splits under ./propertySelection/matminer_dielect/.
saveCSV(mat_dielect_train, mat_dielect_ood, "matminer_dielect")

In [17]:
# Write the mat_elast train/OOD splits under ./propertySelection/matminer_elast/.
saveCSV(mat_elast_train, mat_elast_ood, "matminer_elast")