In [1]:
from pathlib import Path

import pandas as pd
from dotenv import dotenv_values
from sklearn.model_selection import train_test_split

from lib.data_handling import CompositionData, load_data
from lib.reproduction import folder_to_composition_sample_name, major_oxides

In [2]:
# Configuration: read the input locations from the .env file.
env = dotenv_values()
comp_data_loc = env.get("COMPOSITION_DATA_PATH")
dataset_loc = env.get("DATA_PATH")

# Fail the cell with an explicit error instead of calling exit(): inside a
# Jupyter kernel, exit() asks the kernel to shut down rather than reporting
# a normal error, which hides the message and breaks "Run All".
if not comp_data_loc:
    raise RuntimeError("Please set COMPOSITION_DATA_PATH in .env file")

if not dataset_loc:
    raise RuntimeError("Please set DATA_PATH in .env file")

# Composition table; later cells read it through `cd.composition_data`.
cd = CompositionData(composition_data_loc=comp_data_loc)

# Destination of the generated train/test assignment.
save_path = Path("train_test_split.csv")

# quick and easy way to get the list of samples we have
# (slow: the recorded run took about 1.5 minutes for 414 items)
data = load_data(dataset_loc)

Loading data: 100%|██████████| 414/414 [01:31<00:00,  4.51it/s]


In [6]:
# Echo the loaded mapping to inspect its structure: per the recorded output it
# is a dict keyed by sample folder name, each value a dict mapping a spectrum
# name to a DataFrame with `wave` and `shot_avg` columns.
# NOTE(review): this renders the entire nested mapping; a small slice
# (e.g. a few keys) would keep the notebook output readable.
data

{'0.1tio2': {'2015_03_27_132008_ccs':            wave      shot_avg
  0     240.81100  1.090901e+16
  1     240.86501  4.219430e+12
  2     240.91800  4.468543e+11
  3     240.97200  1.067514e+12
  4     241.02699  2.104724e+12
  ...         ...           ...
  6139  904.80188  3.335123e+09
  6140  904.99481  2.574233e+09
  6141  905.18768  2.734626e+07
  6142  905.38062  5.999371e+09
  6143  905.57349  1.260760e+10
  
  [6144 rows x 2 columns],
  '2015_03_27_132210_ccs':            wave      shot_avg
  0     240.81100  1.348974e+16
  1     240.86501  4.860644e+12
  2     240.91800  5.066965e+11
  3     240.97200  1.306603e+12
  4     241.02699  2.626375e+12
  ...         ...           ...
  6139  904.80188  3.316790e+09
  6140  904.99481  2.410871e+09
  6141  905.18768  3.865346e+07
  6142  905.38062  6.812226e+09
  6143  905.57349  1.430865e+10
  
  [6144 rows x 2 columns],
  '2015_03_27_132331_ccs':            wave      shot_avg
  0     240.81100  1.090794e+16
  1     240.86501  4.1

In [3]:
def get_composition_for_sample(cd, sample_name):
    """Look up the composition row for a sample, ignoring letter case.

    A row matches when ``sample_name`` equals, case-insensitively, the row's
    value in any of the "Spectrum Name", "Target" or "Sample Name" columns.

    Args:
        cd: DataFrame containing the three string columns named above.
        sample_name: Name of the sample to search for.

    Returns:
        A DataFrame holding only the first matching row, or an empty
        DataFrame when no row matches.
    """
    wanted = sample_name.lower()

    # OR together one boolean mask per candidate name column.
    row_mask = None
    for column in ("Spectrum Name", "Target", "Sample Name"):
        column_hits = cd[column].str.lower() == wanted
        row_mask = column_hits if row_mask is None else (row_mask | column_hits)

    return cd.loc[row_mask].head(1)

In [4]:
# NOTE: this cell always rewrites `save_path`; there is no "skip if the file
# already exists" guard.

# Keep the loaded samples that (a) have a composition row, (b) have no NaN in
# any of the major oxides, and (c) are flagged as used for the 2015 calibration.
# NOTE(review): the split below may depend on the iteration order of `data`;
# confirm that `load_data` returns samples in a stable order.
samples_used_2015 = []
for sample in data:
    sample_name = folder_to_composition_sample_name.get(sample, sample)
    composition = get_composition_for_sample(cd.composition_data, sample_name)

    has_complete_oxides = (
        not composition.empty
        and not composition[major_oxides].isnull().values.any()
    )
    if not has_complete_oxides:
        continue

    if composition["Used for 2015 calibration"].values[0] == 1:
        samples_used_2015.append(sample)

# Seeded 80/20 split over the retained sample names.
train, test = train_test_split(samples_used_2015, test_size=0.2, random_state=42)

# One row per sample, labelled with the subset it belongs to.
train_df = pd.DataFrame({"sample_name": train, "train_test": "train"})
test_df = pd.DataFrame({"sample_name": test, "train_test": "test"})

train_test_df = pd.concat([train_df, test_df])
train_test_df.to_csv(str(save_path), index=False)