In [1]:
from lib.data_handling import CompositionData
from dotenv import dotenv_values

env = dotenv_values()
comp_data_loc = env.get('COMPOSITION_DATA_PATH')
dataset_loc = env.get('DATA_PATH')

if not comp_data_loc:
    print("Please set COMPOSITION_DATA_PATH in .env file")
    exit(1)

if not dataset_loc:
    print("Please set DATA_PATH in .env file")
    exit(1)

cd = CompositionData(composition_data_loc=comp_data_loc)
cd.composition_data.head()

Unnamed: 0,Target,Spectrum Name,Sample Name,SiO2,TiO2,Al2O3,FeOT,MnO,MgO,CaO,Na2O,K2O,MOC total,Used for 2015 calibration,Used for 2021 Mn calibration,Used for 2022 Li calibration
0,AGV2,AGV2,AGV2,59.3,1.05,16.91,6.02,0.099,1.79,5.2,4.19,2.88,97.44,1.0,1.0,1.0
1,BCR-2,BCR2,BCR2,54.1,2.26,13.5,12.42,0.2,3.59,7.12,3.16,1.79,98.14,1.0,1.0,1.0
2,BEN,BEN,BEN,38.2,2.61,10.07,11.61,0.2,13.15,13.87,3.18,1.39,94.28,1.0,1.0,1.0
3,BHVO2,BHVO2,BHVO2,49.9,2.73,13.5,11.07,0.167,7.23,11.4,2.22,0.52,98.74,1.0,1.0,1.0
4,BIR-1a,BIR1,BIR1,47.7,0.97,15.4,10.19,0.176,9.7,13.4,1.81,0.03,99.38,1.0,1.0,1.0


In [2]:
column = "Used for 2015 calibration"

cd.composition_data[column].value_counts()

Used for 2015 calibration
1.0    404
0.0    168
Name: count, dtype: int64

In [3]:
from lib.data_handling import load_data

data = load_data(
    average_shots=True,
    dataset_loc=dataset_loc,
    num_samples=1,
)

Loading data:   0%|          | 0/1 [00:00<?, ?it/s]

Loading data: 100%|██████████| 1/1 [00:00<00:00,  3.28it/s]


In [12]:
sample = data["jsc1399"][0]

import pingouin as pg

pg.multivariate_normality(sample[:50], alpha=0.05)

HZResults(hz=200, pval=1.394838372684144e-59, normal=False)

In [10]:
len(data)

414

In [13]:
def get_composition_for_sample(cd, sample_name):
    sample_name_lower = sample_name.lower()
    match_condition = (
        (cd["Spectrum Name"].str.lower() == sample_name_lower)
        | (cd["Target"].str.lower() == sample_name_lower)
        | (cd["Sample Name"].str.lower() == sample_name_lower)
    )
    composition = cd.loc[match_condition]

    return composition.head(1)

In [44]:
samples_used_2015 = []

# Manual mapping of sample names to composition names for samples that don't match folder names
# Sample (dir) name -> composition name
manual_map = {
    "vs211681": "VS-2116-81",
    "sancs2": "SANC-S",  # guess due to text in csv describing it, and it's the only unused SAN-C
    "sh5": "SH-5",
    "7tio2": "",
    "sh59": "SH-59",
    "75tio2": "",
    "ncsdc28041": "NCS-DC28041",
    "sanck": "SanC-K",
    "3tio2": "",
    "sanca": "SanC-A",
    "sh73": "SH-73",
    "50tio2": "",
    "10tio2": "",
    "ncsdc47008": "NCS-DC47008",
    "nau1": "NAU-1",
    "sancb": "SanC-B",
    "kga2meds": "KGa-2-med-S",
    "nat18": "NAT-18",
    "idbdf": "ID_BDF",
    "5tio2": "",
    "sancc": "SanC-C",
    "sancj": "SanC-J",
    "0.1tio2": "",
    "nau2los": "NAu-2-low-s",
    "icel009010": "icel009-010",
    "nau2meds": "NAu-2-mid-s",
    "ncsdc47009": "NCS-DC47009",
    "sanci": "SanC-I",
    "25tio2": "",
    "1tio2": "",
    "bir1a": "BIR-1a",
}

unused_compositions = set()
cd.composition_data["Check"] = None

for sample in data.keys():
    composition = get_composition_for_sample(cd.composition_data, sample)
    if composition.empty:
        print(f"Could not find composition for {sample}")
        continue

    # 1 if used for 2015 calibration, 0 otherwise
    used_2015 = composition["Used for 2015 calibration"].values[0]
    if used_2015 == 1:
        samples_used_2015.append(sample)
        # mark composition as used
        composition["Check"] = 1

# check unused compositions
unused_compositions = cd.composition_data.loc[cd.composition_data["Check"].isna()]

len(samples_used_2015), len(data)
unused_compositions.dropna(axis=0, how="all")

unused_compositions.to_csv("unused_compositions.csv")

Could not find composition for vs211681
Could not find composition for sancs2
Could not find composition for sh5
Could not find composition for 7tio2
Could not find composition for sh59
Could not find composition for 75tio2
Could not find composition for ncsdc28041
Could not find composition for sanck
Could not find composition for 3tio2
Could not find composition for sanca
Could not find composition for sh73
Could not find composition for 50tio2
Could not find composition for 10tio2
Could not find composition for ncsdc47008
Could not find composition for nau1
Could not find composition for sancb
Could not find composition for kga2meds
Could not find composition for nat18
Could not find composition for idbdf
Could not find composition for 5tio2
Could not find composition for sancc
Could not find composition for sancj
Could not find composition for 0.1tio2
Could not find composition for nau2los
Could not find composition for icel009010
Could not find composition for nau2meds
Could not f

In [62]:
# find unusual compositions spectrum name contains 25
unused_compositions['Spectrum Name'] = unused_compositions['Spectrum Name'].fillna('')

# Now apply the str.contains() method
mask = unused_compositions["Spectrum Name"].str.contains("75", False)

# Apply the mask to the DataFrame
filtered_df = unused_compositions[mask]
filtered_df

Unnamed: 0,Target,Spectrum Name,Sample Name,SiO2,TiO2,Al2O3,FeOT,MnO,MgO,CaO,Na2O,K2O,MOC total,Used for 2015 calibration,Used for 2021 Mn calibration,Used for 2022 Li calibration,Check
118,1375,JSC1375,CA9KRY1,66.08,0.45,14.14,3.3,0.069,2.59,3.62,3.44,3.1,96.79,1.0,1.0,0.0,
545,VS72875,VS72875,VS728-75,40.18,0.24,28.5,2.92,0.04,1.01,7.13,12.2,3.3,95.52,0.0,1.0,0.0,


In [28]:
from sklearn.model_selection import train_test_split


train, test = train_test_split(samples_used_2015, test_size=0.2, random_state=42)
len(train), len(test)

(306, 77)

In [30]:
# make pd.dataframe with columns: sample_name, train/test
# save to file
import pandas as pd

train_df = pd.DataFrame(train, columns=["sample_name"])
train_df["train_test"] = "train"

test_df = pd.DataFrame(test, columns=["sample_name"])
test_df["train_test"] = "test"

train_test_df = pd.concat([train_df, test_df])
train_test_df.head()

Unnamed: 0,sample_name,train_test
0,gypa,train
1,jsc1414,train
2,mix5b,train
3,dkm2990,train
4,130,train


In [31]:
train_test_df.to_csv("data/train_test_split.csv", index=False)

In [64]:
train_samples = pd.read_csv("./train_test_split.csv")
train_samples

Unnamed: 0,sample_name,train_test
0,dkm1190,train
1,gbw07312,train
2,r65,train
3,gbw07105,train
4,jsc1455,train
...,...,...
399,jsc1433,test
400,m7mt,test
401,pw7,test
402,jsc1370,test


In [74]:
row = train_samples[train_samples["sample_name"] == "vs211681"]

In [77]:
row.empty, row["train_test"].values[0]

(False, 'train')