In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to imported project code apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path

from lib.config import AppConfig
from lib.reproduction import major_oxides

# Project configuration object; later cells read both dataset paths from it
# (`ccam_composition_data_path` and `composition_data_path`).
config = AppConfig()

In [None]:
# Locate the CCAM composition file and build the "<oxide> (wt%)" column labels
# under which the major oxides appear in it.
ds_path = Path(config.ccam_composition_data_path)
ccam_comp_MO = [f"{mo} (wt%)" for mo in major_oxides]

# The first line of the file is skipped before parsing
# (presumably a title line above the real header -- confirm against the CSV).
ccam_comp_raw = pd.read_csv(ds_path, skiprows=1)

# Per-oxide count of missing values. Rows with missing oxides are deliberately
# kept here (no dropna) so that later cells can still count them.
print(ccam_comp_raw[ccam_comp_MO].isna().sum())

# Strip the " (wt%)" suffix so the oxide columns use the bare names from `major_oxides`.
ccam_comp_data = ccam_comp_raw.rename(columns=dict(zip(ccam_comp_MO, major_oxides)))

In [None]:
# Sanity check: print the (rows, columns) shape, then preview the first rows.
print(ccam_comp_data.shape)
ccam_comp_data.head()

In [None]:
# Import from the source package (like the other `lib.*` imports in this notebook)
# instead of `build.lib.lib...`: the `build/` tree is presumably a packaging
# output directory, so it can be stale or missing on a clean checkout.
from lib.data_handling import CompositionData

# Reference composition table, restricted to rows in which every major oxide
# value is present.
cd = CompositionData(config.composition_data_path).composition_data
cd = cd[cd[major_oxides].notnull().all(axis=1)]
print(cd.shape)
cd.head()

In [None]:
# Count the CCAM rows in which at least one major oxide value is missing.
ccam_rows_with_nulls = ccam_comp_data[major_oxides].isna().any(axis=1).sum()
print(f"Total number of rows with null values in new: {ccam_rows_with_nulls}")

In [None]:
# Keep only targets present in both tables. Overlapping column names get a
# `_cd` suffix (reference data) or a `_ccam` suffix (CCAM data).
df = cd.merge(ccam_comp_data, on="Target", how="inner", suffixes=("_cd", "_ccam"))

In [None]:
# Column names of the major oxides on each side of the merged frame.
new_cols = [f"{mo}_ccam" for mo in major_oxides]
old_cols = [f"{mo}_cd" for mo in major_oxides]

# Bring both sides back to the bare oxide names so they can be compared
# column-by-column. `rename` returns a new frame, so `df` is left untouched.
new = df[["Target"] + new_cols].rename(columns=dict(zip(new_cols, major_oxides)))
old = df[["Target"] + old_cols].rename(columns=dict(zip(old_cols, major_oxides)))

# Row-level masks, computed once and reused by the report below.
# NOTE: `eq` treats NaN as unequal to everything (including NaN), so any row
# with a missing oxide on either side is counted as non-equivalent.
rows_equal = new.eq(old).all(axis=1)
new_has_nan = new.isnull().any(axis=1)
old_has_nan = old.isnull().any(axis=1)

# Rows whose values match exactly in every compared column.
print(f"Number of equivalent rows: {rows_equal.sum()}")

# Rows that differ in at least one column (the complement of the count above).
print(f"Number of non-equivalent rows: {(~rows_equal).sum()}")

# Rows with a missing value in old but complete in new.
print(f"Number of rows that have nan values in old and not in new: {(old_has_nan & ~new_has_nan).sum()}")

# Rows with a missing value in new but complete in old.
print(
    f"Number of rows that have nan values in new and not in old: {(new_has_nan & ~old_has_nan).sum()}"
)

# Rows with a missing value in new, regardless of old.
print(
    f"Number of rows that are null in new (after migration): {new_has_nan.sum()}"
)

In [None]:
# Share of CCAM rows whose Target also occurs in the reference data:
# merged (inner-join) row count over the CCAM row count.
amount_of_repeat_values = df['Target'].size
ccam_target_count = ccam_comp_data['Target'].size
print(
    "% repeat values: ",
    f"{amount_of_repeat_values} / {ccam_target_count} = {amount_of_repeat_values / ccam_target_count * 100:.2f}%",
)

In [None]:
# Anti-join: a left merge with `indicator=True` tags each row with its origin
# in `_merge`; rows tagged "left_only" are CCAM targets with no match in `cd`.
left_join = ccam_comp_data.merge(cd, on="Target", how="left", indicator=True)
rows_in_ccam_not_in_cd = left_join.loc[left_join["_merge"].eq("left_only")].drop(columns="_merge")

# Display just the unmatched target names.
rows_in_ccam_not_in_cd[["Target"]]