In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path

from lib.config import AppConfig
from lib.reproduction import major_oxides

config = AppConfig()

In [None]:
# Column headers in the CCAM CSV carry a " (wt%)" suffix after each oxide name.
ccam_comp_MO = [f"{mo} (wt%)" for mo in major_oxides]
ds_path = Path(config.ccam_composition_data_path)

# Read the CSV (skipping its first line) and keep only the rows in which
# every major-oxide column is populated.
ccam_comp_data = pd.read_csv(ds_path, skiprows=1).dropna(subset=ccam_comp_MO)

# Sanity check: per-column count of missing values, all zero after the drop above.
print(ccam_comp_data[ccam_comp_MO].isnull().sum())


# Drop the " (wt%)" suffix so the columns are named after the bare oxides.
ccam_comp_data = ccam_comp_data.rename(columns=dict(zip(ccam_comp_MO, major_oxides)))

In [None]:
# Print the (rows, columns) shape of the filtered CCAM table, then preview its first rows
print(ccam_comp_data.shape)
ccam_comp_data.head()

In [None]:
# Import from the source package (like the other `lib.*` imports in this notebook)
# instead of the `build/lib/` copy, which is a build artifact that can be stale and
# is not tracked by %autoreload edits to the real module.
from lib.data_handling import CompositionData

# Load the reference composition table and keep only the rows in which every
# major-oxide column is populated (same filter as applied to the CCAM table).
cd = CompositionData(config.composition_data_path).composition_data
cd = cd.dropna(subset=major_oxides)
print(cd.shape)
cd.head()

In [None]:
# Count CCAM rows with at least one missing major oxide; rows with gaps were already
# dropped when the table was loaded, so this is expected to report 0.
print(f"Total number of rows with null values in new: {ccam_comp_data[major_oxides].isnull().any(axis=1).sum()}")

In [None]:
# Inner-join the two tables on "Target"; columns that exist in both tables are
# disambiguated with the "_cd" (reference) and "_ccam" (CCAM) suffixes.
df = cd.merge(
    ccam_comp_data,
    how="inner",
    on="Target",
    suffixes=("_cd", "_ccam"),
)

In [None]:
from typing import Any, Dict


def p(obj: Dict[str, Any]):
    """Print every entry of ``obj`` on its own line, formatted as ``key: value``."""
    for entry in obj.items():
        label, value = entry
        print(f"{label}: {value}")

In [None]:
# Split the merged frame into the CCAM ("new") and reference ("old") view of each
# target, stripping the merge suffixes so both frames share the same column names
# and can be compared element by element.
new_cols = [f"{mo}_ccam" for mo in major_oxides]
old_cols = [f"{mo}_cd" for mo in major_oxides]
new = df[["Target"] + new_cols].rename(columns=dict(zip(new_cols, major_oxides)))
old = df[["Target"] + old_cols].rename(columns=dict(zip(old_cols, major_oxides)))

# Row-wise masks. NaN never compares equal to NaN, so a row with a value missing
# on both sides is counted as non-equivalent.
rows_equal = new.eq(old).all(axis=1)
new_has_nan = new.isnull().any(axis=1)
old_has_nan = old.isnull().any(axis=1)

p(
    {
        "Number of equivalent rows": rows_equal.sum(),
        # Complement of the mask above (previously this repeated the "equivalent" count).
        "Number of non-equivalent rows": (~rows_equal).sum(),
        "Number of rows that have nan values in old and not in new": (old_has_nan & ~new_has_nan).sum(),
        "Number of rows that have nan values in new and not in old": (new_has_nan & ~old_has_nan).sum(),
        "Number of rows that are null in new (after migration)": new_has_nan.sum(),
    }
)

In [None]:
# Size of the CCAM table versus the number of rows the inner join on "Target" produced.
# NOTE(review): if "Target" is not unique in either table the join yields one row per
# matching pair, so the ratio below could exceed 100% - confirm uniqueness.
ccam_size = len(ccam_comp_data["Target"])
amount_of_repeat_values = len(df["Target"])

p(
    {
        "% repeat values": f"{amount_of_repeat_values} / {ccam_size} = {amount_of_repeat_values / ccam_size * 100:.2f}%"
    }
)

In [None]:
# Left-join CCAM onto the reference table; the "_merge" indicator column marks the
# CCAM rows that found no partner ("left_only").
left_join = ccam_comp_data.merge(cd, how="left", on="Target", indicator=True)
rows_in_ccam_not_in_cd = left_join.loc[left_join["_merge"] == "left_only"].drop(columns=["_merge"])

# Cross-check: unmatched rows must equal the CCAM size minus the inner-join row count.
assert len(
    rows_in_ccam_not_in_cd[["Target"]]
) == (ccam_size - amount_of_repeat_values), f"{ccam_size - amount_of_repeat_values} != {len(rows_in_ccam_not_in_cd[['Target']])}"

p({"Number of rows in ccam not in cd": len(rows_in_ccam_not_in_cd)})

In [None]:
# Map the "<oxide> (wt%)" headers to bare oxide names on a new frame. Headers that are
# not present (e.g. because the table was already renamed when it was loaded) are
# silently skipped by DataFrame.rename, in which case this is just a copy.
ccam_comp_data_renamed = ccam_comp_data.rename(columns=dict(zip(ccam_comp_MO, major_oxides)))

In [None]:
from typing import List
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Ignore FutureWarnings for everything executed after this point: simplefilter installs
# a process-wide filter, so it is not limited to this cell.
warnings.simplefilter(action='ignore', category=FutureWarning)

def prepare_data_for_plotting(df1: pd.DataFrame, df2: pd.DataFrame, label_col_name='Dataset', df1_name="Dataset 1", df2_name="Dataset 2"):
    """
    Stack two dataframes into one, tagging every row with the dataset it came from.

    The inputs are not modified: the label column is added to copies, so the caller's
    frames keep their original columns and the call can safely be repeated.

    Parameters:
    - df1: The first DataFrame.
    - df2: The second DataFrame.
    - label_col_name: The name of the column to be added to distinguish between the datasets.
      An existing column of that name is overwritten in the result.
    - df1_name: Label written into `label_col_name` for every row of `df1`.
    - df2_name: Label written into `label_col_name` for every row of `df2`.

    Returns:
    A new DataFrame with the rows of `df1` followed by the rows of `df2`, a fresh
    0..n-1 index, and the additional label column.
    """
    labelled_frames = []
    for frame, name in ((df1, df1_name), (df2, df2_name)):
        # Work on a copy so the label column never leaks into the caller's frame.
        labelled = frame.copy()
        labelled[label_col_name] = name
        labelled_frames.append(labelled)
    return pd.concat(labelled_frames, ignore_index=True)

def plot_boxplots(df: pd.DataFrame, columns: List[str], label_col_name='Dataset'):
    """
    Generates box plots for each specified column, separated by dataset.

    Each panel is annotated with `n=<rows>` above every box, where the count is the
    number of rows carrying that dataset label (rows whose value in the plotted
    column is NaN are included in the count).

    Parameters:
    - df: DataFrame containing the data to plot, including a label column distinguishing between datasets.
    - columns: List of column names to plot.
    - label_col_name: The name of the column used to distinguish between datasets.
    """
    grid_cols = 4
    # Ceiling division for the row count; never fewer than two rows so that figures
    # with up to eight columns keep the 2 x 4 layout used so far.
    grid_rows = max(2, -(-len(columns) // grid_cols))
    plt.figure(figsize=(20, 5 * grid_rows))

    for index, column in enumerate(columns, start=1):
        plt.subplot(grid_rows, grid_cols, index)

        # groupby returns the labels sorted; pass that same order to seaborn so the
        # x position of each "n=" annotation matches the box it describes.
        dataset_sizes = df.groupby(label_col_name)[column].size()
        box_order = list(dataset_sizes.index)

        sns.boxplot(x=label_col_name, y=column, data=df, order=box_order)
        plt.title(f'Box Plot of {column}')

        # Anchor the annotations at the top of the current y-range.
        y_top = plt.ylim()[1]
        for position, size in enumerate(dataset_sizes):
            plt.text(position, y_top, f'n={size}', horizontalalignment='center', size='small', color='black', weight='semibold')

    plt.tight_layout()
    plt.show()

In [None]:
def clean_data(df):
    """
    Return a numeric copy of `df`.

    Every column is converted to text, any '<' characters are removed (so a
    below-detection-limit entry such as "<0.5" becomes 0.5), and the result is parsed
    with `pd.to_numeric`; anything that still cannot be parsed becomes NaN. This
    applies to all columns, so purely textual columns end up entirely NaN.

    The input frame is left untouched.
    """
    cleaned = df.copy()
    for column in cleaned.columns:
        # '<' is matched literally (regex=False), not as a pattern.
        as_text = cleaned[column].astype(str).str.replace('<', '', regex=False)
        # Non-numeric leftovers are coerced to NaN rather than raising.
        cleaned[column] = pd.to_numeric(as_text, errors='coerce')
    return cleaned

In [None]:
# Coerce both tables to numeric. clean_data converts every column, so non-numeric
# columns (presumably including "Target") come back as NaN.
df1 = clean_data(ccam_comp_data_renamed)
df2 = clean_data(cd)
# Stack the two cleaned tables, labelling rows "CCAM" / "PDS" in a "Source" column,
# and draw one box plot per major oxide.
combined_df = prepare_data_for_plotting(df1, df2, 'Source', 'CCAM', 'PDS')
plot_boxplots(combined_df, major_oxides, 'Source')

In [None]:
def plot_density_and_cdf(df: pd.DataFrame, columns: List[str], label_col_name="Dataset"):
    """
    Draw one row of plots per column: a filled KDE on the left and an empirical CDF
    on the right, with one curve per dataset label.

    Parameters:
    - df: DataFrame containing the data to plot, including the label column.
    - columns: List of column names to plot; each one gets its own row of axes.
    - label_col_name: The name of the column used to distinguish between datasets.
    """
    num_columns = len(columns)
    # squeeze=False keeps `axes` two-dimensional even when a single column is requested.
    fig, axes = plt.subplots(num_columns, 2, figsize=(20, 5 * num_columns), squeeze=False)

    for (ax_kde, ax_cdf), column in zip(axes, columns):
        for label, group_df in df.groupby(label_col_name):
            values = group_df[column].dropna()
            # `fill` is the current name of the deprecated `shade` argument.
            sns.kdeplot(values, ax=ax_kde, label=label, fill=True)
            sns.ecdfplot(values, ax=ax_cdf, label=label)

        ax_kde.set_title(f"Density Plot of {column}")
        ax_kde.legend()
        ax_cdf.set_title(f"Cumulative Distribution of {column}")
        ax_cdf.legend()

    plt.tight_layout()
    plt.show()


def plot_proportional_counts(df: pd.DataFrame, label_col_name="Dataset"):
    """
    Draw a bar chart showing what fraction of all rows carries each dataset label.

    Parameters:
    - df: DataFrame containing the label column.
    - label_col_name: The name of the column used to distinguish between datasets.
    """
    shares = df[label_col_name].value_counts(normalize=True)
    df_counts = shares.reset_index().set_axis([label_col_name, "Proportion"], axis=1)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=label_col_name, y="Proportion", data=df_counts, ax=ax)
    ax.set_title("Proportional Counts of Datasets")
    ax.set_ylabel("Proportion of Total Data Points")
    plt.show()

In [None]:
# KDE and empirical CDF of every major oxide, one curve per source.
plot_density_and_cdf(df=combined_df, columns=major_oxides, label_col_name='Source')

In [None]:
# Share of the combined rows contributed by each source.
plot_proportional_counts(combined_df, 'Source')

In [None]:
def plot_overlay_histograms(df: pd.DataFrame, columns: List[str], label_col_name='Dataset'):
    """
    Draw one panel per column, each overlaying an unfilled step histogram per dataset label.

    Parameters:
    - df: DataFrame containing the data to plot, including the label column.
    - columns: List of column names to plot; the panels are laid out in a single row.
    - label_col_name: The name of the column used to distinguish between datasets.
    """
    num_columns = len(columns)
    # squeeze=False always yields a 2-D array of axes, so a single column works too.
    fig, axes = plt.subplots(1, num_columns, figsize=(20, 5), squeeze=False)

    for ax, column in zip(axes[0], columns):
        for label, group_df in df.groupby(label_col_name):
            sns.histplot(group_df[column].dropna(), ax=ax, label=label, element='step', fill=False)
        ax.set_title(f'Overlay Histogram of {column}')
        ax.legend()

    plt.tight_layout()
    plt.show()

    
# Overlaid step histograms of every major oxide, one outline per source.
plot_overlay_histograms(df=combined_df, columns=major_oxides, label_col_name='Source')

In [None]:
import numpy as np

def plot_correlation_heatmaps(df: pd.DataFrame, columns: List[str], label_col_name='Dataset'):
    """
    Draw one correlation heatmap per dataset label, side by side.

    Only the strict lower triangle of each correlation matrix is shown; the diagonal
    and the (redundant) upper triangle are masked out.

    Parameters:
    - df: DataFrame containing the data, including the label column.
    - columns: List of column names whose pairwise correlations are plotted.
    - label_col_name: The name of the column used to distinguish between datasets.
    """
    groups = list(df.groupby(label_col_name))
    # One panel per label actually present (at least one so the figure can be built).
    num_panels = max(1, len(groups))
    fig, axes = plt.subplots(1, num_panels, figsize=(10 * num_panels, 7), squeeze=False)

    for ax, (label, group_df) in zip(axes[0], groups):
        corr = group_df[columns].corr()
        # Build the mask from the matrix shape, not from its values: masking with the
        # correlation values themselves would leave any exactly-zero entry visible.
        mask = np.triu(np.ones(corr.shape, dtype=bool))
        sns.heatmap(corr, ax=ax, annot=True, fmt=".2f", cmap=plt.cm.Reds, mask=mask)
        ax.set_title(f'Correlation Matrix for {label}')

    plt.tight_layout()
    plt.show()

# Correlation structure of the major oxides within each source.
plot_correlation_heatmaps(df=combined_df, columns=major_oxides, label_col_name='Source')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_distributions(df: pd.DataFrame, columns: List[str], label_col_name='Dataset', df1_name="Dataset 1", df2_name="Dataset 2"):
    """
    Draw one row of plots per column comparing two datasets by raw counts: a 30-bin
    histogram on the left and a cumulative-count ECDF on the right.

    Parameters:
    - df: DataFrame containing the data to plot, including the label column.
    - columns: List of column names to plot; each one gets its own row of axes.
    - label_col_name: The name of the column used to distinguish between datasets.
    - df1_name: Label value selecting the first dataset (drawn in the default colour).
    - df2_name: Label value selecting the second dataset (drawn in orange).
    """
    num_columns = len(columns)
    # 2 plots per row: Histogram and Step CDF. squeeze=False keeps `axes` 2-D for a single column.
    fig, axes = plt.subplots(num_columns, 2, figsize=(20, 5 * num_columns), squeeze=False)

    # The row selection does not depend on the plotted column, so do it once up front.
    first = df[df[label_col_name] == df1_name]
    second = df[df[label_col_name] == df2_name]

    for (ax_hist, ax_cdf), column in zip(axes, columns):
        # Histogram of the column for both datasets
        sns.histplot(first[column], bins=30, kde=False, ax=ax_hist, label=df1_name, stat='count')
        sns.histplot(second[column], bins=30, kde=False, ax=ax_hist, label=df2_name, stat='count', color='orange')
        ax_hist.set_title(f'Histogram of {column}')
        ax_hist.legend()

        # Step CDF of the column with actual counts
        sns.ecdfplot(first[column], ax=ax_cdf, label=df1_name, stat='count')
        sns.ecdfplot(second[column], ax=ax_cdf, label=df2_name, stat='count', color='orange')
        ax_cdf.set_title(f'Cumulative Count of {column}')
        ax_cdf.legend()

    plt.tight_layout()
    plt.show()

# Count-based histograms and cumulative counts of every major oxide, CCAM vs. PDS.
plot_distributions(
    combined_df,
    major_oxides,
    label_col_name='Source',
    df1_name='CCAM',
    df2_name='PDS',
)
