In [1]:
import pandas as pd
from pymatgen.core import Composition

# === Load Raw CSV ===
# Read the raw table, keep only the two columns this notebook works with, and
# discard every row in which either of them is missing.
df = (
    pd.read_csv("magnetic_materials.csv")  # Adjust filename if needed
    .loc[:, ['Material_Name', 'Curie']]
    .dropna()
)

def extract_temperature(value):
    """Parse a raw Curie-temperature entry into a float expressed in Kelvin.

    Accepted forms are a leading number optionally followed by a unit, e.g.
    ``"600 K ± 5"``, ``"580K"``, ``"298 °C"``, ``"25°C"`` or a bare number.
    Only the leading number (and the unit attached to it) is used; anything
    after the first whitespace-delimited value, such as an uncertainty, is
    ignored.

    Parameters
    ----------
    value : object
        Raw cell content; it is converted with ``str()`` before parsing.

    Returns
    -------
    float or None
        The temperature in Kelvin. Values tagged with a Celsius unit are
        shifted by 273.15; values with ``K`` or no recognised unit are returned
        unchanged (i.e. assumed to already be in Kelvin). ``None`` is returned
        for missing input or when the entry does not start with a clean number
        (e.g. ``"unknown"``, ``"600-620 K"``, ``"1,043 K"``).
    """
    import re  # function-scope import: `re` is not imported by the notebook cell

    if pd.isna(value):
        return None

    # number, optional unit, and then whitespace/end so that ranges
    # ("600-620") or thousands separators ("1,043") are rejected rather than
    # silently truncated to their first digits.
    match = re.match(
        r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)"
        r"\s*(K|[°º]\s*C|℃)?(?=\s|$)",
        str(value),
    )
    if match is None:
        return None

    number, unit = match.groups()
    temperature = float(number)
    if unit is not None and unit != "K":
        # The only non-Kelvin units the pattern accepts are Celsius spellings.
        temperature += 273.15
    return temperature

# === Parse temperatures and rename Material column ===
# Add the parsed numeric column (via extract_temperature) and give the
# material column its shorter name in a single chained step.
df = df.assign(
    **{"Temperature_K": df["Curie"].apply(extract_temperature)}
).rename(columns={"Material_Name": "Material"})

# Keep only the two output columns, drop rows whose temperature could not be
# parsed, and discard negative temperatures.
df_cleaned = (
    df.loc[:, ["Material", "Temperature_K"]]
    .dropna()
    .loc[lambda frame: frame["Temperature_K"] >= 0]
)

# Report how many rows survived the cleaning.
print(df_cleaned.shape[0])

df_cleaned.to_csv("cleaned_curie_temperatures.csv", index=False)

21024


  df = pd.read_csv("magnetic_materials.csv")  # Adjust filename if needed


In [2]:
import pandas as pd

# Read back the cleaned table written by the previous cell
df = pd.read_csv("cleaned_curie_temperatures.csv")

# Collapse repeated materials into a single row each: rows are grouped on
# 'Material' and every numeric column is reduced to its per-group MEDIAN.
by_material = df.groupby("Material", as_index=False)
df_grouped = by_material.median(numeric_only=True)

# Persist the one-row-per-material table
df_grouped.to_csv("cleaned_curie_temperatures_deduplicated.csv", index=False)

print("✅ Saved deduplicated dataset as 'cleaned_curie_temperatures_deduplicated.csv'")


✅ Saved deduplicated dataset as 'cleaned_curie_temperatures_deduplicated.csv'


In [3]:
import pandas as pd
from pymatgen.core import Composition
from tqdm import tqdm

# Load the de-duplicated table (the file written by the grouping step above,
# which holds one row per 'Material').
df = pd.read_csv("cleaned_curie_temperatures_deduplicated.csv")

# Parse formulas safely
def get_fractional_composition(formula):
    """Return the fractional elemental composition of ``formula`` as a dict.

    Parameters
    ----------
    formula : object
        Value handed to pymatgen's ``Composition`` constructor (normally a
        chemical-formula string).

    Returns
    -------
    dict
        The result of ``fractional_composition.get_el_amt_dict()`` for the
        parsed formula, or an empty dict when the formula cannot be parsed.
    """
    try:
        comp = Composition(formula)
        return comp.fractional_composition.get_el_amt_dict()
    except Exception:
        # Best-effort parsing: an unparseable formula yields no element
        # fractions. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit able to stop a long-running apply.
        return {}

# Parse every formula, showing a progress bar while doing so
tqdm.pandas(desc="🧪 Parsing elemental fractions")
fraction_dicts = df["Material"].progress_apply(get_fractional_composition)

# One column per element seen anywhere in the data; elements absent from a
# formula get 0, and each column name is prefixed to avoid clashes with
# the existing columns.
fraction_df = (
    pd.DataFrame(list(fraction_dicts))
    .fillna(0)
    .rename(columns=lambda el: f"el_frac_{el}")
)

# Place the element-fraction columns next to the original ones
df_combined = pd.concat((df, fraction_df), axis="columns")

# Write the enriched table to disk
df_combined.to_csv("composition_enriched_dataset.csv", index=False)
print("✅ Saved to composition_enriched_dataset.csv")


🧪 Parsing elemental fractions: 100%|█████████████████████████████████████████| 14704/14704 [00:00<00:00, 27129.04it/s]


✅ Saved to composition_enriched_dataset.csv


In [4]:
import pandas as pd
from pymatgen.core import Composition, Element
from tqdm import tqdm

# Read the composition-enriched table
df = pd.read_csv("composition_enriched_dataset.csv")  # or your path

# Register progress_apply on pandas objects
tqdm.pandas()

# Element symbols counted as "magnetic" by the descriptors below
magnetic_elements = {
    'Cr', 'Mn', 'Fe', 'Co', 'Ni',
    'Nd', 'Sm', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
}

# Element symbols counted as "rare earth" by the descriptors below
# (La through Lu; Sc and Y are not included)
rare_earth_elements = {
    'La', 'Ce', 'Pr', 'Nd', 'Pm',
    'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
    'Ho', 'Er', 'Tm', 'Yb', 'Lu',
}

# Function to compute domain-aware descriptors
def compute_domain_props(formula):
    """Compute the magnetic and rare-earth proportions of a formula.

    Parameters
    ----------
    formula : object
        Value handed to pymatgen's ``Composition`` constructor (normally a
        chemical-formula string).

    Returns
    -------
    pandas.Series
        Two entries, ``Magnetic_proportion`` and ``Rare_Earth_proportion``:
        the summed amounts of the elements listed in ``magnetic_elements`` /
        ``rare_earth_elements`` divided by the summed amount of all elements
        in the formula. Both entries are ``None`` when the formula cannot be
        parsed or the total amount is zero.
    """
    try:
        # Build the element -> amount mapping once and reuse it for the total
        # and for both proportions.
        el_dict = Composition(formula).get_el_amt_dict()
        total = sum(el_dict.values())

        mag_prop = sum(el_dict.get(el, 0) for el in magnetic_elements) / total
        rare_earth_prop = sum(el_dict.get(el, 0) for el in rare_earth_elements) / total
        return pd.Series({
            "Magnetic_proportion": mag_prop,
            "Rare_Earth_proportion": rare_earth_prop
        })
    except Exception:
        # Best-effort: a formula that fails to parse (or divides by a zero
        # total) gets empty descriptors. `Exception` instead of a bare except
        # lets KeyboardInterrupt/SystemExit interrupt the apply.
        return pd.Series({
            "Magnetic_proportion": None,
            "Rare_Earth_proportion": None
        })

# Compute both descriptors for every material (with a progress bar), then
# attach the resulting two-column frame to the main table.
domain_props = df["Material"].progress_apply(compute_domain_props)
df[["Magnetic_proportion", "Rare_Earth_proportion"]] = domain_props

# Write the table, now including the domain-aware descriptors, to disk
df.to_csv("composition_enriched_dataset_with_domain_props.csv", index=False)
print("✅ Domain-aware descriptors added and file saved.")


100%|██████████████████████████████████████████████████████████████████████████| 14704/14704 [00:03<00:00, 4496.39it/s]


✅ Domain-aware descriptors added and file saved.


In [5]:
# Sanity check: reload the saved file and report how many rows it holds
df = pd.read_csv("composition_enriched_dataset_with_domain_props.csv")

print(df.shape[0])

14704


In [6]:
import pandas as pd
import numpy as np
from pymatgen.core import Composition
from tqdm import tqdm
from matminer.featurizers.composition import ElementProperty, Stoichiometry
import os

# === Step 1: Load Dataset ===
# Only the read itself is guarded, so a failure in the status print can never
# be misreported as a CSV-load error. The original exception is chained
# explicitly so its traceback stays attached to the RuntimeError.
try:
    df = pd.read_csv("composition_enriched_dataset_with_domain_props.csv")
except Exception as e:
    raise RuntimeError(f"❌ Failed to load CSV: {e}") from e
print("✅ Loaded dataset:", df.shape)

# === Step 2: Validate and Parse Formulas ===
def try_parse_formula(f):
    """Parse ``f`` into a pymatgen ``Composition``; return ``None`` on failure.

    Parameters
    ----------
    f : object
        Value handed to the ``Composition`` constructor (normally a
        chemical-formula string).

    Returns
    -------
    Composition or None
        The parsed composition, or ``None`` when ``Composition(f)`` raises.
    """
    try:
        return Composition(f)
    except Exception:
        # Unparseable entries become None so they can be removed with dropna();
        # KeyboardInterrupt/SystemExit are intentionally not swallowed.
        return None

print("🔍 Parsing formulas...")
tqdm.pandas()
df["composition"] = df["Material"].progress_apply(try_parse_formula)

# Rows whose formula could not be parsed carry None and are dropped here.
before = len(df)
df = df.dropna(subset=["composition"])
after = len(df)
print(f"✅ Valid formulas: {after}/{before}")

print("✅ After base descriptor filtering:", df.shape)

# === Step 3: Chunk the Data ===
# Split the row *positions* and slice with iloc instead of handing the
# DataFrame to np.array_split: the latter goes through DataFrame.swapaxes,
# which is deprecated in recent pandas and emits a FutureWarning. Chunk sizes
# are the same as np.array_split would produce.
n_chunks = 5
chunks = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), n_chunks)]
print(f"🔧 Total chunks: {len(chunks)}")

# === Step 4: Setup Output File ===
# Start from a clean file because chunks are appended one after another.
output_file = "valid_descriptor_dataset.csv"
if os.path.exists(output_file):
    os.remove(output_file)
print(f"🗂️ Output file cleared: {output_file}")

# === Step 5: Setup Featurizers (FAST ONLY FOR DEBUGGING) ===
featurizers = [
    ElementProperty.from_preset("magpie"),
    Stoichiometry()
]

# Run each featurizer in a single process when it exposes that setting.
for f in featurizers:
    if hasattr(f, "set_n_jobs"):
        f.set_n_jobs(1)

# === Step 6: Process Each Chunk and Save ===
failed_chunks = []  # 1-based numbers of chunks that raised, for the final report
for i, chunk in enumerate(tqdm(chunks, desc="⚙️ Featurizing")):
    print(f"🔹 Processing chunk {i+1} of size {len(chunk)}")
    try:
        for f in featurizers:
            chunk = f.featurize_dataframe(chunk, "composition", ignore_errors=True)
        # The Composition objects were only needed as featurizer input.
        chunk = chunk.drop(columns=["composition"])

        # Append chunk to file; the header is written only by the first chunk
        # that actually reaches the file.
        write_header = not os.path.exists(output_file)
        chunk.to_csv(output_file, mode='a', header=write_header, index=False)
        print(f"✅ Written chunk {i+1} to file")
    except Exception as e:
        # Best-effort: keep going with the remaining chunks, but remember the
        # failure so the summary below does not claim full success.
        failed_chunks.append(i + 1)
        print(f"❌ Error in chunk {i+1}: {e}")

if failed_chunks:
    print(
        f"⚠️ {len(failed_chunks)} of {len(chunks)} chunks failed {failed_chunks}; "
        "partial file saved:",
        output_file,
    )
else:
    print("🎉 All chunks complete. Final file saved:", output_file)


✅ Loaded dataset: (14704, 104)
🔍 Parsing formulas...


100%|█████████████████████████████████████████████████████████████████████████| 14704/14704 [00:00<00:00, 61268.12it/s]
  return bound(*args, **kwds)
In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


✅ Valid formulas: 13116/14704
✅ After base descriptor filtering: (13116, 105)
🔧 Total chunks: 5
🗂️ Output file cleared: valid_descriptor_dataset.csv


⚙️ Featurizing:   0%|                                                                            | 0/5 [00:00<?, ?it/s]

🔹 Processing chunk 1 of size 2624


ElementProperty:   0%|          | 0/2624 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/2624 [00:00<?, ?it/s]

⚙️ Featurizing:  20%|█████████████▌                                                      | 1/5 [00:09<00:38,  9.58s/it]

✅ Written chunk 1 to file
🔹 Processing chunk 2 of size 2623


ElementProperty:   0%|          | 0/2623 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/2623 [00:00<?, ?it/s]

⚙️ Featurizing:  40%|███████████████████████████▏                                        | 2/5 [00:19<00:30, 10.04s/it]

✅ Written chunk 2 to file
🔹 Processing chunk 3 of size 2623


ElementProperty:   0%|          | 0/2623 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/2623 [00:00<?, ?it/s]

⚙️ Featurizing:  60%|████████████████████████████████████████▊                           | 3/5 [00:30<00:20, 10.37s/it]

✅ Written chunk 3 to file
🔹 Processing chunk 4 of size 2623


ElementProperty:   0%|          | 0/2623 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/2623 [00:00<?, ?it/s]

⚙️ Featurizing:  80%|██████████████████████████████████████████████████████▍             | 4/5 [00:45<00:12, 12.27s/it]

✅ Written chunk 4 to file
🔹 Processing chunk 5 of size 2623


ElementProperty:   0%|          | 0/2623 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/2623 [00:00<?, ?it/s]

⚙️ Featurizing: 100%|████████████████████████████████████████████████████████████████████| 5/5 [01:02<00:00, 12.54s/it]

✅ Written chunk 5 to file
🎉 All chunks complete. Final file saved: valid_descriptor_dataset.csv





In [7]:
# Audible "run finished" cue. `winsound` exists only on Windows, so on other
# platforms fall back to the terminal bell character instead of failing the
# cell with an ImportError.
try:
    import winsound
except ImportError:
    print("\a", end="", flush=True)
else:
    winsound.Beep(1000, 500)  # frequency in Hz, duration in ms