# Cleaning boiling points from PubChem's PUG-View

Here, the fetched boiling points for the compounds in the PubChemLite dataset are cleaned.

In [None]:
# NOTE(review): rdkit is not imported in the cells shown here; presumably it is
# needed to unpickle the 'ROMol' column of the DataFrame loaded below — confirm.
%pip install rdkit

In [None]:
import pandas as pd
import numpy as np

import pickle
import re

In [None]:
# Load the pickled PubChemLite table into `pcl`.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
with open('2025-04-22_PubChemLite_spectral_data.pkl', 'rb') as pickle_file:
    pcl = pickle.load(pickle_file)

In [None]:
# Display the loaded object as the cell output.
pcl

In [None]:
# Preview the first rows (head() defaults to five rows).
pcl.head()

In [None]:
# Independent copy of the first 1000 rows, e.g. for quick experiments.
pcl_head = pcl.iloc[:1000].copy()

In [None]:
# Scratch cell: sanity-check how NumPy averages a small list of integers.
numbers = list(range(1, 5))

mean_number = np.asarray(numbers).mean()
mean_number

In [None]:
#written by ChatGPT
import re
import numpy as np

# Matches decimals ("117.5", ".5") and integers, each with an optional sign.
# A '-'/'+' only counts as a sign when it does not directly follow a word
# character or '.', so the hyphen in a range such as "117.5-118.5" is treated
# as a separator rather than a minus sign, while a leading "-10" keeps its sign.
number_pattern = re.compile(r'(?:(?<![\w.])[-+])?(?:\d*\.\d+|\d+)')

# Matches a pressure reading: a number (named group "value") followed by one
# of the recognised units (named group "unit"), case-insensitively and with
# optional whitespace between the number and the unit and inside "mm hg".
pressure_pattern = re.compile(
    r'(?P<value>\d+\.?\d*)\s*(?P<unit>mm\s*hg|torr|atm|kpa|pa)',
    re.IGNORECASE
)

def fahrenheit_to_celsius(f):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius."""
    # 32 °F is the freezing point of water; one °F step is 5/9 of a °C step.
    offset_from_freezing = f - 32
    return offset_from_freezing * 5 / 9

def pressure_is_ok(value, unit):
    """Return True if a pressure reading is close to standard atmospheric pressure.

    Args:
        value: The pressure magnitude; anything ``float()`` accepts
            (number or numeric string).
        unit: The unit text, e.g. ``'mm Hg'``, ``'torr'``, ``'atm'``,
            ``'kPa'`` or ``'Pa'``. Case and embedded whitespace are ignored.

    Returns:
        True when the value lies inside the accepted window for its unit,
        False when it lies outside or the unit is not recognised.

    Raises:
        ValueError: If ``value`` cannot be converted to a float.
    """
    # Remove *all* whitespace (tabs, non-breaking spaces, ...), not just the
    # plain space, so units written like "mm\tHg" are still recognised.
    unit = "".join(unit.lower().split())
    value = float(value)

    # Inclusive (low, high) windows around one atmosphere for each unit.
    accepted_ranges = {
        'mmhg': (750, 770),
        'torr': (750, 770),
        'atm': (0.95, 1.05),
        'kpa': (99, 103),        # Roughly ±2%
        'pa': (99000, 103000),   # Around 101325 Pa
    }

    bounds = accepted_ranges.get(unit)
    if bounds is None:
        return False  # Unknown unit

    low, high = bounds
    return low <= value <= high

def extract_boiling_point(entries):
    """Reduce a collection of raw boiling-point strings to one mean value.

    For every usable entry one temperature is derived:

    * entries whose first pressure reading is rejected by ``pressure_is_ok``
      are skipped entirely;
    * pressure readings are removed from the text before numbers are
      extracted, so a pressure value is never mistaken for a temperature;
    * if the remaining text looks like a range (contains the word "to" or a
      "digit - digit" pattern) and has at least two numbers, the mean of the
      first two numbers is used, otherwise the first number;
    * if the entry contains "°F" (case-insensitive) the value is converted
      with ``fahrenheit_to_celsius``.

    Args:
        entries: Iterable of raw boiling-point entries. Items that are not
            non-empty strings (``None``, ``NaN``, ``''`` ...) are ignored.

    Returns:
        The mean of the per-entry values, or ``np.nan`` if no entry yields
        a value.
    """
    boiling_points = []

    for entry in entries:
        # Skip missing values instead of letting the regex raise TypeError.
        if not isinstance(entry, str) or not entry:
            continue

        # Drop entries measured at a clearly non-atmospheric pressure.
        pressure_match = pressure_pattern.search(entry)
        if pressure_match and not pressure_is_ok(
            pressure_match.group('value'), pressure_match.group('unit')
        ):
            continue

        # Blank out the pressure readings so that e.g. the 760 in
        # "760 mm Hg" cannot be picked up as a temperature.
        temperature_text = pressure_pattern.sub(' ', entry)

        # 1992 is excluded — presumably a citation year such as
        # "(NTP, 1992)" in the raw data; TODO confirm.
        candidates = [
            number
            for number in map(float, number_pattern.findall(temperature_text))
            if number != 1992
        ]

        # Nothing numeric left (e.g. a bare "sublimes" note) → no value.
        if not candidates:
            continue

        looks_like_range = (
            re.search(r'\bto\b', temperature_text.lower())
            or re.search(r'\d\s*[-–]\s*\d', temperature_text)
        )
        if looks_like_range and len(candidates) >= 2:
            value = np.mean(candidates[:2])
        else:
            value = candidates[0]

        # Fahrenheit?
        if '°f' in entry.lower():
            value = fahrenheit_to_celsius(value)

        boiling_points.append(value)

    return np.mean(boiling_points) if boiling_points else np.nan

In [None]:
# Clean every row's list of raw boiling-point strings; rows whose value is
# not a list get NaN.
pcl['boiling_point_matched_cleaned'] = pcl['boiling_point_matched'].apply(
    lambda raw: extract_boiling_point(raw) if isinstance(raw, list) else np.nan
)

pcl

In [None]:
# Columns kept for the visualization DataFrame, in output order.
visualization_columns = [
    'name',
    'pubchem_cid',
    'SMILES_pubchem',
    'SMILES',
    'ROMol',
    'InChIKey',
    'InChIKey14',
    'XLogP_pubchem',
    'gcms_spectra',
    'lcms_spectra',
    'boiling_point_matched',
    'boiling_point_matched_cleaned',
    'monoisotopic_mass',
]
# Copy so later changes to pcl_visualize do not touch pcl.
pcl_visualize = pcl[visualization_columns].copy()

In [None]:
# Persist the visualization DataFrame as a pickle file.
with open('2025-04-29_pcl_visualization_dataframe.pkl', 'wb') as pickle_file:
    pickle.dump(pcl_visualize, pickle_file)