In [32]:
import requests

url = "https://www.atnf.csiro.au/research/pulsar/psrcat/glitchTbl.html"
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
with open('raw_page.html', 'w', encoding='utf-8') as f:
    f.write(response.text)



In [33]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Load the saved HTML file
with open('raw_page.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

# Try multiple extraction methods
def extract_pulsar_data(html):
    # Method 1: Try pandas read_html
    try:
        tables = pd.read_html(html)
        if len(tables) > 0:
            return tables[0]  # Return first table found
    except:
        pass

    # Method 2: BeautifulSoup parsing
    soup = BeautifulSoup(html, 'html.parser')

    # Look for  tags (common for fixed-width data)
    pre = soup.find('pre')
    if pre:
        lines = [line.strip() for line in pre.text.split('\n') if line.strip()]

        # Find header line
        header_line = next((i for i, line in enumerate(lines)
                          if 'PSRJ' in line and 'GLEP' in line), None)

        if header_line is not None:
            # Process data lines
            data = []
            for line in lines[header_line+1:]:
                # Clean and split line
                clean_line = re.sub(r'\s+', ' ', line).strip()
                if re.match(r'^(J|B)\d', clean_line):  # Matches pulsar names
                    data.append(clean_line.split(None, 7))  # Split into max 8 columns

            if data:
                headers = ['PSR', 'PSRJ', 'GLEP (MJD)', 'Δν/ν (10^9)',
                          'Δν/ν\' (10^3)', 'Q', 'T_d (d)', 'Reference']
                return pd.DataFrame(data, columns=headers)

    # Method 3: Direct regex parsing
    # Method 3: Direct regex parsing
    table_pattern = re.compile(
        r'(J\d{4}\+\d{4}|B\d{4}\+\d{2})\s+'    # PSR
        r'(J\d{4}\+\d{4}|B\d{4}\+\d{2})\s+'    # PSRJ
        r'(\d+\.\d+|\d+)\s+'            # GLEP
        r'([\d\.]+?)\s+'               # Δν/ν
        r'([\d\.\-]+?|-)\s+'           # Δν/ν'
        r'([\d\.]+?|-)\s+'             # Q
        r'([\d\.]+?|-)\s+'             # T_d
        r'(\w+)'                               # Reference
    )
    matches = table_pattern.findall(html)
    if matches:
        headers = ['PSR', 'PSRJ', 'GLEP (MJD)', 'Δν/ν (10^9)',
                  'Δν/ν\' (10^3)', 'Q', 'T_d (d)', 'Reference']
        return pd.DataFrame(matches, columns=headers)

    return None

# Extract data
df = extract_pulsar_data(html_content)

if df is not None:
    # Clean the data
    df = df.replace({'': pd.NA, '-': pd.NA, ' ': pd.NA})
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip()

    # Save to CSV
    df.to_csv('pulsar_glitches_final.csv', index=False)
    print(f"Successfully extracted {len(df)} records")
    print(df.head())
else:
    print("Could not extract table data. Please try manual extraction.")

Successfully extracted 645 records
            0           1           2            3                          4  \
0         PSR        PSRJ  GLEP (MJD)  Δν/ν (10-9)  Δν&middot/ν&middot (10-3)   
1  J0007+7303  J0007+7303   54952.652     553.7(6)                    0.97(6)   
2    B0144+59  J0147+5922   53682(15)     0.056(3)                   -0.21(5)   
3    B0154+61  J0157+6212    58283(3)       2.6(3)                       <NA>   
4  J0146+6145  J0146+6145  51141(248)     650(150)                      14(5)   

      5        6          7  
0     Q  T_d (d)  Reference  
1  <NA>     <NA>     awd+12  
2  <NA>     <NA>     ywml10  
3  <NA>     <NA>     bsa+22  
4  <NA>     <NA>      mks05  


  tables = pd.read_html(html)


In [34]:
import pandas as pd
from io import StringIO
import re


# 1. Clean column headers
df.columns = ['PSR', 'PSRJ', 'GLEP_MJD', 'delta_nu_nu', 'delta_nudot_nudot',
              'Q', 'T_d_days', 'Reference']

# 2. Fix HTML entities in values
def clean_html_entities(text):
    if pd.isna(text):
        return text
    replacements = {
        '&middot': '',
        '&minus': '-',
        '&times': '×'
    }
    for k, v in replacements.items():
        text = str(text).replace(k, v)
    return text.strip()

df = df.applymap(clean_html_entities)

# 3. Standardize number formats (e.g., 553.7(6) → 553.7±0.6)
def clean_numbers(val):
    if pd.isna(val) or not re.match(r'.*[0-9]', str(val)):
        return val
    val = str(val).replace('(', '±').replace(')', '')
    return val

df['delta_nu_nu'] = df['delta_nu_nu'].apply(clean_numbers)
df['delta_nudot_nudot'] = df['delta_nudot_nudot'].apply(clean_numbers)

# 4. Handle missing values consistently
df.replace(['', '-', 'NA', 'NaN', 'nan'], pd.NA, inplace=True)

# 5. Save final CSV
df.to_csv('pulsar_glitches_clean.csv', index=False, encoding='utf-8-sig')

print("Final cleaned data sample:")
print(df.head())

Final cleaned data sample:
          PSR        PSRJ    GLEP_MJD delta_nu_nu delta_nudot_nudot     Q  \
0         PSR        PSRJ  GLEP (MJD)  Δν/ν ±10-9        Δν/ν ±10-3     Q   
1  J0007+7303  J0007+7303   54952.652     553.7±6            0.97±6  <NA>   
2    B0144+59  J0147+5922   53682(15)     0.056±3           -0.21±5  <NA>   
3    B0154+61  J0157+6212    58283(3)       2.6±3              <NA>  <NA>   
4  J0146+6145  J0146+6145  51141(248)     650±150              14±5  <NA>   

  T_d_days  Reference  
0  T_d (d)  Reference  
1     <NA>     awd+12  
2     <NA>     ywml10  
3     <NA>     bsa+22  
4     <NA>      mks05  


  df = df.applymap(clean_html_entities)


In [35]:
import pandas as pd
import re
import numpy as np

# Function to parse values with uncertainties (e.g., "553.7±6" or "0.0046(7)")
def parse_value_with_uncertainty(value_str):
    if pd.isna(value_str) or value_str == '<NA>':
        return None, None
    # Match formats like "553.7±6" or "0.0046(7)"
    match = re.match(r'([-]?\d*\.?\d*)\s*[±(](\d*\.?\d*)[)]?', value_str)
    if match:
        main_value = float(match.group(1))  # Main value (e.g., 553.7 or 0.0046)
        uncertainty = float(match.group(2))  # Uncertainty (e.g., 6 or 7)
        # For Q values with (X), scale uncertainty to the decimal place of main value
        if '(' in value_str:
            decimal_places = len(match.group(1).split('.')[-1]) if '.' in match.group(1) else 0
            uncertainty = uncertainty * (10 ** -decimal_places)
        return main_value, uncertainty
    return None, None

# Initialize arrays for non-NA values
q_values = []
dq_values = []
delta_nu_nu_values = []
d_delta_nu_nu_values = []
delta_nudot_nudot_values = []
d_delta_nudot_nudot_values = []

# Process each column
for _, row in df.iterrows():
    # Parse Q
    q, dq = parse_value_with_uncertainty(row['Q'])
    if q is not None:
        q_values.append(q)
        dq_values.append(dq)
    
    # Parse delta_nu_nu
    delta_nu, d_delta_nu = parse_value_with_uncertainty(row['delta_nu_nu'])
    if delta_nu is not None:
        delta_nu_nu_values.append(delta_nu)
        d_delta_nu_nu_values.append(d_delta_nu)
    
    # Parse delta_nudot_nudot
    delta_nudot, d_delta_nudot = parse_value_with_uncertainty(row['delta_nudot_nudot'])
    if delta_nudot is not None:
        delta_nudot_nudot_values.append(delta_nudot)
        d_delta_nudot_nudot_values.append(d_delta_nudot)

# Convert lists to numpy arrays for convenience
q_values = np.array(q_values)
dq_values = np.array(dq_values)
delta_nu_nu_values = np.array(delta_nu_nu_values)
d_delta_nu_nu_values = np.array(d_delta_nu_nu_values)
delta_nudot_nudot_values = np.array(delta_nudot_nudot_values)
d_delta_nudot_nudot_values = np.array(d_delta_nudot_nudot_values)

# Print the arrays
print("q_values:", q_values)
print("dq_values:", dq_values)
print("delta_nu_nu_values:", delta_nu_nu_values)
print("d_delta_nu_nu_values:", d_delta_nu_nu_values)
print("delta_nudot_nudot_values:", delta_nudot_nudot_values)
print("d_delta_nudot_nudot_values:", d_delta_nudot_nudot_values)

# Optional: Save arrays to a file (e.g., CSV or numpy file)
# Save as CSV

# Save as numpy file
np.savez('extracted_values.npz', 
         q=q_values, dq=dq_values, 
         delta_nu_nu=delta_nu_nu_values, d_delta_nu_nu=d_delta_nu_nu_values,
         delta_nudot_nudot=delta_nudot_nudot_values, d_delta_nudot_nudot=d_delta_nudot_nudot_values)

q_values: [1.100e+00 7.700e-01 1.170e-03 6.000e-01 4.400e-01 6.000e-01 8.000e-01
 5.360e-01 1.000e+00 8.900e-01 8.940e-01 8.700e-01 8.000e-01 6.800e-01
 8.700e-01 9.000e-01 8.000e-01 6.200e-01 1.300e-01 6.000e-03 1.980e-03
 1.782e-02 1.580e-03 1.311e-02 1.612e-01 4.350e-04 3.534e-03 2.420e-03
 1.134e-02 8.130e-04 1.900e-03 2.483e-03 5.500e-03 3.700e-03 1.541e-01
 5.385e-03 1.684e-01 3.000e-02 8.800e-03 5.470e-03 6.691e-03 2.000e-02
 9.000e-03 1.190e-02 5.480e-03 2.700e-02 5.460e-03 6.800e-03 7.800e-03
 5.000e-03 5.000e-03 1.140e-02 2.600e-02 8.000e-03 4.000e-03 5.700e-03
 2.200e-02 1.400e-02 8.400e-01 8.100e-01 2.140e-01 2.600e-03 4.000e-03
 4.900e-03 3.600e-01 1.600e-02 1.120e-02 4.200e-02 1.020e-01 3.000e-02
 2.630e-03 8.000e-03 7.000e-03 9.000e-03 1.200e-01 1.400e+00 5.800e-03
 7.000e-03 9.750e-01 5.000e-03 6.100e-03 4.000e-03 2.300e-02 2.000e-03
 5.300e-03 9.700e-01 1.748e-02 1.290e-02 8.490e-03 6.000e-03 6.100e-03
 6.000e-05 7.700e-03 1.020e-02 7.000e-03 7.300e-02 1.250e-01 1.400e

In [55]:
np.percentile(delta_nu_nu_values, 5)*1e-9, np.percentile(delta_nu_nu_values, 95)*1e-9

(2.0000000000000003e-10, 4.47527e-06)

In [62]:
np.percentile(delta_nudot_nudot_values, 5)*1e-3, np.percentile(delta_nudot_nudot_values, 95)*1e-3

(-0.002, 0.0365)

In [63]:
np.percentile(np.abs(delta_nudot_nudot_values), 5)*1e-3, np.percentile(np.abs(delta_nudot_nudot_values), 95)*1e-3

(0.0001, 0.048)