In [None]:
# library installations if necessary, make sure you're using .venv!

%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Imports

import pandas as pd
import re

In [None]:
vt = pd.read_excel("ViolationTracker_21Aug2025_PHMSA_only.xlsx")
phmsa = pd.read_excel("PHMSA_RAW_DATA.xlsx")

# Helper function to make a numeric-only CPF key for PHMSA (e.g., "42025041NOA" -> "42025041")
def phmsa_cpf_key(s):
    if pd.isna(s):
        return ""
    m = re.match(r'\D*(\d+)', str(s))
    return m.group(1) if m else ""

phmsa["cpf_key"] = phmsa["CPF_Number"].apply(phmsa_cpf_key)

# Helper function that extracts CPF number from "info_source" column in GJF data.
# Try pattern 'cpf_123456789' first, else fallback to the longest digit run
def vt_cpf_from_info(url):
    if pd.isna(url):
        return ""
    txt = str(url)
    # This regex matches things like 'cpf_12345' or 'CPF-12345'
    m = re.search(r'(?i)cpf[_\-]?(\d{4,})', txt)   # case-insensitive, require >=4 digits
    if m:
        return m.group(1)
    # fallback: find all digit runs and return the longest (likely the CPF if present)
    runs = re.findall(r'(\d{4,})', txt)   # capture runs of 4+ digits
    if not runs:
        return ""
    # choose the longest run (if ties, first)
    runs_sorted = sorted(runs, key=lambda x: (-len(x), x))
    return runs_sorted[0]

vt["cpf_key_extracted"] = vt["info_source"].apply(vt_cpf_from_info)

# 3) Quick sanity counts
print("PHMSA distinct cpf_key count:", phmsa["cpf_key"].nunique())
print("GJF rows with extracted cpf_key:", (vt["cpf_key_extracted"] != "").sum())

# 4) Merge VT -> PHMSA on the cpf key
merged = vt.merge(phmsa, left_on="cpf_key_extracted", right_on="cpf_key", how="left", suffixes=("_vt", "_phmsa"))

# 5) Inspect mismatches
matched = merged[merged["CPF_Number"].notna()]
unmatched = merged[merged["CPF_Number"].isna()]
print("Matched rows:", len(matched))
print("Unmatched GJF rows after cpf merge:", len(unmatched))

# Save a sample of merged/unmatched for inspection
# merged.to_excel("VT_PHMSA_merged_by_cpf.xlsx", index=False)
# unmatched.to_excel("unmatched_rows.xlsx", index=False)


PHMSA distinct cpf_key count: 4944
GJF rows with extracted cpf_key: 718
Matched rows: 667
Unmatched GJF rows after cpf merge: 53


In [3]:
# Helper to normalize parent name
def normalize_parent(name):
    if pd.isna(name):
        return ""
    return re.sub(r'[^a-z0-9 ]', '', str(name).lower().strip())

# New columns for normalized names
matched["current_parent_name_n"] = matched["current_parent_name"].apply(normalize_parent)
matched["reporting_date_parent_n"] = matched["reporting_date_parent"].apply(normalize_parent)

# parent_changed is either True or False
matched["parent_changed"] = matched["current_parent_name_n"] != matched["reporting_date_parent_n"]

# Check counts
print(matched["parent_changed"].value_counts())

# Save to file for inspection
# matched.to_excel("VT_PHMSA_matched_with_parentchange.xlsx", index=False)


parent_changed
False    475
True     192
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched["current_parent_name_n"] = matched["current_parent_name"].apply(normalize_parent)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched["reporting_date_parent_n"] = matched["reporting_date_parent"].apply(normalize_parent)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched["parent_chang

In [None]:
# Assuming your dataframe is named `matched`
recap_to_date = {}  # cache dictionary

# Ensure the column exists even if some rows are blank
matched["acquisition_exact_date"] = None

# Iterate through unique, non-null recaps
unique_recaps = matched["history_recap"].dropna().unique()

for recap in unique_recaps:
    # Skip if we've already stored this one
    if recap in recap_to_date:
        continue

    print("\n-----------------------------")
    print(f"HISTORY RECAP:\n{recap}")
    date_input = input("Enter the exact acquisition/merger date (MM/DD/YYYY), or press Enter to skip: ").strip()

    if date_input:
        recap_to_date[recap] = date_input
    else:
        recap_to_date[recap] = None  # mark as skipped

# Apply to dataframe
matched["acquisition_exact_date"] = matched["history_recap"].map(recap_to_date)

# Save interim results
matched.to_excel("VT_PHMSA_with_acquisition_dates.xlsx", index=False)

print("\n✅ Done. Saved file with 'acquisition_exact_date' column.")
print(f"Total unique recaps processed: {len(recap_to_date)}")



-----------------------------
HISTORY RECAP:
Berkshire Hathaway acquired Dominion Energy Transmission, Inc. in November 2020


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched["acquisition_exact_date"] = None
