In [None]:
import re

def clean_entity_text(entity):
    """Return ``entity`` truncated just before its first digit, whitespace-trimmed.

    When ``entity`` contains no digit the whole string is returned, trimmed.
    """
    # Splitting once on the first digit leaves the digit-free prefix at index 0.
    prefix = re.split(r'\d', entity, maxsplit=1)[0]
    return prefix.strip()

def split_name(entity_string):
    """Parse a '^'-delimited string of personal names into name components.

    Each '^'-separated entity is first passed through ``clean_entity_text``
    (which cuts it at the first digit); entities that end up empty are skipped.
    The remaining text is interpreted as follows:

    * ``"Last, First Middle"`` - text before the first comma is the last name,
      everything after it is the first/middle part.
    * otherwise the final whitespace-separated token is the last name and the
      preceding tokens form the first/middle part. If that final token has the
      form ``"X.Y"`` (exactly one dot with text on both sides), ``X`` is moved
      to the end of the first/middle part and ``Y`` becomes the last name.

    Parameters
    ----------
    entity_string : object
        Raw cell value. Anything that is not a non-blank ``str`` (NaN, None,
        numbers, list-likes, ...) yields an empty list.

    Returns
    -------
    list of dict
        One dict per parsed entity with keys ``'first_middle'``, ``'last'`` and
        ``'non_individual'`` (always NaN here).
    """
    # Checking the type first covers NaN/None as well (neither is a str) and
    # avoids calling pd.isna on list-like cells, where it returns an array whose
    # truth value is ambiguous and raises.
    if not isinstance(entity_string, str) or not entity_string.strip():
        return []

    parsed_entities = []

    for raw_entity in entity_string.split('^'):
        # clean_entity_text already returns stripped text.
        entity = clean_entity_text(raw_entity)
        if not entity:
            continue

        if ',' in entity:
            # "Last, First Middle": split on the first comma only.
            last, first_middle = (part.strip() for part in entity.split(',', 1))
        else:
            # entity is non-empty and stripped, so there is at least one word.
            words = entity.split()
            leading, last = words[:-1], words[-1]

            # A final token such as "J.Smith" carries an initial glued to the
            # surname; tokens like "Jr." or "A.B.C" are kept whole.
            dot_parts = last.split('.')
            if len(dot_parts) == 2 and all(dot_parts):
                leading = leading + [dot_parts[0]]
                last = dot_parts[1]

            first_middle = ' '.join(leading)

        parsed_entities.append({'first_middle': first_middle, 'last': last, 'non_individual': np.nan})

    return parsed_entities

def split_non_individual(entity_string):
    """Split a '^'-delimited string of non-individual names into record dicts.

    Parameters
    ----------
    entity_string : object
        Raw cell value. Anything that is not a ``str`` (NaN, None, numbers,
        list-likes, ...) yields an empty list, as does a blank string.

    Returns
    -------
    list of dict
        One dict per non-blank, whitespace-trimmed entity, with keys
        ``'first_middle'`` and ``'last'`` set to NaN and ``'non_individual'``
        holding the entity text.
    """
    # Checking the type alone covers NaN/None (neither is a str) and avoids
    # calling pd.isna on list-like cells, where it returns an array whose truth
    # value is ambiguous and raises.
    if not isinstance(entity_string, str):
        return []

    trimmed = (chunk.strip() for chunk in entity_string.split('^'))
    # Blank chunks (including an entirely blank input) are filtered out here.
    return [
        {'first_middle': np.nan, 'last': np.nan, 'non_individual': name}
        for name in trimmed
        if name
    ]


def flatten_list_of_dicts(dict_list, party):
    """Collapse a list of dicts into one dict with position-tagged keys.

    The entry ``key: value`` of the dict at position ``i`` in ``dict_list`` is
    stored under ``"<party>_<key>.<i>"``. An empty list gives an empty dict.
    """
    return {
        f"{party}_{field}.{position}": value
        for position, record in enumerate(dict_list)
        for field, value in record.items()
    }

In [None]:
# Lift the row-count and column-width display limits for frames shown below.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# The same four steps run for each side of the transaction:
#   1. parse the '^'-delimited individual names,
#   2. parse the '^'-delimited organisation names,
#   3. concatenate both record lists per row (organisations first),
#   4. flatten each row's records into "<party>_<field>.<position>" keys.
for party in ("buyer", "seller"):
    label = party.upper()
    combined_df[f'{label} NAME PARSED'] = combined_df[f'{label} NAME'].apply(split_name)
    combined_df[f'{label} ORG PARSED'] = combined_df[f'{label} ORG'].apply(split_non_individual)
    combined_df[f'{label} PARSED COMBINED'] = (
        combined_df[f'{label} ORG PARSED'] + combined_df[f'{label} NAME PARSED']
    )
    combined_df[f'{label} PARSED'] = combined_df[f'{label} PARSED COMBINED'].apply(
        lambda records: flatten_list_of_dicts(records, party)
    )


In [None]:
# Expand each row's flattened-record dict into one column per key; keys that a
# row does not have become NaN in that row.
parsed_buyer_df = pd.DataFrame(combined_df['BUYER PARSED'].tolist())
parsed_seller_df = pd.DataFrame(combined_df['SELLER PARSED'].tolist())

# Keep this cell idempotent: if it has been run before, combined_df already
# carries the parsed columns, and concatenating again would create duplicate
# column labels (which makes row[col] return a Series instead of a scalar).
# Dropping any stale copies first makes a re-run produce the same frame.
parsed_columns = list(parsed_buyer_df.columns) + list(parsed_seller_df.columns)

# The parsed frames are built from plain lists, so they already have a fresh
# 0..n-1 index; only combined_df needs resetting for a positional alignment.
combined_df = pd.concat(
    [
        combined_df.drop(columns=parsed_columns, errors='ignore').reset_index(drop=True),
        parsed_buyer_df,
        parsed_seller_df,
    ],
    axis=1,
)



In [None]:

# Ad-hoc checks on the parsed name / organisation columns.
name_cols = ['seller_first_middle.0', 'seller_first_middle.1']
org_cols = ['seller_non_individual.0', 'seller_non_individual.1','seller_non_individual.2']

# Number of seller rows in which at least one of the organisation slots holds
# non-missing, non-blank text.
non_empty_count = parsed_seller_df[org_cols].apply(
    lambda row: any(str(x).strip() != '' for x in row if pd.notna(x)),
    axis=1,
).sum()

columns = ['buyer_first_middle.0', 'buyer_first_middle.1', 'buyer_non_individual.0']

# 'tutu' is a scratch column: the non-missing, non-blank values of `columns`
# joined with single spaces. It is blank when none of the three is populated.
combined_df['tutu'] = combined_df[columns].apply(
    lambda row: ' '.join(
        text for text in (str(x).strip() for x in row if pd.notna(x)) if text != ''
    ),
    axis=1,
)

# Print the first parsed buyer entity for every row.
print(combined_df[['buyer_first_middle.0', 'buyer_last.0', 'buyer_non_individual.0']].to_string())

In [None]:
# Column groups compared by the matching check: the reference name columns and
# the corresponding columns produced by the parsing step ("predicted").
#
# Set INCLUDE_NON_INDIVIDUAL to True to also compare the organisation columns.
# The default (False) restricts the comparison to individual-name columns.
INCLUDE_NON_INDIVIDUAL = False

buyer_name_cols = ['BuyerFirstMiddleName', 'BuyerLastName', 'BuyerFirstMiddleName.1', 'BuyerLastName.1']
seller_name_cols = ['SellerFirstMiddleName', 'SellerLastName', 'SellerFirstMiddleName.1', 'SellerLastName.1']

predicted_buyer_name_cols = ['buyer_first_middle.0', 'buyer_last.0', 'buyer_first_middle.1', 'buyer_last.1']
predicted_seller_name_cols = ['seller_first_middle.0', 'seller_last.0', 'seller_first_middle.1', 'seller_last.1']

if INCLUDE_NON_INDIVIDUAL:
    buyer_name_cols += ['BuyerNonIndividualName', 'BuyerNonIndividualName.1']
    seller_name_cols += ['SellerNonIndividualName', 'SellerNonIndividualName.1']
    predicted_buyer_name_cols += ['buyer_non_individual.0', 'buyer_non_individual.1']
    predicted_seller_name_cols += ['seller_non_individual.0', 'seller_non_individual.1']



def rows_with_any_match_lowercase(df, list1, list2, count_nan_matches=True):
    """Flag rows where any ``list1`` column equals any ``list2`` column.

    Values are compared as strings after trimming whitespace and lower-casing.
    Missing values never match anything. Every ``list1`` column is compared
    with every ``list2`` column, so the position of a value does not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in ``list1`` and ``list2``.
    list1, list2 : list of str
        The two groups of column names to compare.
    count_nan_matches : bool, default True
        When true, a row whose ``list1`` and ``list2`` columns are all missing
        is also flagged as a match.

    Returns
    -------
    pandas.Series
        Boolean Series aligned to ``df.index``. It is boolean even for an
        empty ``df``, so it can always be negated with ``~``.
    """
    def normalized_values(row, cols):
        # Distinct non-missing values of `cols`, normalised for comparison.
        return {
            str(value).strip().lower()
            for value in (row[col] for col in cols)
            if pd.notna(value)
        }

    def row_match(row):
        left = normalized_values(row, list1)
        right = normalized_values(row, list2)
        if left & right:
            return True
        # Both sets empty means every compared column is missing in this row.
        return bool(count_nan_matches) and not left and not right

    # Built explicitly rather than via df.apply(axis=1), whose result for an
    # empty frame is not a boolean Series.
    flags = [row_match(row) for _, row in df.iterrows()]
    return pd.Series(flags, index=df.index, dtype=bool)


# Flag rows where a parsed seller name column agrees with a reference seller
# name column (rows with every compared column missing also count as a match).
match_mask = rows_with_any_match_lowercase(
    combined_df,
    predicted_seller_name_cols,
    seller_name_cols,
)
df_no_match = combined_df.loc[~match_mask]

# Number of matching rows, then the first 800 non-matching rows shown with the
# reference columns followed by the parsed columns.
print(match_mask.sum())
comparison_cols = seller_name_cols + predicted_seller_name_cols
print(df_no_match[comparison_cols].head(800).to_string())