In [2]:
import json
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

with open("spacey_train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

existing_labels = set(ner.labels)

for _, annotations in data:
    for _, _, label in annotations["entities"]:
        if label not in existing_labels:
            ner.add_label(label)

db = DocBin()
examples = []

for text, annotations in data:
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)

    doc.ents = filter_spans(ents)
    db.add(doc)

    examples.append(Example.from_dict(doc, {"entities": annotations["entities"]}))

db.to_disk("./train.spacy")
print("Saved spaCy training data to train.spacy")

optimizer = nlp.resume_training()

for epoch in range(50):
    losses = {}
    nlp.update(examples, drop=0.3, losses=losses)
    print(f"Epoch {epoch + 1} | Loss: {losses['ner']}")

nlp.to_disk("./fine_tuned_spacy_model")
print("Fine-tuned model saved successfully")


✅ Saved spaCy training data to train.spacy
Epoch 1 | Loss: 17566.6953125
Epoch 2 | Loss: 16094.8974609375
Epoch 3 | Loss: 14443.509765625
Epoch 4 | Loss: 13126.1953125
Epoch 5 | Loss: 11451.7568359375
Epoch 6 | Loss: 10287.349609375
Epoch 7 | Loss: 9224.5830078125
Epoch 8 | Loss: 8250.703125
Epoch 9 | Loss: 8004.81005859375
Epoch 10 | Loss: 7485.41650390625
Epoch 11 | Loss: 7076.55419921875
Epoch 12 | Loss: 6972.62109375
Epoch 13 | Loss: 6839.96044921875
Epoch 14 | Loss: 6464.2705078125
Epoch 15 | Loss: 6232.05908203125
Epoch 16 | Loss: 5878.30859375
Epoch 17 | Loss: 5883.94189453125
Epoch 18 | Loss: 5373.39111328125
Epoch 19 | Loss: 5018.8134765625
Epoch 20 | Loss: 5014.2021484375
Epoch 21 | Loss: 4667.16455078125
Epoch 22 | Loss: 4555.33447265625
Epoch 23 | Loss: 4276.97802734375
Epoch 24 | Loss: 4180.93603515625
Epoch 25 | Loss: 4128.7529296875
Epoch 26 | Loss: 3887.052001953125
Epoch 27 | Loss: 3938.01611328125
Epoch 28 | Loss: 3829.95654296875
Epoch 29 | Loss: 3520.87548828125
Epo

In [7]:
import json
import spacy
from spacy.scorer import Scorer
from spacy.training.example import Example

model_path = "./fine_tuned_spacy_model"
nlp = spacy.load(model_path)
baseline_nlp = spacy.load("en_core_web_sm")


test_file = "/content/spacey_test.json"
with open(test_file, "r", encoding="utf-8") as f:
    test_data = json.load(f)

for item in test_data:
    text, annotations = item
    doc = nlp(text)

def evaluate_model(nlp, test_data):
    scorer = Scorer()
    examples = []

    for item in test_data:
        text, annotations = item
        gold_annotations = annotations["entities"]

        doc = nlp(text)

        gold_doc = nlp.make_doc(text)
        ents = []
        for start, end, label in gold_annotations:
            span = gold_doc.char_span(start, end, label=label)
            if span:
                ents.append(span)

        gold_doc.ents = ents

        example = Example(predicted=doc, reference=gold_doc)
        examples.append(example)

    scores = scorer.score(examples)
    return scores

print("\n--- EVALUATING BASELINE MODEL (en_core_web_sm) ---\n")
baseline_scores = evaluate_model(baseline_nlp, test_data)
print(json.dumps(baseline_scores, indent=4))

print("\n--- EVALUATING FINE-TUNED MODEL ---\n")
custom_scores = evaluate_model(nlp, test_data)
print(json.dumps(custom_scores, indent=4))




--- EVALUATING BASELINE MODEL (en_core_web_sm) ---

{
    "token_acc": 1.0,
    "token_p": 1.0,
    "token_r": 1.0,
    "token_f": 1.0,
    "sents_p": 1.0,
    "sents_r": 1.0,
    "sents_f": 1.0,
    "tag_acc": null,
    "pos_acc": null,
    "morph_acc": null,
    "morph_micro_p": null,
    "morph_micro_r": null,
    "morph_micro_f": null,
    "morph_per_feat": null,
    "dep_uas": null,
    "dep_las": null,
    "dep_las_per_type": null,
    "ents_p": 0.13087557603686636,
    "ents_r": 0.25357142857142856,
    "ents_f": 0.17264437689969603,
    "ents_per_type": {
        "TIME": {
            "p": 0.0,
            "r": 0.0,
            "f": 0.0
        },
        "CARDINAL": {
            "p": 0.0,
            "r": 0.0,
            "f": 0.0
        },
        "DATE": {
            "p": 0.0,
            "r": 0.0,
            "f": 0.0
        },
        "PERSON": {
            "p": 0.0,
            "r": 0.0,
            "f": 0.0
        },
        "ORG": {
            "p": 0.44612794612

In [10]:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from rapidfuzz import process, fuzz
import re
import string

def search_sec_edgar_company(name):
    name = name.replace("'", "")
    """ Fetch SEC EDGAR data for a given company name """
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    params = {"company": name, "count": 100, "action": "getcompany"}
    headers = {"User-Agent": "CompanyLookup/1.0 (contact@email.com)"}

    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

CORROBORATION_SCORES = {
    "FULLY_CORROBORATED": 0,
    "PARTIALLY_CORROBORATED": 1,
    "ENTITY_SUPPLIED_ONLY": 2
}

CONFORMITY_SCORES = {
    "CONFORMING": 0,
    "NON_CONFORMING": 2,
    "NOT_APPLICABLE": 1
}

sic_sector_map = {
    1: "Agricultural Production - Crops",
    2: "Agricultural Production - Livestock",
    7: "Agricultural Services",
    8: "Forestry",
    9: "Fishing, Hunting, and Trapping",
    10: "Metal Mining",
    12: "Coal Mining",
    13: "Oil and Gas Extraction",
    14: "Mining and Quarrying of Nonmetallic Minerals",
    15: "General Building Contractors",
    16: "Heavy Construction Contractors",
    17: "Special Trade Contractors",
    20: "Food and Kindred Products",
    21: "Tobacco Products",
    22: "Textile Mill Products",
    23: "Apparel and Other Textile Products",
    24: "Lumber and Wood Products",
    25: "Furniture and Fixtures",
    26: "Paper and Allied Products",
    27: "Printing and Publishing",
    28: "Chemicals and Allied Products",
    29: "Petroleum Refining",
    30: "Rubber and Miscellaneous Plastics",
    31: "Leather and Leather Products",
    32: "Stone, Clay, and Glass Products",
    33: "Primary Metal Industries",
    34: "Fabricated Metal Products",
    35: "Industrial Machinery and Equipment",
    36: "Electronic and Electrical Equipment",
    37: "Transportation Equipment",
    38: "Measuring Instruments & Optical Goods",
    39: "Miscellaneous Manufacturing Industries",
    40: "Railroad Transportation",
    41: "Local and Interurban Passenger Transit",
    42: "Trucking and Warehousing",
    43: "U.S. Postal Service",
    44: "Water Transportation",
    45: "Air Transportation",
    46: "Pipelines, Except Natural Gas",
    47: "Transportation Services",
    48: "Communications",
    49: "Electric, Gas, and Sanitary Services",
    50: "Wholesale Trade - Durable Goods",
    51: "Wholesale Trade - Nondurable Goods",
    52: "Building Materials & Garden Supplies",
    53: "General Merchandise Stores",
    54: "Food Stores",
    55: "Automotive Dealers and Service Stations",
    56: "Apparel and Accessory Stores",
    57: "Furniture and Home Furnishings Stores",
    58: "Eating and Drinking Places",
    59: "Miscellaneous Retail",
    60: "Depository Institutions (Banks, Credit Unions)",
    61: "Non-Depository Credit Institutions",
    62: "Security & Commodity Brokers, Dealers",
    63: "Insurance Carriers",
    64: "Insurance Agents, Brokers, and Service",
    65: "Real Estate",
    67: "Holding and Other Investment Offices",
    70: "Hotels and Other Lodging Places",
    72: "Personal Services",
    73: "Business Services",
    75: "Automotive Repair, Services, and Parking",
    76: "Miscellaneous Repair Services",
    78: "Motion Pictures",
    79: "Amusement and Recreation Services",
    80: "Health Services",
    81: "Legal Services",
    82: "Educational Services",
    83: "Social Services",
    84: "Museums, Botanical Gardens, and Zoos",
    86: "Membership Organizations",
    87: "Engineering & Management Services",
    88: "Private Households",
    89: "Services, Not Elsewhere Classified",
    91: "Executive, Legislative, and General Government",
    92: "Justice, Public Order, and Safety",
    93: "Finance, Taxation, and Monetary Policy",
    94: "Administration of Human Resources",
    95: "Environmental Quality and Housing Programs",
    96: "Administration of Economic Programs",
    97: "National Security and International Affairs"
}


def get_lei_info(company):
    url1 = f"https://api.gleif.org/api/v1/lei-records?filter[owns]={company}&page[number]=1&page[size]=50"
    headers = {'Accept': 'application/vnd.api+json'}

    response1 = requests.get(url1, headers=headers)
    ob = response1.json()

    if not ob.get('data'):
        return pd.DataFrame(columns=[
            "company_name", "headquarters_country", "conformity_flag", "conformity_score",
            "corroboration_level", "corroboration_score", "ocid", "cik",
            "parent_company_name", "parent_headquarters", "parent_conformity_flag", "parent_conformity_score",
            "parent_corroboration_level", "parent_corroboration_score", "parent_ocid", "parent_cik"
        ])

    results = []

    for item in ob['data']:
        try:
            company_name = item['attributes']['entity']['legalName']['name']
            headquarters_country = item['attributes']['entity']['headquartersAddress']['country']
            conformity_flag = item['attributes']['conformityFlag']
            ocid = item['attributes'].get('ocid', None)
            corroboration_level = item['attributes']['registration']['corroborationLevel']

            conformity_score = CONFORMITY_SCORES.get(conformity_flag, -1)
            corroboration_score = CORROBORATION_SCORES.get(corroboration_level, -1)

            parent_company_name, parent_headquarters = company_name, headquarters_country
            parent_conformity_flag, parent_corroboration_level, parent_ocid = conformity_flag, corroboration_level, ocid
            parent_conformity_score, parent_corroboration_score = conformity_score, corroboration_score

            parent_data = item.get('relationships', {}).get('ultimate-parent', {}).get('links', {})

            if "reporting-exception" in parent_data:
                pass
            elif "lei-record" in parent_data:
                parent_url = parent_data["lei-record"]
                parent_response = requests.get(parent_url, headers=headers)
                parent_ob = parent_response.json()

                if parent_ob.get('data'):
                    parent_company_name = parent_ob['data']['attributes']['entity']['legalName']['name']
                    parent_headquarters = parent_ob['data']['attributes']['entity']['headquartersAddress']['country']
                    parent_conformity_flag = parent_ob['data']['attributes']['conformityFlag']
                    parent_ocid = parent_ob['data']['attributes'].get('ocid', None)
                    parent_corroboration_level = parent_ob['data']['attributes']['registration']['corroborationLevel']

                    parent_conformity_score = CONFORMITY_SCORES.get(parent_conformity_flag, -1)
                    parent_corroboration_score = CORROBORATION_SCORES.get(parent_corroboration_level, -1)


            results.append({
                "company_name": company_name,
                "headquarters_country": headquarters_country,
                "conformity_flag": conformity_flag,
                "conformity_score": conformity_score,
                "corroboration_level": corroboration_level,
                "corroboration_score": corroboration_score,
                "ocid": ocid,
                "parent_company_name": parent_company_name,
                "parent_headquarters": parent_headquarters,
                "parent_conformity_flag": parent_conformity_flag,
                "parent_conformity_score": parent_conformity_score,
                "parent_corroboration_level": parent_corroboration_level,
                "parent_corroboration_score": parent_corroboration_score,
                "parent_ocid": parent_ocid,
            })

        except (KeyError, TypeError):
            continue
    final_result = pd.DataFrame(results)
    return final_result

def extract_companies_to_dataframe(html_content):
    """ Parses SEC HTML response and converts company data into a Pandas DataFrame """
    soup = BeautifulSoup(html_content, "html.parser")
    company_table = soup.find("table", class_="tableFile2")

    data = []
    if company_table:
        rows = company_table.find_all("tr")[1:]
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:
                cik = cols[0].text.strip()
                name = cols[1].text.strip()
                match = re.search(r"^(.*?)\s*SIC:\s*(\d+)\s*-\s*(.+)", name)

                if match:
                    name = match.group(1).strip()
                    sic_code = match.group(2)
                    industry = match.group(3).strip()
                else:
                    sic_code = None
                    industry = None

                if re.search(r"\b(19|20)\d{2}\b", name):
                    continue

                data.append({"company_name": name, "cik": cik, "sic_code": sic_code, "industry": industry, "fetch_from": "SEC-EDGAR"})
    return pd.DataFrame(data)

def search_sec_company(partial_name):
    partial_name = partial_name.replace("'", "")
    url = f"https://efts.sec.gov/LATEST/search-index?entityName={partial_name}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    companies_list = []
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        if "hits" in data:
            results = data["hits"]
            results = results['hits']
            all_names = list({
                name.strip().replace("  ", " ")
                for item in results
                for name in item["_source"].get("display_names", [])
            })
            pattern = re.compile(r"(.+?) \(CIK (\d+)\)")

            result = [
                {"company_name": match.group(1), "cik": match.group(2), "sic_code": None, "industry": None, "fetch_from": "SEC_EFT"}
                for entry in all_names if (match := pattern.match(entry))
            ]
            frame = pd.DataFrame(result)
            return frame
        else:
            print("No results found.")
    else:
        print(f"Error: {response.status_code}")

def fuzzy_match_all_companies_sec(user_input, df):
    """ Returns a DataFrame of all companies with SIC codes, sector labels, and match scores """

    user_input_lower = user_input.lower()
    df["company_name_lower"] = df["company_name"].str.lower()

    df["match_score"] = df["company_name_lower"].apply(
        lambda x: 0.4 * fuzz.QRatio(user_input_lower, x) +
                  0.3 * fuzz.token_sort_ratio(user_input_lower, x) +
                  0.3 * fuzz.partial_ratio(user_input_lower, x)
    )

    df["sic_code"] = pd.to_numeric(df["sic_code"], errors="coerce")

    df["sic_code_2digit"] = df["sic_code"].dropna().astype(int).astype(str).str[:2]
    df["sic_code_2digit"] = pd.to_numeric(df["sic_code_2digit"], errors="coerce")

    df["Sector"] = df["sic_code_2digit"].map(sic_sector_map)

    df = df.sort_values(by=["sic_code_2digit" , "match_score"], ascending=False)

    df_with_sic = df[df["sic_code"].notna()]
    df_without_sic = df[df["sic_code"].isna()]
    df_sec_eft = df[df["fetch_from"] == 'SEC_EFT']
    top_with_sic = df_with_sic.nlargest(2, "match_score")
    top_without_sic = df_without_sic.nlargest(2, "match_score")
    top_in_sec_eft = df_sec_eft.nlargest(2, "match_score")
    final_results = pd.concat([top_with_sic, top_without_sic, top_in_sec_eft])
    final_results['info_count'] = final_results.notna().sum(axis=1)

    final_results = final_results.sort_values(by=['cik', 'info_count'], ascending=[True, False]).drop_duplicates(
        subset='cik', keep='first')

    final_results = final_results.drop(columns=['info_count'])
    return final_results

def get_top_values_between_sec_and_lei(p, k):
    """Fuzzy match the top row of p with k, and vice versa, adding match scores and final averages."""
    p['final_average'] = (p['average'])
    k['final_average'] = (k['match_score'])

    return p, k  #

def add_match_and_parent_count(df, user_input):
    user_input_lower = user_input.lower()

    df["match_score"] = df["company_name"].apply(
        lambda x: 0.4 * fuzz.QRatio(user_input_lower, x.lower()) +
                  0.3 * fuzz.token_sort_ratio(user_input_lower, x.lower()) +
                  0.3 * fuzz.partial_ratio(user_input_lower, x.lower())
    )

    parent_counts = df["parent_company_name"].value_counts()
    df["parent_count"] = df["parent_company_name"].map(parent_counts)


    df["parent_percentage"] = (df["parent_count"] / len(df)) * 100
    df['equality_score'] = 0.00  # Default
    df.loc[df['company_name'] == df['parent_company_name'], 'equality_score'] = 100.00
    df['average'] = (0.25 * df['parent_percentage'] + 0.5 * df['match_score'] + 0.25 * df['equality_score'])
    df.loc[df['match_score'] > 70, 'average'] = (0.7 * df['match_score'] + 0.3 * df['parent_percentage'])

    df = df.sort_values(by=['average'], ascending = False )
    return df

def normalize_name(name):
    """Convert to lowercase and remove spaces for robust matching."""
    name = name.lower()
    name = re.sub(r'\b(corp|corporation|inc|ltd|llc|co)\b', '', name)
    name = re.sub(r'\[.*?\]|\(.*?\)', '', name)
    name = name.replace('.', " ")
    name = name.replace (",", " ")
    name = re.sub(r'\s+', ' ', name).strip()
    return name

def merge_ranked_dataframes(p, k):
    p["normalized_name"] = p["company_name"].apply(normalize_name)
    k["normalized_name"] = k["company_name"].apply(normalize_name)

    p = p.sort_values(by="match_score", ascending=False, ignore_index=True) if "match_score" in p else p
    k = k.sort_values(by="match_score", ascending=False, ignore_index=True) if "match_score" in k else k

    final_df = pd.DataFrame()

    if not p.empty and not k.empty and p.loc[0, "normalized_name"] == k.loc[0, "normalized_name"]:
        final_df = pd.merge(p.loc[[0]], k.loc[[0]], on="company_name", how="outer", suffixes=("_p", "_k"))
        final_df["confidence_score"] = 1.0
    else:
        common_names = set(p["normalized_name"]).intersection(set(k["normalized_name"]))
        for name in common_names:
            p_match = p[p["normalized_name"] == name]
            k_match = k[k["normalized_name"] == name]
            if not p_match.empty and not k_match.empty:
                if (60 <= p_match.iloc[0]["match_score"] <= 100) and (60 <= k_match.iloc[0]["match_score"] <= 100):
                    final_df = pd.merge(p_match, k_match, on="company_name", how="outer", suffixes=("_p", "_k"))
                    final_df["confidence_score"] = 0.8
                    break


    if final_df.empty:
        top_p = p.iloc[0] if not p.empty else None
        top_k = k.iloc[0] if not k.empty else None

        if top_p is not None and top_k is not None:
            final_row = top_p if top_p["match_score"] >= top_k["match_score"] else top_k
        elif top_p is not None:
            final_row = top_p
        else:
            final_row = top_k

        final_df = pd.DataFrame([final_row])
        final_df["confidence_score"] = final_df["match_score"].apply(lambda x: 1.0 if x > 80 else (0.8 if 60 <= x <= 80 else 0.6))

    all_columns = ["company_name"] + [col for col in (list(p.columns) + list(k.columns)) if col != "company_name"]
    if {"normalized_name_p", "normalized_name_k"}.issubset(final_df.columns):
        final_df["normalized_name"] = final_df[["normalized_name_p", "normalized_name_k"]].bfill(axis=1).iloc[:, 0]
        final_df.drop(columns=["normalized_name_p", "normalized_name_k"], errors="ignore", inplace=True)
    final_df = final_df.reindex(columns=all_columns + ["confidence_score"], fill_value=None)
    cols_to_drop = [col for col in final_df.columns if
                    col.startswith(('match_score', 'parent_count', 'parent_percentage',
                                    'equality_score', 'average', 'final_average',
                                    'company_name_lower'))]
    final_df.drop(columns=cols_to_drop, errors="ignore", inplace=True)
    final_df['normalized_name'] = final_df["normalized_name"] = final_df["normalized_name"].astype(str)
    final_df = final_df.groupby(axis=1, level=0).first()

    if len(final_df) == 1:
        merged_row = final_df.iloc[0]

    elif len(final_df) == 2:
        merged_row = final_df.iloc[0].copy()

        for col in final_df.columns:
            val1, val2 = final_df.iloc[0][col], final_df.iloc[1][col]

            if str(val1) == str(val2):
                merged_row[col] = val1
            elif pd.isna(val1):
                merged_row[col] = val2
            elif pd.isna(val2):
                merged_row[col] = val1
            elif isinstance(val1, str) and isinstance(val2, str):
                merged_row[col] = val1 if len(val1) >= len(val2) else val2

        final_df = pd.DataFrame([merged_row])

    return final_df

def get_company_info(company):
    print(f"Fetching SEC-EDGAR and LEI info for {company}")
    final_company = pd.DataFrame()
    sec_df = pd.DataFrame()
    html_content = search_sec_edgar_company(company)
    df = extract_companies_to_dataframe(html_content)
    df = pd.concat([df, search_sec_company(company)])
    info = fuzzy_match_all_companies_sec(company, df)

    if isinstance(info, pd.DataFrame):
        k = pd.concat([sec_df, info], ignore_index=True)

        p = get_lei_info(company)
        p = add_match_and_parent_count(p, company)

        if len(k) == 0:
            p_top = p.loc[p['average'].idxmax()].copy()

            if p_top['match_score'] > 80:
                p_top.loc['confidence_on_entity'] = 0.8
            elif 60 < p_top['match_score'] <= 80:
                p_top.loc['confidence_on_entity'] = 0.6
            else:
                print("There's no match here")
                return final_company

            all_columns = ["company_name"] + [col for col in (list(p.columns) + list(k.columns)) if
                                              col != "company_name"]
            final_company = pd.DataFrame([p_top]).reindex(columns=all_columns + ["confidence_on_entity"],
                                                          fill_value=None)
            cols_to_drop = [col for col in final_company.columns if
                            col.startswith(('match_score', 'parent_count', 'parent_percentage',
                                            'equality_score', 'average', 'final_average',
                                            'company_name_lower'))]
            final_company.drop(columns=cols_to_drop, errors="ignore", inplace=True)

            return final_company

        elif len(p) == 0:
            k_top = k.loc[k['match_score'].idxmax()].copy()

            if k_top['match_score'] > 80:
                k_top.loc['confidence_on_entity'] = 0.8
            elif 60 < k_top['match_score'] <= 80:
                k_top.loc['confidence_on_entity'] = 0.6
            else:
                print("There's no match here")

            all_columns = ["company_name"] + [col for col in (list(p.columns) + list(k.columns)) if
                                              col != "company_name"]
            final_company = pd.DataFrame([k_top]).reindex(columns=all_columns + ["confidence_on_entity"],
                                                          fill_value=None)
            cols_to_drop = [col for col in final_company.columns if
                            col.startswith(('match_score', 'parent_count', 'parent_percentage',
                                            'equality_score', 'average', 'final_average',
                                            'company_name_lower'))]
            final_company.drop(columns=cols_to_drop, errors="ignore", inplace=True)
            return final_company

        else:
            p, k = get_top_values_between_sec_and_lei(p, k)

        final_company = merge_ranked_dataframes(p,k)
        return final_company


In [13]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import wikipedia
from urllib.parse import unquote
import re

finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
classifier = pipeline("text-classification", model=finbert_model, tokenizer=finbert_tokenizer)

from rapidfuzz import fuzz, process

def check_sanctions(company_name, result):
    print("Verifyin sanctions...")
    df = pd.read_csv('/content/sdn.csv', encoding='latin-1')
    match_found = any(company_name.lower() in str(name).lower() for name in df["SDN Name"])
    result["Sanctioned (OFAC)"] = match_found
    result["Sanctions URL"] = "https://sanctionssearch.ofac.treasury.gov/"
    result["Sanctioned (OFAC) Match"] = ""

    if match_found:
        result["Sanctioned (OFAC) Match"] = next(
            (str(name) for name in df["SDN Name"] if company_name.lower() in str(name).lower()), ""
        )

    # Check the second sanctions file with fuzzy matching on "caption"
    try:
        df_sanctions = pd.read_csv("/content/sanctions.csv", encoding="utf-8")
        captions = df_sanctions["caption"].dropna().astype(str).tolist()
        match, score, _ = process.extractOne(company_name, captions, scorer=fuzz.token_sort_ratio)

        result["Sanctioned (Other)"] = score >= 90
        result["Sanctioned (Other) Match"] = match if score >= 90 else ""
        result["Sanctioned (Other) URL"] = "https://www.opensanctions.org/datasets/default/"  # optional
    except Exception as e:
        result["Sanctioned (Other)"] = False
        result["Sanctioned (Other) Match"] = ""



# ========== PANAMA PAPERS CHECK ==========
def check_panama_papers(company_name, result):
    print("Verifying Panama Papers...")
    search_url = f"https://offshoreleaks.icij.org/search?q={company_name.replace(' ', '+')}"
    r = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    # Look for exact match in the result rows
    found = False
    for tag in soup.select("div.search-results div.result h4"):
        name_text = tag.get_text(strip=True)
        if name_text.lower() == company_name.lower():
            found = True
            break

    if found:
        result["In Panama Papers"] = True
        result["Panama Leak URL"] = search_url


# ========== YAHOO NEWS LINKS ==========
def extract_real_url(yahoo_redirect_url):
    # Extracts the value of RU=... before /RK= or /RS=
    match = re.search(r"RU=(.+?)/(RK|RS|RZ)=", yahoo_redirect_url)
    if match:
        encoded_url = match.group(1)
        return unquote(encoded_url)
    return None

def get_yahoo_news_headlines(company_name, pages=3):
    headers = {"User-Agent": "Mozilla/5.0"}
    all_html = ""
    print("Verifying Yahoo News...")
    # Fetch Yahoo News results for multiple pages
    for page in range(pages):
        start = 1 + page * 10  # page 1: b=1, page 2: b=11, etc.
        query = company_name.replace(" ", "+")
        url = f"https://news.search.yahoo.com/search?p={query}&b={start}"
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            all_html += res.text
        else:
            print(f"Failed to fetch page {page + 1}")

    # Parse headlines from combined HTML
    soup = BeautifulSoup(all_html, "html.parser")
    headlines = []

    for h4 in soup.find_all("h4", class_="s-title"):
        a = h4.find("a")
        if a and a.text:
            headlines.append(a.get_text(strip=True))

    return headlines

# ========== SENTIMENT ANALYSIS ==========
def analyze_sentiment_with_finbert(company_name, result):
    headlines = get_yahoo_news_headlines(company_name)
    sentiment_map = {"positive": 1, "neutral": 0, "negative": -1}
    sentiment_scores = []
    print("Performing sentiment analysis...")
    for headline in headlines:
        try:
            output = classifier(headline)[0]
            label = output["label"].lower()
            score = round(output["score"], 3)
            sentiment_scores.append(sentiment_map[label])
            result["News Articles"].append({
                "headline": headline,
                "sentiment": label,
                "confidence": score
            })
        except Exception as e:
            continue

    if sentiment_scores:
        result["Average Sentiment Score"] = round(sum(sentiment_scores) / len(sentiment_scores), 3)
        result["Mentioned in Negative News"] = any(s < 0 for s in sentiment_scores)

# ========== NGO DETECTION ==========
def check_if_ngo(company_name, result):
    print("Verifying if NGO...")
    keywords = ['foundation', 'association', 'ngo', 'non-governmental', 'charity', 'relief', 'aid', 'humanitarian', 'mission', 'trust', 'society']
    name_lower = company_name.lower()
    result["Likely NGO"] = any(word in name_lower for word in keywords)

    try:
        summary = wikipedia.summary(company_name, sentences=2).lower()
        if any(word in summary for word in keywords):
            result["Likely NGO"] = True
            page_url = wikipedia.page(company_name).url
            result["NGO Wiki URL"] = page_url
    except Exception as e:
        pass  # If no page or summary found, just skip

def check_in_warrants_list(company_name, result, filepath="/content/warrants.txt"):
    print("Verifying warrants...")
    warrants = pd.read_fwf(filepath, header=None, names=["name"])
    names_list = warrants["name"].dropna().astype(str).tolist()
    match, score, _ = process.extractOne(company_name, names_list, scorer=fuzz.token_sort_ratio)

    result["In_Warrants_List"] = score >= 90
    result["Warrants_Matched_Name"] = match if score >= 90 else ""
    result["Warrants_Source"] = "OpenSanctions (warrants.txt)"

def check_in_regulatory_list(company_name, result, filepath="/content/regulatory.txt"):
    print("Verifying regulatory risks...")
    regulatory = pd.read_fwf(filepath, header=None, names=["name"])
    names_list = regulatory["name"].dropna().astype(str).tolist()
    match, score, _ = process.extractOne(company_name, names_list, scorer=fuzz.token_sort_ratio)

    result["In_Regulatory_List"] = score >= 90
    result["Regulatory_Matched_Name"] = match if score >= 90 else ""
    result["Regulatory_Source"] = "OpenSanctions (regulatory.txt)"

def check_in_debarred_list(company_name, result, filepath="/content/debarred.txt"):
    print("Verifying Debarred risks...")
    debarred = pd.read_fwf(filepath, header=None, names=["name"])
    names_list = debarred["name"].dropna().astype(str).tolist()
    match, score, _ = process.extractOne(company_name, names_list, scorer=fuzz.token_sort_ratio)

    result["In_Debarred_List"] = score >= 90
    result["Debarred_Matched_Name"] = match if score >= 90 else ""
    result["Debarred_Source"] = "OpenSanctions (debarred.txt)"

def check_in_pep_list(company_name, result, filepath="pep.txt"):
    print("Verifying PEP risks...")
    pep = pd.read_fwf(filepath, header=None, names=["name"])
    names_list = pep["name"].dropna().astype(str).tolist()
    match, score, _ = process.extractOne(company_name, names_list, scorer=fuzz.token_sort_ratio)

    result["Is_PEP"] = score >= 90
    result["PEP_Matched_Name"] = match if score >= 90 else ""
    result["PEP_Source"] = "OpenSanctions (pep.txt)"



def get_other_info_company(company_name):
    print(f"Company is {company_name}")
    result = {
        "Company Name": company_name,
        "Sanctioned (OFAC)": False,
        "Sanctions URL": "",
        "In Panama Papers": False,
        "Panama Leak URL": "",
        "Likely NGO": False,
        "NGO Wiki URL": "",
        "Mentioned in Negative News": False,
        "Average Sentiment Score": 0.0,
        "News Articles": []
    }
    check_sanctions(company_name, result)
    check_panama_papers(company_name, result)
    check_if_ngo(company_name, result)
    analyze_sentiment_with_finbert(company_name, result)
    check_in_warrants_list(company_name, result)
    check_in_debarred_list(company_name, result)

    final_result = pd.DataFrame([result])
    return final_result

def get_other_info_person(person):
    print(f"Person is {person}")
    result = {
        "Person Name": person,
        "Sanctioned (OFAC)": False,
        "Sanctions URL": "",
        "Is_PEP": False}

    check_sanctions(person, result)
    check_panama_papers(person, result)
    check_if_ngo(person, result)
    analyze_sentiment_with_finbert(person, result)
    check_in_warrants_list(person, result)
    check_in_debarred_list(person, result)
    check_in_pep_list(person, result)

    final_result = pd.DataFrame([result])
    return final_result




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [48]:
import spacy
import pandas as pd
import google.generativeai as genai
import json

prompt_org = f"Assume you are finance risk officer. You will be given 2 jsons. one json will have information on company name, corrobaration score, conformity score, the same for its parent, headquarters for both and possible CIK number. You have to calculate a risk score for that. A lower conformation and corrobartion score means it's compliant so lower risk. If SIC and CIK are available, that's lower risk because frequent filings. If parent is headquartered in a tax-haven, that needs to be penalised as risky. There will also be a confidence score for how confident we are of that, you need to generate a confidence score your risk score as well. Your output should be risk score, confidence score average and a summary of how you got there under key 'summary'. Json 2 will have true/false values on sanctions ofac and sanction others, presence in panama papers, in warrants, and in debarred lists. Any of these being true must be penalised much harshly than other. There will also be a sentiment analysis score, which if negative should be used to penalise but not extremely. This should have same output as previous json. At the end, final output should be final risk score average, final confidence average, full explaination from both json and a entity type classification of the company into ngo, corporation, shell company etc. Provide only final json output of risk score,confidence score, entity type and a list of supporting evidence from the json's where you got the scores from. Do not give me analysis of the seperate jsons, do not show the calculation. Supporting evidence should be a list of strings like Panama Papers, OFAC Sanctions. If you used corrobating, conformity and headquarters, th evidence is LEI sources. If any othe sanction or simialr values had returned false, don't add that in the evidence.Make the json as simple as possible like how an API response would be. All scores are between 0 and 1. All 5 keys including summary cannot be skipped and should be present. Summary cannot be empty. Supporting evidence must always be a list. The keys have to be risk_score, confidence_score, entity_type, summary and supporting_evidence. Don't change that. Json1 json 2 are as follows."
prompt_people = f"Assume you are finance risk officer. You will be given a json object about a person which will have information on if they were sanction, debarred, person of interest etc. Each will have score, everything except sentiment analysis will have high penalistion. Your output should be risk score, confidence score average and a small summary of how you got there. Json 2 will have true/false values on sanctions ofac and sanction others, presence in panama papers, in warrants, and in debarred lists. Any of these being true must be penalised much harshly than other. There will also be a sentiment analysis score, which if negative should be used to penalise but not extremely. This should have same output as previous json. At the end, final output should be final risk score average, final confidence average, a summary of how you got there under key 'summary' and a entity type classification of the company into pep, sanctioned, watchlisted. If none of these, just say person entity. Provide only final json output of risk score,confidence score, entity type and a list of supporting evidence from the json's where you got the scores from. Do not give me analysis of the seperate jsons, do not show the calculation. Supporting evidence should be a list of strings like Panama Papers, OFAC Sanctions for PEP, Sanctions etc. If any othe sanction or simialr values had returned false, don't add that in the evidence.Make the json as simple as possible like how an API response would be.All scores are between 0 and 1, and need to be able to be parsed as numbers.  The keys have to be risk_score, confidence_score, entity_type, summary and supporting_evidence. Don't change that. Json1 is as follows"
genai.configure(api_key="")

model = genai.GenerativeModel("gemini-2.0-pro-exp")

nlp = spacy.load("./fine_tuned_spacy_model")


text = "On March 15th, 2024, a large international wire transfer was initiated from HSBC Holdings in London to BlackRock Inc. in New York. The transaction, valued at $450 million, was flagged due to references to Elon Musk and Christine Lagarde in accompanying documents."

def get_info(text):
    r = process_entities(get_entities(text))
    df = r[r['label'] != 'LOC']
    result = {
    "text": df["text"].tolist(),
    "label": df["label"].tolist(),
    "risk_score": df["risk_score"].tolist(),
    "confidence_score": df["confidence_score"].tolist(),
    "entity_type": df["entity_type"].tolist(),
    "supporting_evidence": df["supporting_evidence"].tolist(),
    "summary": " ".join(df["summary"].tolist())
    }
    result_df = pd.DataFrame(result)
    return result_df

def get_entities(text):
    doc = nlp(text)
    entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    return pd.DataFrame(entities)

def process_entities(df):
    transaction_entity_info = df
    for idx, row in df.iterrows():
        name = row["text"]
        label = row["label"]

        if label == "ORG":
            print(f"\n Processing ORG: {name}")
            json1 = get_other_info_company(name).to_json(orient="records", indent=2)
            json2 = get_company_info(name).to_json(orient="records", indent=2)
            response_pd = get_gemini_score_org(json1, json2)
            response_pd['supporting_evidence'] = response_pd['supporting_evidence'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else str(x))
            for col in response_pd.columns:
                transaction_entity_info.at[idx, col] = response_pd.iloc[0][col]

        elif label == "PER":
            print(f"\n Processing PER: {name}")
            json1 = get_other_info_person(name).to_json(orient="records", indent=2)
            response_pd = get_gemini_score_person(json1)
            response_pd['supporting_evidence'] = response_pd['supporting_evidence'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else str(x))
            for col in response_pd.columns:
                transaction_entity_info.at[idx, col] = response_pd.iloc[0][col]
        else:
            print(f"\n Skipping: {name} (label: {label})")
    return transaction_entity_info

def get_gemini_score_org(json1, json2):
    print("Getting Gemini score for org...")
    response = model.generate_content(prompt_org + json1 + json2)
    print(response.text)
    cleaned = str(response.text).replace("```json", "").replace("```", "").strip()
    data = json.loads(cleaned)
    return pd.DataFrame([data])

def get_gemini_score_person(json1):
    print("Getting Gemini score for person...")
    response = model.generate_content(prompt_people + json1)
    print(response.text)
    cleaned = str(response.text).replace("```json", "").replace("```", "").strip()
    data = json.loads(cleaned)
    return pd.DataFrame([data])

answer = get_info(text)
answer


 Processing ORG: HSBC Holdings
Company is HSBC Holdings
Verifyin sanctions...
Verifying Panama Papers...
Verifying if NGO...
Verifying Yahoo News...
Performing sentiment analysis...
Verifying warrants...
Verifying Debarred risks...
Fetching SEC-EDGAR and LEI info for HSBC Holdings


  final_df = final_df.groupby(axis=1, level=0).first()


Getting Gemini score for org...
```json
{
  "risk_score": 0.25,
  "confidence_score": 0.94,
  "entity_type": "Corporation",
  "summary": "Risk score calculated based on LEI data (conformity, corroboration, CIK/SIC presence, headquarters) and external checks (sanctions, adverse media). Risk remains low due to full corroboration, LEI conformity, presence of regulatory identifiers (CIK/SIC), and non-tax haven headquarters. No sanctions, warrants, debarment, or Panama Papers links were found. A slight increase in risk is attributed to mentions in negative news, partially offset by a slightly positive average sentiment score. Confidence in the assessment is high, based on reliable LEI data and verified checks, moderated slightly by the confidence in news sentiment analysis.",
  "supporting_evidence": [
    "LEI sources",
    "Negative News"
  ]
}
```

 Skipping: London (label: LOC)

 Processing ORG: BlackRock Inc.
Company is BlackRock Inc.
Verifyin sanctions...
Verifying Panama Papers...
Ve

  final_df = final_df.groupby(axis=1, level=0).first()


Getting Gemini score for org...
```json
{
  "risk_score": 0.1,
  "confidence_score": 0.975,
  "entity_type": "Corporation",
  "summary": "Risk is assessed as very low. This is based on perfect conformity and corroboration scores from LEI data for both the entity and its parent, headquarters located in the US (not a tax haven), and the availability of a CIK number suggesting regular filings. Additionally, the entity has no adverse findings regarding OFAC or other sanctions, Panama Papers involvement, warrants, or debarment lists. Average news sentiment is slightly positive, further supporting the low-risk assessment despite some negative news mentions.",
  "supporting_evidence": [
    "LEI sources",
    "CIK"
  ]
}
```

 Skipping: New York (label: LOC)

 Processing PER: Christine Lagarde
Person is Christine Lagarde
Verifyin sanctions...
Verifying Panama Papers...
Verifying if NGO...
Verifying Yahoo News...
Performing sentiment analysis...
Verifying warrants...
Verifying Debarred risks..

Unnamed: 0,text,label,risk_score,confidence_score,entity_type,supporting_evidence,summary
0,HSBC Holdings,ORG,0.25,0.94,Corporation,"LEI sources, Negative News",Risk score calculated based on LEI data (confo...
1,BlackRock Inc.,ORG,0.1,0.975,Corporation,"LEI sources, CIK",Risk score calculated based on LEI data (confo...
2,Christine Lagarde,PER,0.85,0.9,PEP,"PEP, Negative Sentiment Score, Mentioned in Ne...",Risk score calculated based on LEI data (confo...


In [50]:
from transformers import pipeline

summarizer = pipeline("summarization")

def summarize_dict(input_dict):
  summary = input_dict.get('summary')
  summaries = summarizer(summary, max_length=50, min_length=30, do_sample=False)
  input_dict['summary'] = summaries[0]['summary_text']
  return input_dict



No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [51]:
answer_final = {
    "entities": answer["text"].tolist(),
    "label": answer["label"].tolist(),
    "risk_score": answer["risk_score"].tolist(),
    "confidence_score": answer["confidence_score"].tolist(),
    "entity_type": answer["entity_type"].tolist(),
    "supporting_evidence": answer["supporting_evidence"].tolist(),
    "summary": " ".join(answer["summary"].tolist())  # Concatenate reasoning into one string
}

response_text = summarize_dict(answer_final)
response_text

{'entities': ['HSBC Holdings', 'BlackRock Inc.', 'Christine Lagarde'],
 'label': ['ORG', 'ORG', 'PER'],
 'risk_score': [0.25, 0.1, 0.85],
 'confidence_score': [0.94, 0.975, 0.9],
 'entity_type': ['Corporation', 'Corporation', 'PEP'],
 'supporting_evidence': ['LEI sources, Negative News',
  'LEI sources, CIK',
  'PEP, Negative Sentiment Score, Mentioned in Negative News'],
 'summary': " Risk score calculated based on LEI data (conformity, corroboration, CIK/SIC presence, headquarters) and external checks (sanctions, adverse media) Risk score is elevated primarily due to individual's confirmed Politically"}