<H2><center> Corporate Data Extraction

<H4>Description:</H4> This notebook preprocesses company names by removing common terms (e.g., "Ltd," "Solutions," "Technologies" etc) and applying text normalization (lowercasing and lemmatization). It uses TF-IDF vectorization and cosine similarity to match a query name against a database, enabling identification even with variations (e.g., "ANGLO-CARIBBEAN CO., LTD." matches "angl carribean"). The data sources include SEC EDGAR, Wikidata, the OFAC Sanctions Database, and recent news articles mentioning company names.

In [None]:
import os
import numpy as np
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz

In [2]:
class CompanyMatcher:
    def __init__(self, database):
        self.lemmatizer = WordNetLemmatizer()
        self.raw_database = database  
        self.database = [self.preprocess_text(name) for name in database]  # Preprocessed database
        
        self.vectorizer = TfidfVectorizer()
        if self.database:
            self.vectors = self.vectorizer.fit_transform(self.database)
        else:
            self.vectors = None 

    def preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)  
        text = re.sub(r'\s+', ' ', text).strip() 
        
        stopwords = {"corporation", "limited", "ltd", "solutions", "technologies", 
                     "consulting", "consultancy", "services", "systems", "group", 
                     "inc", "pvt", "plc", "co"}
        
        words = text.split()
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in stopwords]  # Apply lemmatization
        return " ".join(words)

    def get_top_cosine_matches(self, query, top_n=5):
        if not self.database:  
            return []

        query_cleaned = self.preprocess_text(query)
        query_vector = self.vectorizer.transform([query_cleaned])
        similarity_scores = cosine_similarity(query_vector, self.vectors).flatten()

        top_n = min(top_n, len(similarity_scores)) 
        top_indices = np.argsort(-similarity_scores)[:top_n]
        top_matches = [(self.raw_database[i], similarity_scores[i]) for i in top_indices]

        return top_matches

    def apply_fuzzy_matching(self, query, candidates):
        if not candidates:
            return None, None

        query_cleaned = self.preprocess_text(query)
        best_match, best_score = max(
            ((name, fuzz.ratio(query_cleaned, self.preprocess_text(name))) for name, _ in candidates),
            key=lambda x: x[1]
        )

        return (best_match, best_score)

    def find_best_match(self, query, top_n=5):
        top_matches = self.get_top_cosine_matches(query, top_n)
        best_match, best_score = self.apply_fuzzy_matching(query, top_matches)
        
        return best_match, best_score

In [None]:
class CompanyScreening:
    def __init__(self, ofac_list):
        self.SEC_BASE_URL = "https://www.sec.gov/cgi-bin/browse-edgar"
        self.WIKIDATA_URL = "https://query.wikidata.org/sparql"
        self.NEWS_API_KEY = os.getenv("News_APIKEY")
        self.OFAC_LIST_FILE = ofac_list
        with open(self.OFAC_LIST_FILE, "r") as f:
            self.ofac_companies = [line.strip() for line in f]
        self.matcher = CompanyMatcher(self.ofac_companies)

    def check_sec_edgar(self, company_name):
        params = {"action": "getcompany", "company": company_name, "output": "atom"}
        headers = {"User-Agent": "XXX (xxx@yyy.com)"}
        response = requests.get(self.SEC_BASE_URL, params = params, headers = headers)
        if response.status_code == 200 and "No matching companies" not in response.text:
            soup = BeautifulSoup(response.text, "xml")
            cik_tag = soup.find("cik")
            edgar_profile_url = None
            recent_8k_filings = []
            
            if cik_tag:
                cik = cik_tag.text.strip()

            three_years_ago = datetime.now() - timedelta(days=3*365)
            filings = soup.find_all("entry")

            for filing in filings:
                title = filing.find("title").text
                date_str = filing.find("updated").text[:10]
                filing_date = datetime.strptime(date_str, "%Y-%m-%d")

                if "8-K" in title and filing_date >= three_years_ago:
                    recent_8k_filings.append(title)

            return {
                "SEC Registered": "Yes",
                "CIK": cik if cik_tag else "Not available",
                "Recent 8-K Filings": recent_8k_filings[:3] if recent_8k_filings else "None"
            }
        return {
            "SEC Registered": "No"
        }

    def check_ofac_sanctions(self, company_name, threshold=85):
        match, score = self.matcher.find_best_match(company_name)
        return {
            "OFAC Sanctioned": "Yes" if score >= threshold else "No",
            "Closest OFAC Database Match": match if score >= threshold else "None",
        }

    def check_wikidata_scandals(self, company_name):
        
        query = f"""
        SELECT ?company ?companyLabel ?industry ?industryLabel ?scandal ?scandalLabel ?description WHERE {{
          ?company rdfs:label "{company_name}"@en.
          ?company wdt:P31 wd:Q4830453.  # Instance of (Business/Company)
          OPTIONAL {{ ?company wdt:P452 ?industry. }}  # Industry type
          OPTIONAL {{ ?company schema:description ?description. FILTER (LANG(?description) = "en") }}

          # Looking for scandals
          OPTIONAL {{ ?company wdt:P793 ?scandal. }}  # Significant events (may include fraud cases, controversies)
          OPTIONAL {{ ?company wdt:P5053 ?scandal. }} # Cause of dissolution (bankruptcy, fraud)
          OPTIONAL {{ ?company wdt:P2416 ?scandal. }} # Scandals (direct connection)

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        """
        response = requests.get(self.WIKIDATA_URL, params={"query": query, "format": "json"})

        if response.status_code != 200:
            return {"Error": "Failed to fetch data"}

        data = response.json().get("results", {}).get("bindings", [])

        if not data:
            return {"Status": "Not Found in Wikidata"}

        result = data[0]  # Take the first matched entry
        company_qid = result["company"]["value"].split("/")[-1]
        
        description = result.get("description", {}).get("value", "No description available")
        if "scandal" in result:
            scandal_name = result["scandalLabel"]["value"]
            scandal_qid = result["scandal"]["value"].split("/")[-1]
            scandal_link = f"https://www.wikidata.org/wiki/{scandal_qid}"

        return {
            "Wikidata_QID": company_qid,
            "Description": description,
            "Scandals": (scandal_name, scandal_link) if "scandal" in result else "No known scandals",
        }
    def get_news(self, company_name):
        url = f"https://newsapi.org/v2/everything?q={company_name}&language=en&sortBy=publishedAt&apiKey={self.NEWS_API_KEY}"
        response = requests.get(url)
        if response.status_code != 200:
            return {"Recent News": "Error fetching news"}

        news_data = response.json()
        articles = news_data.get("articles", [])

        filtered_articles = [article["title"] for article in articles if company_name.lower() in article["title"].lower()]
        return {"Recent News": filtered_articles[:3] if filtered_articles else "No relevant news found"}

    def screen_company(self, company_name):
        result = {"Company": company_name}
        result.update(self.check_sec_edgar(company_name))
        result.update(self.check_ofac_sanctions(company_name))
        result.update(self.check_wikidata_scandals(company_name))
        result.update(self.get_news(company_name))
        return result

In [None]:
company_screening = CompanyScreening("Data/ofac_list.txt")

In [5]:
company = "Theranos"
results = company_screening.screen_company(company)
print(json.dumps(results, indent=4))

{
    "Company": "Theranos",
    "SEC Registered": "Yes",
    "CIK": "0001313697",
    "Recent 8-K Filings": "None",
    "OFAC Sanctioned": "No",
    "Closest OFAC Database Match": "None",
    "Status": "Not Found in Wikidata",
    "Recent News": [
        "Walgreens once ruled. Then came Amazon, Theranos \u2014 and some costly bets",
        "Convicted fraudster and former Theranos CEO Elizabeth Holmes is still guilty [Followup]",
        "Federal court denies Theranos founder Elizabeth Holmes appeal to overturn fraud conviction"
    ]
}


In [6]:
company = "angl carribean"
results = company_screening.screen_company(company)
print(json.dumps(results, indent=4))

{
    "Company": "angl carribean",
    "SEC Registered": "Yes",
    "CIK": "Not available",
    "Recent 8-K Filings": "None",
    "OFAC Sanctioned": "Yes",
    "Closest OFAC Database Match": "ANGLO-CARIBBEAN CO., LTD.",
    "Status": "Not Found in Wikidata",
    "Recent News": "No relevant news found"
}


In [7]:
company = "Wirecard"
results = company_screening.screen_company(company)
print(json.dumps(results, indent=4))

{
    "Company": "Wirecard",
    "SEC Registered": "Yes",
    "CIK": "0001586941",
    "Recent 8-K Filings": "None",
    "OFAC Sanctioned": "No",
    "Closest OFAC Database Match": "None",
    "Wikidata_QID": "Q587325",
    "Description": "insolvent German financial services provider",
    "Scandals": [
        "Wirecard scandal",
        "https://www.wikidata.org/wiki/Q96655771"
    ],
    "Recent News": [
        "Bulgarians convicted in UK of being Russian spies working for Wirecard fugitive (Reuters)",
        "The Wirecard fugitive, Russian intelligence and a Bulgarian spy ring"
    ]
}


In [8]:
company = "Lehman Brothers"
results = company_screening.screen_company(company)
print(json.dumps(results, indent=4))

{
    "Company": "Lehman Brothers",
    "SEC Registered": "Yes",
    "CIK": "0001437948",
    "Recent 8-K Filings": "None",
    "OFAC Sanctioned": "No",
    "Closest OFAC Database Match": "None",
    "Wikidata_QID": "Q212900",
    "Description": "defunct American financial services firm",
    "Scandals": [
        "bankruptcy of Lehman Brothers",
        "https://www.wikidata.org/wiki/Q3269580"
    ],
    "Recent News": [
        "FTX\u2019s US$950 million bankruptcy fees among costliest since Lehman Brothers",
        "FTX\u2019s US$950 million bankruptcy fees among costliest since Lehman Brothers"
    ]
}
