## Company Matching Project
### Entity Resolution for Firm Names

#### Learning Objectives:
- Import and inspect datasets
- Clean and standardize company names
- Implement an efficient matching algorithm
- Evaluate and save matched results

---
## Part 1: Data Import & Inspection


In [129]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [130]:
pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [131]:
import multiprocessing
import re
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz, process
from Levenshtein import distance as levenshtein_distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity

In [132]:
# Read the two sample CSV files whose company names will be matched against each other.
data1 = pd.read_csv(filepath_or_buffer="Data1_Sample.csv")
data2 = pd.read_csv(filepath_or_buffer="Data2_Sample.csv")

In [133]:
# Preview the first five rows of Data 1.
data1.head(n=5)


Unnamed: 0,rcid,company,factset_entity_id
0,568988,"Wolfspeed, Inc.",0010YB-E
1,88757,"C3.ai, Inc.",07W7MZ-E
2,127582,Luminex Corp.,002SFF-E
3,22142783,"Walmart, Inc.",000YMS-E
4,1263833,"Cerence, Inc.",0LCVZ7-E


In [134]:
# Preview the first five rows of Data 2.
data2.head(n=5)


Unnamed: 0,Company_Name,Exchange:Ticker,Excel_Company_ID,Business_Description,Company_Status,Company_Type,Product_Description
0,Reiss Corporation,-,IQ4289118,"Reiss Corporation, a contract manufacturer, op...",Operating,Private Company,-
1,"Arms & Cole, Inc.",-,IQ12144982,"Arms & Cole, Inc. operates as a residential an...",Operating Subsidiary,Private Company,-
2,"Glenoit Corp., Consumer Products and Ex-Cell D...",-,IQ2458860,Consumer Products and Ex-Cell Division of Glen...,Operating,Assets/Products,-
3,"United Livestock Commodities, Inc.",-,IQ285525694,Wholesales Livestock (100%).,Operating,Private Company,-
4,Avaya Holdings Corp. (NYSE:AVYA),NYSE:AVYA,IQ224812364,"Avaya Holdings Corp., through its subsidiaries...",Operating,Public Company,"A.I.Connect:\nA.I.Connect, an Avaya-led initia..."


## Part 2: Data Cleaning & Preprocessing

In [135]:
def clean_company_name(name):
    """Normalise a raw company name into a lowercase key suitable for matching.

    Steps, in order: lowercase; drop common legal-form suffixes; drop anything in
    parentheses; drop exchange-prefixed ticker tokens; normalise spacing around an
    ampersand that joins two single letters; remove every character other than
    a-z, 0-9, space and '&'; collapse runs of whitespace and trim the ends.

    Parameters
    ----------
    name : object
        Raw company name. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned name, or ``""`` when the input is missing.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and matched as names.
    if name is None or (isinstance(name, float) and name != name):
        return ""

    name = str(name).lower()

    # Legal-form suffixes. The trailing (?!\w) is used instead of \b so that "s.a."
    # (which ends in a non-word character) can also match before a space or end of string.
    name = re.sub(r'\b(?:corp|corporation|inc|ltd|llc|plc|co|s\.a\.|gmbh|ag)(?!\w)', '', name)

    # Text inside parentheses (tickers, extra info).
    name = re.sub(r'\([^)]*\)', '', name)

    # Exchange-prefixed tokens. The group keeps \b applied to every alternative so that
    # words merely containing "amex"/"nasdaq" (e.g. "camex") are left intact.
    name = re.sub(r'\b(?:nyse|nasdaq|amex)\S*', '', name)

    # Single letters joined by '&' ("a&p" -> "a & p"). The pattern is lowercase because
    # the name has already been lowercased above.
    name = re.sub(r'(?<=\b[a-z])\s*&\s*(?=[a-z]\b)', ' & ', name)

    name = re.sub(r'[^a-z0-9 &]', '', name)

    # Removing suffixes and punctuation leaves stray spaces at the ends; trim them so
    # equal names compare equal.
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [136]:
# Derive a normalised "clean_company" column in each dataset from its raw name column.
data1["clean_company"] = data1["company"].map(clean_company_name)
data2["clean_company"] = data2["Company_Name"].map(clean_company_name)

In [137]:
# Inspect the first ten rows of Data 1, now including the clean_company column.
data1.head(n=10)

Unnamed: 0,rcid,company,factset_entity_id,clean_company
0,568988,"Wolfspeed, Inc.",0010YB-E,wolfspeed
1,88757,"C3.ai, Inc.",07W7MZ-E,c3ai
2,127582,Luminex Corp.,002SFF-E,luminex
3,22142783,"Walmart, Inc.",000YMS-E,walmart
4,1263833,"Cerence, Inc.",0LCVZ7-E,cerence
5,896391,"Digi International, Inc.",000HKB-E,digi international
6,288789,"Airgain, Inc.",007H7D-E,airgain
7,1179262,"PowerFleet, Inc.",0L11RW-E,powerfleet
8,22264555,"Leslie's, Inc.",0MY33Y-E,leslies
9,1489790,General Electric Co.,000KYG-E,general electric


In [138]:
# Inspect the first five rows of Data 2, now including the clean_company column.
data2.head(n=5)

Unnamed: 0,Company_Name,Exchange:Ticker,Excel_Company_ID,Business_Description,Company_Status,Company_Type,Product_Description,clean_company
0,Reiss Corporation,-,IQ4289118,"Reiss Corporation, a contract manufacturer, op...",Operating,Private Company,-,reiss
1,"Arms & Cole, Inc.",-,IQ12144982,"Arms & Cole, Inc. operates as a residential an...",Operating Subsidiary,Private Company,-,arms & cole
2,"Glenoit Corp., Consumer Products and Ex-Cell D...",-,IQ2458860,Consumer Products and Ex-Cell Division of Glen...,Operating,Assets/Products,-,glenoit consumer products and excell division
3,"United Livestock Commodities, Inc.",-,IQ285525694,Wholesales Livestock (100%).,Operating,Private Company,-,united livestock commodities
4,Avaya Holdings Corp. (NYSE:AVYA),NYSE:AVYA,IQ224812364,"Avaya Holdings Corp., through its subsidiaries...",Operating,Public Company,"A.I.Connect:\nA.I.Connect, an Avaya-led initia...",avaya holdings


## Part 3: Matching Algorithm Implementation


In [139]:
# Character-level TF-IDF over 2- and 3-grams of the cleaned names.
# NOTE(review): the vocabulary and IDF weights are learned from Data 1 only; Data 2 is
# projected onto that vocabulary. Confirm this is intended rather than fitting on both.
vectorizer = TfidfVectorizer(ngram_range=(2, 3), analyzer='char')
vectorizer.fit(data1["clean_company"])
tfidf_data1 = vectorizer.transform(data1["clean_company"])
tfidf_data2 = vectorizer.transform(data2["clean_company"])

In [140]:
def best_match(company_index, fuzzy_threshold=90, max_edit_distance=3,
               tfidf_confirm_threshold=0.7, tfidf_only_threshold=0.75):
    """Pick the best Data 2 name for one Data 1 row, or report no match.

    Reads the notebook globals ``data1``, ``data2`` and ``cosine_similarities``
    (row ``company_index`` of the similarity matrix must correspond to row
    ``company_index`` of ``data1``).

    Three candidates are computed against every cleaned Data 2 name: the highest
    TF-IDF cosine similarity, the best ``fuzz.token_sort_ratio`` match, and the
    smallest Levenshtein distance. They are then tried in this order:

    1. fuzzy match, if its score exceeds ``fuzzy_threshold`` and the best TF-IDF
       score exceeds ``tfidf_confirm_threshold``;
    2. Levenshtein match, if its distance is below ``max_edit_distance`` and the
       best TF-IDF score exceeds ``tfidf_confirm_threshold``;
    3. TF-IDF match, if its score exceeds ``tfidf_only_threshold``.

    Parameters
    ----------
    company_index : int
        Positional index of the row in ``data1`` to match.
    fuzzy_threshold, max_edit_distance, tfidf_confirm_threshold, tfidf_only_threshold
        Cut-offs for the rules above; the defaults are the values the notebook
        has always used.

    Returns
    -------
    tuple
        ``(matched_clean_name, score)``, or ``("", 0)`` when no rule accepts.
        The score is always the best TF-IDF similarity for this row, even when
        the returned name came from the fuzzy or Levenshtein candidate.
    """
    company = data1.iloc[company_index]["clean_company"]

    # Materialise the candidate names once per call instead of once per scorer.
    candidates = data2["clean_company"].tolist()

    similarity_row = cosine_similarities[company_index]
    best_tfidf_match_idx = np.argmax(similarity_row)
    best_tfidf_score = similarity_row[best_tfidf_match_idx]
    best_tfidf_match = candidates[best_tfidf_match_idx]

    best_fuzzy_match, fuzzy_score = process.extractOne(company, candidates, scorer=fuzz.token_sort_ratio)
    best_levenshtein_match = min(candidates, key=lambda x: levenshtein_distance(company, x))
    best_levenshtein_score = levenshtein_distance(company, best_levenshtein_match)

    # Precision filtering: the string-based candidates are accepted only when the
    # TF-IDF similarity for this row is also reasonably high.
    if fuzzy_score > fuzzy_threshold and best_tfidf_score > tfidf_confirm_threshold:
        return best_fuzzy_match, best_tfidf_score
    elif best_levenshtein_score < max_edit_distance and best_tfidf_score > tfidf_confirm_threshold:
        return best_levenshtein_match, best_tfidf_score
    elif best_tfidf_score > tfidf_only_threshold:
        return best_tfidf_match, best_tfidf_score
    else:
        return "", 0

In [141]:
# Pairwise cosine similarity: one row per Data 1 name, one column per Data 2 name.
cosine_similarities = cosine_similarity(X=tfidf_data1, Y=tfidf_data2)

In [142]:
# best_match and the globals it reads are defined only in this notebook session, so
# worker processes can use them only if they are forked from the kernel. Request the
# "fork" start method explicitly and fall back to a serial loop where it is not
# available, instead of relying on the platform default start method.
row_indices = range(len(data1))
if "fork" in multiprocessing.get_all_start_methods():
    with multiprocessing.get_context("fork").Pool(cpu_count()) as pool:
        matched_results = pool.map(best_match, row_indices)
else:
    matched_results = [best_match(row_index) for row_index in row_indices]

In [143]:
# Transpose the list of (name, score) pairs into two parallel tuples.
name_score_columns = list(zip(*matched_results))
matched_companies, matched_scores = name_score_columns

In [144]:
# Map each cleaned Data 2 name to its Excel_Company_ID. When several rows share a
# cleaned name, keep the first one so the ID comes from the earliest such row rather
# than silently from the last.
data2_dict = (
    data2.drop_duplicates(subset="clean_company", keep="first")
    .set_index("clean_company")["Excel_Company_ID"]
    .to_dict()
)

# An empty string means "no match". Never look it up: a Data 2 row whose name cleaned
# to "" would otherwise hand its ID to every unmatched company.
matched_ids = [data2_dict.get(match, "") if match else "" for match in matched_companies]

In [145]:
# Attach the match results to Data 1: first the matched ID, then the matched cleaned name.
new_columns = {
    "Matched_Excel_Company_ID": matched_ids,
    "Best_Match_Company_Name": matched_companies,
}
for column_name, column_values in new_columns.items():
    data1[column_name] = column_values


In [148]:
# Inspect the first ten rows of Data 1 with the matched ID and name columns attached.
data1.head(n=10)

Unnamed: 0,rcid,company,factset_entity_id,clean_company,Matched_Excel_Company_ID,Best_Match_Company_Name
0,568988,"Wolfspeed, Inc.",0010YB-E,wolfspeed,,
1,88757,"C3.ai, Inc.",07W7MZ-E,c3ai,IQ24249083,c3ai
2,127582,Luminex Corp.,002SFF-E,luminex,IQ411496,luminex
3,22142783,"Walmart, Inc.",000YMS-E,walmart,IQ313055,walmart
4,1263833,"Cerence, Inc.",0LCVZ7-E,cerence,IQ634127183,cerence
5,896391,"Digi International, Inc.",000HKB-E,digi international,IQ266694,digi international
6,288789,"Airgain, Inc.",007H7D-E,airgain,IQ10460385,airgain
7,1179262,"PowerFleet, Inc.",0L11RW-E,powerfleet,IQ238361,powerfleet
8,22264555,"Leslie's, Inc.",0MY33Y-E,leslies,IQ691092297,leslies
9,1489790,General Electric Co.,000KYG-E,general electric,IQ177031,general electric company


## Part 4: Results & Evaluation

In [147]:
# Write the matched Data 1 table to disk without the DataFrame index.
output_path = "Matched_sample_final.csv"
data1.to_csv(path_or_buf=output_path, index=False)