In [1]:
## Load Libraries

import pandas as pd
import os
import re

In [90]:
## Load and Clean Data

# Raw snapshot is kept untouched in `raw`; `rawn` holds only rows that have a ticker.
raw = pd.read_csv("data/wilshire_052924.csv")
rawn = raw.copy().dropna(subset=["Ticker"])

# Market cap arrives as text with "$" and "," in it. Both are removed as literal
# characters: regex=False is spelled out because "$" is a regex anchor and the
# default of `str.replace` is not the same in every pandas release.
# The conversion runs on `rawn` (not `raw`) so rows already discarded above
# cannot make `astype(float)` fail.
rawn["Market Cap ($M)"] = (
    rawn["Market Cap ($M)"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# Keep rows whose "Market Cap ($M)" value is at least 20, one row per company
# name, with the cap column shortened to "Cap".
corps = (
    rawn.loc[
        rawn["Market Cap ($M)"] >= 20,
        ["Ticker", "Name", "Sector", "Market Cap ($M)"],
    ]
    .drop_duplicates(subset="Name")
    .rename(columns={"Market Cap ($M)": "Cap"})
    .reset_index(drop=True)
)

In [104]:
## Cleaned Company Names

# Single-pass character table: commas and periods are deleted, a backtick
# becomes an apostrophe.
_punct_table = str.maketrans({",": None, ".": None, "`": "'"})

# For every company name: drop any "(...)" group, apply the character table,
# then append one trailing space so space-delimited suffix patterns can still
# match at the very end of the name.
companies = [
    re.sub(r"\([^)]*\)", "", str(raw_name)).translate(_punct_table) + " "
    for raw_name in corps["Name"].values
]

# Words and suffixes to delete from each name (matched case-insensitively).
# The bare words match anywhere in the name; the space-padded ones only match
# as standalone tokens. "holdings" is listed before "holding" so the longer
# form is tried first by the alternation.
# Gotcha: a space-padded match consumes both of its spaces, so in a run such
# as " Co Inc " the second suffix loses its leading space and survives glued
# to the previous word.
patterns = [
    "incorporated", "corporation", "company", "companies",
    "holdings", "holding",
    " inc ", " corp ", " & co ", " co ", " plc ", " llc ", " ltd ",
]
regex_pattern = "|".join(map(re.escape, patterns))
_suffix_re = re.compile(regex_pattern, flags=re.IGNORECASE)

# One cleaned name per entry of `companies`, in the same order.
cleaned = [_suffix_re.sub("", padded_name).strip() for padded_name in companies]

# Hand-maintained corrections: {name produced by the regex pass: name to use}.
# Applied in listed order; each entry rewrites the first matching element of
# `cleaned`, and a name that is not present raises ValueError.
manual_overrides = {
    # "Inc" left fused to (or trailing) the preceding word
    "Church & DwightInc": "Church & Dwight",
    "American Water WorksInc": "American Water Works",
    "Best BuyInc": "Best Buy",
    "TREXInc": "TREX",
    "DonaldsonInc": "Donaldson",
    "Simpson ManufacturingInc": "Simpson Manufacturing",
    "MSC Industrial DirectInc": "MSC Industrial Direct",
    "Franklin ElectricInc": "Franklin Electric",
    "Boston BeerInc": "Boston Beer",
    "GannettInc": "Gannett",
    "ManitowocInc": "Manitowoc",
    "Maui Land & PineappleInc": "Maui Land & Pineapple",
    "LannettInc": "Lannett",
    "MerckInc": "Merck",
    "KKRInc": "KKR",
    "McCormickInc": "McCormick",
    "Sturm RugerInc": "Sturm Ruger",
    "GreenhillInc": "Greenhill",
    "Comstock Inc": "Comstock",
    # "Inc" put back on purpose
    "Team": "Team Inc",
    "Dow": "Dow Inc",
    "Visa": "Visa Inc",
    # "com" endings left behind once the period was removed
    "Amazoncom": "Amazon",
    "Alarmcom": "Alarm.com",
    "Carscom": "Cars.com",
    "1-800 Flowerscom": "1-800 Flowers.com",
    "CarPartscom": "CarParts.com",
    # hand-picked preferred names
    "Lilly": "Eli Lilly",
    "Meta Platforms": "Meta",
    "Uber Technologies": "Uber",
    "Chipotle Mexican Grill": "Chipotle",
    "Skechers U S A": "Skechers USA",
    "Sanfilippo  & Son": "John B Sanfilippo & Son",
    # trailing "Cos" abbreviation
    "Lowe's Cos": "Lowe's",
    "Marsh & McLennan Cos": "Marsh & McLennan",
    "Williams Cos": "Williams",
    "Estee Lauder Cos": "Estee Lauder",
    "Greenbrier Cos": "Greenbrier",
    "Haverty Furniture Cos": "Haverty Furniture",
    "Kingstone Cos": "Kingstone",
    # names where "company"/"companies" is part of the name itself
    "Noodles &": "Noodles & Company",
    "Superior Group of": "Superior Group of Companies",
}

for regex_output, preferred_name in manual_overrides.items():
    cleaned[cleaned.index(regex_output)] = preferred_name

# Attach the cleaned names as a new column and persist the full table.
corps["NameCln"] = cleaned
corps.to_csv("data/corps.csv", index=False)

In [93]:
## Manually Search Output

# Dump one cleaned name per line for manual inspection.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default codec (which can fail on non-ASCII company names).
with open("cleaning.txt", "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(cleaned))

In [94]:
## Group by Sectors and Save

# Output group -> the "Sector" values that belong to it.
sector_groups = {
    "industrials": ["Industrials", "Basic Materials"],
    "healthcare": ["Healthcare"],
    "finance": ["Financial Services", "Real Estate"],
    "tech": ["Technology", "Communication Services"],
    "consumer": ["Consumer Cyclical", "Consumer Defensive"],
    "energy": ["Energy", "Utilities"],
}

# One re-indexed frame per group, built from the same filter expression.
sector_frames = {
    group: corps[corps["Sector"].isin(sectors)].reset_index(drop=True)
    for group, sectors in sector_groups.items()
}

# Keep the individual module-level names that the rest of the notebook uses.
industrials = sector_frames["industrials"]
healthcare = sector_frames["healthcare"]
finance = sector_frames["finance"]
tech = sector_frames["tech"]
consumer = sector_frames["consumer"]
energy = sector_frames["energy"]

names = list(sector_groups)

# `to_csv` does not create missing directories, so make sure the target exists
# before writing (no-op when it is already there).
os.makedirs("data/corps", exist_ok=True)

for corp in names:
    sector_frames[corp].to_csv(f"data/corps/{corp}.csv", index=False)

In [95]:
## Load Data Frames

names = ["industrials", "healthcare", "finance", "tech", "consumer", "energy"]

# Read each sector file into a dict keyed by group name. This replaces the
# previous `exec`-built assignments, which hid the variable definitions from
# readers and tooling.
sector_frames = {corp: pd.read_csv(f"data/corps/{corp}.csv") for corp in names}

# Explicit bindings for the names used downstream.
industrials = sector_frames["industrials"]
healthcare = sector_frames["healthcare"]
finance = sector_frames["finance"]
tech = sector_frames["tech"]
consumer = sector_frames["consumer"]
energy = sector_frames["energy"]