# Company Name Cleaning Example
This notebook demonstrates how to use the `clean_firm_names` function from the `company_name_cleaner.py` script to standardize firm names in your dataset.

In [None]:
# Import the function from the script file
from company_name_cleaner import clean_firm_names

# Import pandas for data handling
import pandas as pd

## Sample Data
Let's create some sample data, containing several variants of the same company names, to demonstrate the cleaning process.

In [None]:
# Sample firm names: each group below lists differently written variants
# (casing, punctuation, suffixes, abbreviations) that refer to one company.
data = dict(
    firm_name=[
        # Microsoft variants
        'Microsoft Corporation', 'MICROSOFT CORP.', 'Microsoft Inc', 'MicroSoft',
        # Apple variants
        'Apple Inc.', 'Appl Inc', 'APPLE',
        # Google / Alphabet variants
        'Google LLC', 'Alphabet Inc (Google)', 'Google',
        # Amazon variants
        'Amazon.com, Inc.', 'amazon', 'AMZN',
        # IBM variants
        'International Business Machines', 'IBM Corporation', 'I.B.M.',
    ]
)

# Build a one-column DataFrame holding the raw names
df = pd.DataFrame(data)

# Show the untouched input so it can be compared with the cleaned output
print("Original DataFrame:")
df

## Clean the Firm Names
Now let's use our cleaning function to standardize the firm names.

In [None]:
# Run the cleaner without overriding any options.
# NOTE(review): the default threshold is stated to be 80 — confirm against
# company_name_cleaner.py.
cleaned_df = clean_firm_names(df)

# Show the three name columns side by side for comparison
print("Cleaned DataFrame:")
cleaned_df[[
    'original_firm_name',
    'firm_name',
    'standardized_firm_name',
]]

## Adjusting the Threshold
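We can adjust the similarity threshold to control how aggressively names are merged: a higher threshold gives more conservative matching, while a lower threshold gives more aggressive matching.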

In [None]:
# Raise the threshold to 90 so that matching is more conservative
cleaned_df_strict = clean_firm_names(df, threshold=90)

# Show the three name columns side by side for comparison
print("Cleaned DataFrame with higher threshold (90):")
cleaned_df_strict[[
    'original_firm_name',
    'firm_name',
    'standardized_firm_name',
]]

In [None]:
# Lower the threshold to 70 so that matching is more aggressive
cleaned_df_loose = clean_firm_names(df, threshold=70)

# Show the three name columns side by side for comparison
print("Cleaned DataFrame with lower threshold (70):")
cleaned_df_loose[[
    'original_firm_name',
    'firm_name',
    'standardized_firm_name',
]]

## Using With Your Own Data
You can use this with your own dataset by loading it from a CSV file. Uncomment the lines in the cell below and replace the file and column names with your own.

In [None]:
# Template for running the cleaner on your own CSV file.
# Uncomment the lines below and replace the file names and the column name
# with your own.
# your_df = pd.read_csv('your_data.csv')
#
# # If your column has a different name, specify it
# cleaned_your_df = clean_firm_names(your_df, column_name='your_column_name')
#
# # Save the results (index=False leaves the DataFrame index out of the file)
# cleaned_your_df.to_csv('cleaned_data.csv', index=False)