# Get Tickers From Firm Names

Use prior data to try to get the tickers of the firms.

This should primarily use the credit rating data; the sector data can be used as an additional source if needed.

In [1]:
# Packages
import pandas as pd
import os
import numpy as np
from fuzzy_match import match
from fuzzy_match import algorithims

In [2]:
# Run-mode switch read by later cells: True loads the sample mentions file and
# writes the Excel spot-check exports; False loads the full mentions file.
sample_run = True

## Load Reference Data

In [3]:
# Read the two Kaggle credit rating extracts from the shared Box folder and
# print a preview of each so the available columns can be checked.
# Source locations:
#   ~\Box\STAT 222 Capstone\Raw Data\Supplementary Credit Rating Data From Kaggle\corporateCreditRatingWithFinancialRatios.csv
#   ~\Box\STAT 222 Capstone\Raw Data\Credit Rating Data From Kaggle\corporate_rating.csv

# Supplementary dataset (credit ratings with financial ratios)
ccrwfr = pd.read_csv(
    '~/Box/STAT 222 Capstone/Raw Data/Supplementary Credit Rating Data From Kaggle/corporateCreditRatingWithFinancialRatios.csv'
)
print(ccrwfr.head())

# Primary corporate rating dataset
corporate_rating = pd.read_csv(
    '~/Box/STAT 222 Capstone/Raw Data/Credit Rating Data From Kaggle/corporate_rating.csv'
)
print(corporate_rating.head())

                        Rating Agency                     Corporation Rating  \
0  Standard & Poor's Ratings Services       American States Water Co.     A-   
1  Standard & Poor's Ratings Services  Automatic Data Processing Inc.    AAA   
2  Standard & Poor's Ratings Services                      Avnet Inc.   BBB-   
3  Standard & Poor's Ratings Services    California Water Service Co.    AA-   
4  Standard & Poor's Ratings Services            Cardinal Health Inc.      A   

  Rating Date      CIK  Binary Rating  SIC Code Sector Ticker  Current Ratio  \
0  2010-07-30  1056903              1    4941.0  Utils    AWR         1.1507   
1  2010-09-16     8670              1    7374.0  BusEq    ADP         1.1129   
2  2010-11-23     8858              1    5065.0  Shops    AVT         1.9276   
3  2010-06-29  1035201              1    4941.0  Utils    CWT         0.8358   
4  2010-07-14   721371              1    5122.0  Shops    CAH         1.2931   

   ...  EBITDA Margin  Pre-Tax Profit 

In [4]:
# Build one (Ticker, Corporation) reference table from both rating datasets.
# Only rows rated by "Standard & Poor's Ratings Services" are kept; the agency
# column is 'Rating Agency' in ccrwfr and 'Rating Agency Name' in corporate_rating.

# Supplementary dataset already uses the target column names
s_and_p_ccrwfr = ccrwfr.loc[
    ccrwfr['Rating Agency'] == "Standard & Poor's Ratings Services",
    ['Ticker', 'Corporation'],
]

# Primary dataset: Symbol/Name are renamed to Ticker/Corporation
s_and_p_corporate_rating = (
    corporate_rating
    .loc[
        corporate_rating['Rating Agency Name'] == "Standard & Poor's Ratings Services",
        ['Symbol', 'Name'],
    ]
    .rename(columns={'Symbol': 'Ticker', 'Name': 'Corporation'})
)

# Stack both sources, upper-case the firm names, then drop exact duplicates
s_and_p_ticker_names = (
    pd.concat([s_and_p_ccrwfr, s_and_p_corporate_rating], axis=0)
    .assign(Corporation=lambda stacked: stacked['Corporation'].str.upper())
    .drop_duplicates()
)

## Limit to Items With Calls and Statements

In [5]:
# Collect the distinct tickers present in the fixed-quarter-date dataset and
# keep only the S&P reference names that belong to those tickers.
all_data_dir = r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# every frame derived from these files reproducible across machines and runs.
file_list = sorted(f for f in os.listdir(all_data_dir) if f.endswith('.parquet'))

# Only the 'ticker' column is read from each parquet file
firms_names_used = pd.concat(
    [pd.read_parquet(os.path.join(all_data_dir, f), columns=['ticker']) for f in file_list]
).drop_duplicates()

# Inner join with s_and_p_ticker_names (both 'ticker' and 'Ticker' are retained,
# later cells read each of them)
tickers_and_names_used = firms_names_used.merge(
    s_and_p_ticker_names, how='inner', left_on='ticker', right_on='Ticker'
)
# Drop duplicates
tickers_and_names_used = tickers_and_names_used.drop_duplicates()
tickers_and_names_used

Unnamed: 0,ticker,Ticker,Corporation
0,AAPL,AAPL,APPLE INC.
1,ABB,ABB,ABB LTD.
2,ABBV,ABBV,ABBVIE INC.
3,ABC,ABC,AMERISOURCEBERGEN CORP.
4,ABG,ABG,ASBURY AUTOMOTIVE GROUP INC.
...,...,...,...
624,YUM,YUM,YUM! BRANDS INC.
625,YUM,YUM,"YUM! BRANDS, INC."
626,ZBRA,ZBRA,ZEBRA TECHNOLOGIES CORP.
627,ZBRA,ZBRA,ZEBRA TECHNOLOGIES CORPORATION


## Clean Names

Remove legal suffixes such as INC., CORP., etc. to get names in a format similar to how they would be spoken in an earnings call.

In [6]:
# Names for Cleaning
# Print every (ticker, raw corporation name) pair so the suffixes that need
# stripping can be read off by eye.
for ticker_symbol, corporation_name in zip(
    tickers_and_names_used['Ticker'], tickers_and_names_used['Corporation']
):
    print(ticker_symbol, corporation_name)

AAPL APPLE INC.
ABB ABB LTD.
ABBV ABBVIE INC.
ABC AMERISOURCEBERGEN CORP.
ABG ASBURY AUTOMOTIVE GROUP INC.
ABG ASBURY AUTOMOTIVE GROUP INC
ABT ABBOTT LABORATORIES
ACHC ACADIA HEALTHCARE COMPANY INC.
ACHC ACADIA HEALTHCARE COMPANY, INC.
ACIW ACI WORLDWIDE INC.
ACIW ACI WORLDWIDE, INC.
ACM AECOM
ACN ACCENTURE PLC
ADI ANALOG DEVICES INC.
ADM ARCHER DANIELS MIDLAND CO.
ADP AUTOMATIC DATA PROCESSING INC.
ADSK AUTODESK, INC.
AEE AMEREN CORP.
AEP AMERICAN ELECTRIC POWER CO. INC.
AGCO AGCO CORPORATION
AIR AAR CORP.
ALB ALBEMARLE CORP.
ALE ALLETE INC.
ALGT ALLEGIANT TRAVEL COMPANY
ALK ALASKA AIR GROUP INC.
ALKS ALKERMES PLC
ALLE ALLEGION PLC
ALR ALERE INC.
ALV AUTOLIV INC.
ALV AUTOLIV, INC.
AMAG AMAG PHARMACEUTICALS, INC.
AMC AMC ENTERTAINMENT HOLDINGS, INC.
AMCX AMC NETWORKS INC.
AMGN AMGEN INC.
AMKR AMKOR TECHNOLOGY INC.
AMKR AMKOR TECHNOLOGY, INC.
AMZN AMAZON.COM INC.
APA APACHE CORP.
APD AIR PRODUCTS AND CHEMICALS INC.
APH AMPHENOL CORP.
AQN ALGONQUIN POWER & UTILITIES CORP.
AR ANTERO RESOU

In [7]:
# Cleaning
# Derive Corporation_clean from Corporation by stripping legal suffixes and
# generic descriptor words, so the names look like what is said in an earnings call.

# Literal (non-regex) substring rewrites, applied strictly in the listed order.
# Order matters: a longer phrase has to run before any shorter fragment it contains.
LITERAL_REPLACEMENTS = [
    ('GROUP HOLDING LIMITED', ''),
    # Must precede '& CO.': once '& CO.' has been removed this phrase can no longer match
    ('AG & CO. KGAA', ''),
    ('& CO.', ''),
    ('(THE)', ''),
    ('RESORT PROPERTIES FINANCE INC.', ''),
    ('OPERATING CO., LLC', ''),
    ('(FINANCE I) B.V.', ''),
    ('NOVARTIS AG', 'NOVARTIS'),
    ('COMPASS MINERALS CANADA', 'COMPASS MINERALS'),
    ('AMAZON.COM', 'AMAZON'),
    ('T-MOBILE US', 'T-MOBILE'),
    ('EUROPE S.A.R.L.', ''),
    ('SAP SE', 'SAP'),
    ('S.A.B. DE C.V.', ''),
    ('S.A. DE C.V.', ''),
    ('S.A.R.L.', ''),
    ('S.A.A.', ''),
    ('GS III', ''),
    ('CO.', ''),
    ('COS.', ''),
    ('INC.', ''),
    (' PLC', ''),
    ('CORP.', ''),
    ('LTD.', ''),
    ('COMPANY', ''),
    ('CORPORATION', ''),
    ('S.A.', ''),
    (' LP', ''),
    (' LLC', ''),
    ('INCORPORATED', ''),
    ('INTERNATIONAL', ''),
    ('INFORMATION SERVICES', ''),
    ('PJSC', ''),
    ('N.V.', ''),
    (' NV', ''),
    ('L.P.', ''),
    ('B.V.', ''),
    (' ULC', ''),
    ('GMBH', ''),
    ('GROUP', ''),
    ('HOLDINGS', ''),
    ('OPERATING', ''),
    ('PARTNERS', ''),
    ('(P.H.)', ''),
    ('IRELAND LTD', ''),
    ('JOHNSON (S.C.) & SON', 'JOHNSON & JOHNSON'),
    ('CANADA,', ''),
    ('WORLDWIDE', ''),
    ('SYSTEMS', ''),
    ('RESEARCH', ''),
    ('PRODUCTS', ''),
    ('INDUSTRIAL', ''),
    ('TECHNOLOGIES', ''),
    ('TECHNOLOGY', ''),
    ('ENERGY', ''),
    ('ELECTRONICS', ''),
    ('& MANUFACTURING', ''),
    ('INFRASTRUCTURE', ''),
    ('MOTOROLA SOLUTIONS', 'MOTOROLA'),
    ('PHARMACEUTICALS', ''),
    ('COMMUNICATIONS', ''),
    ('NETWORKS', ''),
    ('NETWORK', ''),
    ('INGREDIENTS', ''),
    ('CDK GLOBAL', 'CDK'),
]

# Regex patterns removed only when they occur at the very end of the name
TRAILING_SUFFIX_PATTERNS = [
    r'INC$',
    r'CORP$',
    r' US$',
    r'LIMITED$',
    r' CO$',
    r' CO LTD$',
]

# Raw names whose generic cleaning result is unusable; the value is used verbatim
NAME_OVERRIDES = {
    'PACKAGING CORP. OF AMERICA': 'PACKAGING CORP. OF AMERICA',
    'SERVICE CORP. INTERNATIONAL': 'SERVICE CORP.',
    'GLOBAL PARTNERS LP': 'GLOBAL PARTNERS LP',
}


def clean_corporation_names(names):
    """Strip legal suffixes and generic descriptors from corporation names.

    Parameters
    ----------
    names : pandas.Series
        Raw corporation names (this notebook upper-cases them in an earlier cell).

    Returns
    -------
    pandas.Series
        Cleaned names on the same index as ``names``.
    """
    cleaned = names
    for old, new in LITERAL_REPLACEMENTS:
        # regex=False is explicit because the default of Series.str.replace differs
        # between pandas versions; as regexes, 'CO.', '(THE)' or 'S.A.' would match
        # far more than the literal text and mangle unrelated names.
        cleaned = cleaned.str.replace(old, new, regex=False)

    # Strip leading and trailing whitespace so the end-anchored patterns can match
    cleaned = cleaned.str.strip()
    for pattern in TRAILING_SUFFIX_PATTERNS:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    # Strip whitespace, then a single trailing punctuation character
    cleaned = cleaned.str.strip()
    cleaned = cleaned.str.replace(r'[^\w\s]$', '', regex=True)
    # Removing the punctuation can expose a trailing space ("NAME ," -> "NAME "),
    # which would make otherwise identical names compare as different strings
    cleaned = cleaned.str.strip()

    # Restore the hand-picked names, keyed on the raw (uncleaned) value
    for raw_name, replacement in NAME_OVERRIDES.items():
        cleaned = cleaned.mask(names == raw_name, replacement)
    return cleaned


tickers_and_names_used_clean = tickers_and_names_used.copy()
tickers_and_names_used_clean['Corporation_clean'] = clean_corporation_names(
    tickers_and_names_used_clean['Corporation']
)

# Print ticker, raw name and cleaned name for every row to review the result
for ticker_symbol, raw_name, clean_name in zip(
    tickers_and_names_used_clean['Ticker'],
    tickers_and_names_used_clean['Corporation'],
    tickers_and_names_used_clean['Corporation_clean'],
):
    print(ticker_symbol, '-', raw_name, '-', clean_name)

AAPL - APPLE INC. - APPLE
ABB - ABB LTD. - ABB
ABBV - ABBVIE INC. - ABBVIE
ABC - AMERISOURCEBERGEN CORP. - AMERISOURCEBERGEN
ABG - ASBURY AUTOMOTIVE GROUP INC. - ASBURY AUTOMOTIVE
ABG - ASBURY AUTOMOTIVE GROUP INC - ASBURY AUTOMOTIVE
ABT - ABBOTT LABORATORIES - ABBOTT LABORATORIES
ACHC - ACADIA HEALTHCARE COMPANY INC. - ACADIA HEALTHCARE
ACHC - ACADIA HEALTHCARE COMPANY, INC. - ACADIA HEALTHCARE 
ACIW - ACI WORLDWIDE INC. - ACI
ACIW - ACI WORLDWIDE, INC. - ACI 
ACM - AECOM - AECOM
ACN - ACCENTURE PLC - ACCENTURE
ADI - ANALOG DEVICES INC. - ANALOG DEVICES
ADM - ARCHER DANIELS MIDLAND CO. - ARCHER DANIELS MIDLAND
ADP - AUTOMATIC DATA PROCESSING INC. - AUTOMATIC DATA PROCESSING
ADSK - AUTODESK, INC. - AUTODESK
AEE - AMEREN CORP. - AMEREN
AEP - AMERICAN ELECTRIC POWER CO. INC. - AMERICAN ELECTRIC POWER
AGCO - AGCO CORPORATION - AGCO
AIR - AAR CORP. - AAR
ALB - ALBEMARLE CORP. - ALBEMARLE
ALE - ALLETE INC. - ALLETE
ALGT - ALLEGIANT TRAVEL COMPANY - ALLEGIANT TRAVEL
ALK - ALASKA AIR GROUP IN

In [8]:
# Keep ticker and Corporation_clean, exposing the ticker as 'matched_ticker'.
# Several raw spellings of one firm clean to the same name, so identical
# (matched_ticker, Corporation_clean) pairs are dropped here; otherwise the later
# join on Corporation_clean would emit one copy of each mention per repeated pair.
tickers_and_names_used_clean = (
    tickers_and_names_used_clean[['ticker', 'Corporation_clean']]
    .rename(columns={'ticker': 'matched_ticker'})
    .drop_duplicates()
)

## Trigram Fuzzy Match

In [9]:
# Load sample or full company-mentions data, depending on sample_run
if sample_run:
    mentions = pd.read_parquet('../../../Data/Company_Mentions/Company_Mentions_Sample.parquet')
else:
    mentions = pd.read_parquet('../../../Data/Company_Mentions/Company_Mentions.parquet')

# Limit to final dataset used: the (ticker, earnings_call_date) pairs present in
# the NLP-features parquet files. `os` comes from the notebook's import cell.
nlp_features_dir = r'../../../Data/All_Data/All_Data_with_NLP_Features'
# os.listdir returns entries in arbitrary order; sort for a reproducible read order
file_list = sorted(f for f in os.listdir(nlp_features_dir) if f.endswith('.parquet'))
# Read only the key columns. data_used acts purely as a filter, so repeated keys
# are dropped; without this each repeat would duplicate the matching mention rows.
data_used = pd.concat(
    [
        pd.read_parquet(os.path.join(nlp_features_dir, f), columns=['ticker', 'earnings_call_date'])
        for f in file_list
    ]
).drop_duplicates()
# Merge with mentions
mentions = mentions.merge(data_used, how='inner', on=['ticker', 'earnings_call_date'])
mentions

In [10]:
# Fuzzy-match each mentioned company name against the cleaned reference names
# using the trigram algorithm.
def get_cleaned_name(company_name, choices=None):
    """Return the best trigram match for ``company_name`` and its similarity.

    No cutoff is passed to the matcher here; thresholding on the similarity is
    done in a later cell.

    Parameters
    ----------
    company_name
        Name passed as the query to ``match.extractOne``.
    choices : list, optional
        Candidate cleaned names. When omitted, every ``Corporation_clean`` value
        in ``tickers_and_names_used_clean`` is used. Callers matching many names
        should build the list once and pass it in.

    Returns
    -------
    tuple
        ``(cleaned_name, sim)`` as unpacked from the ``match.extractOne`` result.
    """
    if choices is None:
        choices = tickers_and_names_used_clean['Corporation_clean'].tolist()
    cleaned_name, sim = match.extractOne(company_name, choices, match_type='trigram')
    return cleaned_name, sim

# Apply function
# The candidate list is identical for every row, so it is built a single time
name_choices = tickers_and_names_used_clean['Corporation_clean'].tolist()
best_matches = [get_cleaned_name(name, name_choices) for name in mentions['company_mentioned']]
# Assign from lists rather than unpacking zip(*...), which raises on an empty frame
mentions['cleaned_name'] = [cleaned_name for cleaned_name, _ in best_matches]
mentions['sim'] = [sim for _, sim in best_matches]

In [None]:
# Spot-check export: on a sample run, write the first 1000 rows of the
# fuzzy-match results to an Excel workbook (row index omitted).
if sample_run:
    mentions.iloc[:1000].to_excel(
        '../../../Data/Company_Mentions/String_Match_Sample.xlsx',
        index=False,
    )

In [None]:
# Minimum trigram similarity for a fuzzy match to be accepted
SIMILARITY_CUTOFF = 0.4

# Create new column found_cleaned_name: the matched name if sim >= cutoff, else missing
mentions['found_cleaned_name'] = np.where(
    mentions['sim'] >= SIMILARITY_CUTOFF, mentions['cleaned_name'], np.nan
)

# Join on the cleaned name to attach matched_ticker; the inner join discards
# mentions whose match fell below the cutoff
mentions_with_ticker = mentions.merge(
    tickers_and_names_used_clean,
    how='inner',
    left_on='found_cleaned_name',
    right_on='Corporation_clean',
).drop(columns=['Corporation_clean'])

# If on sample, output 1000 rows to Excel
if sample_run:
    mentions_with_ticker.head(1000).to_excel('../../../Data/Company_Mentions/Company_Mentions_With_Ticker_Sample.xlsx', index=False)

# Kept as the last expression so the notebook actually renders the result
# (a bare expression in the middle of a cell is never displayed)
mentions_with_ticker