In [25]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
from tqdm import tqdm_notebook, tqdm
# import nltk

In [26]:
# Load the two inputs of the join: the ASX ticker/company table and the ABC headlines.
ticker = pd.read_csv(filepath_or_buffer='data/asx-tickers.csv')
headlines = pd.read_csv(filepath_or_buffer='data/abcnews-date-text.csv')

In [27]:
# Preview the first two ticker rows to confirm which columns were loaded.
ticker.head(2)

Unnamed: 0,ticker,company,industry
0,1AD,ADALTA LIMITED,"Pharmaceuticals, Biotechnology & Life Sciences"
1,1AG,ALTERRA LIMITED,Commercial & Professional Services


In [28]:
# Preview the first two headline rows to confirm which columns were loaded.
headlines.head(2)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation


In [29]:
# Clear out records that don't have any company word mentions
import string
# Translation table that deletes every ASCII punctuation character.
translator = {ord(mark): None for mark in string.punctuation}

def normalize(s):
    """Lower-case `s`, strip ASCII punctuation and split the rest on whitespace.

    Returns the list of remaining word tokens (an empty list for blank input).
    """
    lowered = s.lower()
    without_punctuation = lowered.translate(translator)
    return without_punctuation.split()

def incompwords(s):
    """Return True when headline `s` shares at least one normalized word with `compwords`.

    `compwords` is the module-level set of every word found in any company name, so
    generic name words (e.g. "limited") count as a hit as well.
    """
    return not compwords.isdisjoint(normalize(s))

# Vocabulary of every distinct normalized word that occurs in any company name.
all_company_names = " ".join(ticker['company'].values)
compwords = set(normalize(all_company_names))

# Keep only the headlines that contain at least one word of that vocabulary.
mentions_company_word = headlines['headline_text'].apply(incompwords)
headlines_filtered = headlines[mentions_company_word]

In [32]:
# Parallelization Code: Will split data evenly amongst worker processes
from multiprocessing import cpu_count, Pool

cores = cpu_count()


def parallelize(data, func, n_workers=None):
    """Apply `func` to contiguous row chunks of `data` and concatenate the results.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or any object sliceable by position
        Split into `n_workers` contiguous chunks whose lengths differ by at most
        one row (the same partition `np.array_split` produces).
    func : callable
        Called once per chunk and must return a pandas object. It has to be
        picklable whenever more than one worker is used.
    n_workers : int, optional
        Number of chunks / worker processes. Defaults to `cores`.

    Returns
    -------
    The `pd.concat` of the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If `n_workers` is smaller than 1.
    """
    workers = cores if n_workers is None else int(n_workers)
    if workers < 1:
        raise ValueError("n_workers must be a positive integer")

    # Slice by position instead of calling np.array_split on a DataFrame, which is
    # routed through the deprecated DataFrame.swapaxes.
    rows = data.iloc if hasattr(data, "iloc") else data
    base, extra = divmod(len(data), workers)
    chunks, start = [], 0
    for i in range(workers):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(rows[start:stop])
        start = stop

    if workers == 1:
        # A single worker gains nothing from a subprocess; running in-process also
        # keeps tracebacks readable and allows non-picklable callables.
        return pd.concat([func(chunk) for chunk in chunks])

    pool = Pool(workers)
    try:
        results = pool.map(func, chunks)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when `func` fails or the run is
        # interrupted.
        pool.terminate()
        raise
    finally:
        pool.join()
    return pd.concat(results)

In [None]:
# Attempt to Fuzzy Match up to FIRSTN records
# Register tqdm's `progress_apply` on pandas objects; `desc` labels the progress bar.
tqdm.pandas(desc="Raw Fuzzy Match")

# Upper bound used by `appfun` when it slices the headlines with `.loc[:FIRSTN]`.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Look up the best fuzzy match for `x` among `choices`.

    Thin wrapper that forwards `scorer` and `cutoff` (as `score_cutoff`) to
    fuzzywuzzy's `process.extractOne` and returns its result unchanged.
    """
    best = process.extractOne(x, choices, scorer=scorer, score_cutoff=cutoff)
    return best
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy-match the headlines of one chunk against the raw company names.

    Uses `fuzz.partial_ratio` with a score cutoff of 80 and shows a tqdm progress bar.

    NOTE(review): `.loc[:FIRSTN]` slices by index *label* and includes the end label,
    so a chunk whose labels are all greater than FIRSTN contributes no rows.
    Confirm that this is the intended way to cap the run.
    """
    headlines_subset = df.loc[:FIRSTN, 'headline_text']
    match_args = (ticker.loc[:, 'company'], fuzz.partial_ratio, 80)
    return headlines_subset.progress_apply(fuzzy_match, args=match_args)

# Run in parallel
# matching_results = parallelize(headlines, appfun)
# test = headlines[:FIRSTN].copy()
# test['match'] = matching_results
# test[test['match'].notnull()]

In [None]:
# Sandbox: check how well a headline reduced to its tagged nouns matches a company name.
import nltk  # needed here; the notebook's top-level `import nltk` is commented out

nltk.download('averaged_perceptron_tagger')
nouns = ['NNS', 'NNP', 'NNPS']
# Named `sample_tokens` rather than `string` so the stdlib `string` module imported
# earlier in the notebook is not shadowed.
sample_tokens = "qantas urged to update security in shadow of".split()
tags = nltk.tag.pos_tag(sample_tokens)
# Keep only the tokens whose part-of-speech tag is one of the noun tags above.
filtered = " ".join([token for token, tag in tags if tag in nouns])
actual = 'QANTAS AIRWAYS LIMITED'.lower()
print(filtered, actual)
fuzz.ratio(filtered, actual)

In [None]:
## Attempt to Fuzzy Match only the nouns of a sentence, up to FIRSTN records.
import nltk  # needed below; the notebook's top-level `import nltk` is commented out
from fuzzywuzzy import process, fuzz

# Fetch the tagger model that `nltk.tag.pos_tag` relies on.
nltk.download('averaged_perceptron_tagger')

# Register tqdm's `progress_apply` on pandas objects; `desc` labels the progress bar.
tqdm.pandas(desc="Fuzzy with only Nouns")

# Penn Treebank tags kept by `fuzzy_match`: plural nouns and proper nouns.
# NOTE(review): singular common nouns ('NN') are not in the list -- confirm intended.
nouns = ['NNS', 'NNP', 'NNPS']

# Upper bound used by `appfun` when it slices the headlines with `.loc[:FIRSTN]`.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Fuzzy-match only the noun tokens of `x` against `choices`.

    `x` is split on whitespace and POS-tagged; the tokens whose tag appears in the
    module-level `nouns` list are re-joined with spaces and passed to
    `process.extractOne`. Returns None when no token survives the filter.
    """
    tagged_tokens = nltk.tag.pos_tag(x.split())
    noun_tokens = [token for token, tag in tagged_tokens if tag in nouns]
    noun_text = " ".join(noun_tokens)
    if not noun_text:
        return None
    return process.extractOne(noun_text, choices=choices, scorer=scorer, score_cutoff=cutoff)
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Noun-only fuzzy match of one chunk's headlines against lower-cased company names.

    Uses `fuzz.partial_ratio` with a score cutoff of 65 and shows a tqdm progress bar.
    The company names are lower-cased on every call.

    NOTE(review): `.loc[:FIRSTN]` slices by index *label* and includes the end label,
    so a chunk whose labels are all greater than FIRSTN contributes no rows.
    """
    headlines_subset = df.loc[:FIRSTN, 'headline_text']
    lowercase_companies = ticker.loc[:, 'company'].map(lambda name: name.lower())
    return headlines_subset.progress_apply(
        fuzzy_match,
        args=(lowercase_companies, fuzz.partial_ratio, 65),
    )

# Run in parallel
# matching_results = parallelize(headlines, appfun)
# test = headlines[:FIRSTN].copy()
# test['match'] = matching_results
# test[test['match'].notnull()]

In [None]:
## Custom Approach
# Work on copies so the frames loaded from disk stay untouched.
tick_df = ticker.copy()
# Take the 1000-row sample first and copy only that: this avoids duplicating the whole
# headlines frame and gives `head_df` its own data, so the column assignment below
# does not write into a slice of another frame.
head_df = headlines.head(1000).copy()
head_df['matches'] = None
# `astype` returns a new frame; the result must be assigned for the cast to take effect.
head_df = head_df.astype({'matches': 'object'})

print("Vectorizing...")
# 1. Vectorize companies and headlines in their stripped form.
def vectorize(s):
    """Turn `s` into a list holding one NumPy character array per normalized word."""
    word_arrays = []
    for word in normalize(s):
        # list(word) splits the token into single characters.
        word_arrays.append(np.array(list(word)))
    return word_arrays
tick_df['comp_vec'] = tick_df['company'].apply(vectorize)
head_df['head_vec'] = head_df['headline_text'].apply(vectorize)

def percentage_matching(W, C):
    """Fraction of aligned positions at which the character arrays `W` and `C` agree.

    The two arrays are compared element-wise over the length of the shorter one, and
    the number of equal positions is divided by the length of the longer one. The
    result is 1.0 only when both arrays have the same length and identical content.
    """
    overlap = min(len(W), len(C))
    longest = max(len(W), len(C))
    equal_positions = np.sum(W[:overlap] == C[:overlap])
    return equal_positions / longest

# 2. Headline operation
MIN_SCORE = 0.3
def findMatches(head_record):
    """Score every company in `tick_df` against one headline; return the ten best.

    Parameters
    ----------
    head_record : row with a 'head_vec' entry
        'head_vec' is the headline as a list of per-word character arrays.

    Returns
    -------
    list of (company, score) tuples sorted by descending score, at most ten long.
    A company can appear more than once when several headline positions anchor it.

    Scoring: every headline word that starts with the first letter of the company's
    first word is an anchor. From the anchor, headline word `i` is compared with
    company word `i`; only exact matches (`percentage_matching == 1.0`) contribute,
    weighted by 1/(i+1). The sum is divided by the number of company words and kept
    when it reaches `MIN_SCORE`.

    Idea for a faster variant: convert headline and company name to sets, intersect
    them, look up each common word's index in the company name and use it as weight.
    """
    matchedCompanies = []
    headWordList = head_record['head_vec']
    headLen = len(headWordList)

    # Iterate over the two columns directly instead of DataFrame.iterrows(): same
    # values, but no Series has to be built per company for every headline.
    for company, companyWordList in zip(tick_df['company'], tick_df['comp_vec']):
        if not companyWordList:
            # A name without any word token has no first letter to anchor on.
            continue
        firstLetter = companyWordList[0][0]

        for wIdx in range(headLen):
            if firstLetter != headWordList[wIdx][0]:
                continue

            companyMatchScore = 0
            for cIdx, C in enumerate(companyWordList):
                if wIdx + cIdx >= headLen:
                    # Ran past the end of the headline.
                    break
                weight = 1 / (cIdx + 1)
                W = headWordList[wIdx + cIdx]
                matches = percentage_matching(W, C)
                if matches == 1.0:
                    companyMatchScore += (matches * weight)

            normalizedScore = companyMatchScore / len(companyWordList)
            if normalizedScore >= MIN_SCORE:
                matchedCompanies.append((company, normalizedScore))

    matchedCompanies.sort(key=lambda x: x[1], reverse=True)
    return matchedCompanies[:10]

# Uncomment for parallel and comment the bottom part
# tqdm.pandas(desc="HALP")
# def appfun(df):
#     print('GO')
#     return df.progress_apply(findMatches, axis=1)

# print("Looping...")
# parallelize(head_df, appfun)

# tqdm_notebook().pandas("matching")
# head_df['matches'] = head_df.progress_apply(findMatches, axis=1)
# head_df

In [None]:
## Set Intersection Approach for faster computation.
tick_df = ticker.copy()
head_df = headlines_filtered.copy()
head_df['matches'] = None
# `astype` returns a new frame; the result must be assigned for the cast to take effect.
head_df = head_df.astype({'matches': 'object'})

print("Vectorizing...")
# 1. Tokenize companies and headlines once, then derive the word sets from the token
#    lists instead of normalizing every string a second time.
company_tokens = tick_df['company'].apply(normalize)
headline_tokens = head_df['headline_text'].apply(normalize)
tick_df['comp_set'] = company_tokens.apply(set)
head_df['head_set'] = headline_tokens.apply(set)
tick_df['comp_list'] = company_tokens
head_df['head_list'] = headline_tokens

# 2. Headline operation
MIN_SCORE = 0.3
def findMatches(head_record):
    """Score every company in `tick_df` against one headline; return the ten best.

    Parameters
    ----------
    head_record : row with a 'head_set' entry
        'head_set' is the set of normalized words of the headline.

    Returns
    -------
    list of (company, score) tuples sorted by descending score, at most ten long.

    Scoring: the headline and company word sets are intersected. Each shared word
    adds 1/(p+1), where p is the index of its first occurrence in the company's
    word list, so earlier name words weigh more. The sum is divided by the number of
    distinct company words and kept when it reaches `MIN_SCORE`.
    """
    matchedCompanies = []
    headWords = head_record['head_set']

    # Iterate over the columns directly instead of DataFrame.iterrows(): same values,
    # but no Series has to be built per company for every headline.
    for company, companyWords, companyWordOrder in zip(
        tick_df['company'], tick_df['comp_set'], tick_df['comp_list']
    ):
        companyMatchScore = 0
        for word in headWords & companyWords:
            idx = companyWordOrder.index(word)
            companyMatchScore += (1 / (idx + 1))

        # A name without any word token scores 0 instead of dividing by zero.
        if companyWords:
            normalizedScore = companyMatchScore / len(companyWords)
        else:
            normalizedScore = 0.0

        if normalizedScore >= MIN_SCORE:
            matchedCompanies.append((company, normalizedScore))

    matchedCompanies.sort(key=lambda x: x[1], reverse=True)
    return matchedCompanies[:10]

# Parallel run; comment this part out and uncomment the bottom part for a serial run.
tqdm.pandas(desc="HALP")
def appfun(df):
    """Run `findMatches` on every row of one chunk, showing a tqdm progress bar."""
    print('GO')
    return df.progress_apply(findMatches, axis=1)

print("Looping...")
# Store the per-headline match lists in the `matches` column prepared above instead of
# leaving the result of this long-running call unassigned.
head_df['matches'] = parallelize(head_df, appfun)

# tqdm_notebook().pandas("matching")
# head_df['matches'] = head_df.progress_apply(findMatches, axis=1)
# head_df

Vectorizing...

Looping...
GO


HALP:   0%|          | 29/187597 [00:10<20:55:13,  2.49it/s]

GO


HALP:   0%|          | 7/187596 [00:02<21:44:08,  2.40it/s]]

GO


HALP:   0%|          | 37/187596 [00:14<18:08:28,  2.87it/s]

GO


HALP:   0%|          | 40/187596 [00:15<17:13:04,  3.03it/s]

In [None]:
## Custom Approach

"""
First we want to clean up the company column to make computing percentage of matching 
characters easier:

1. lowercase the entire company column
   a. remove punctuation (except spaces).
   b. Tokenize on spaces and produce a list of each word vectorized.
   
Then we want to look at each headline, first cleaning it so it can match the same as
the companies would (no punctuation).
   
2. for each headline:
    a. remove punctuation (except spaces).
    b. tokenize on spaces and produce a list of each word vectorized.
    
    
    With each cleaned headline, we now have a list of words inside it. For each company we
    also have a list of cleaned words. So now we want to try to score each company on how
    well its words matches the headline words.
    
    c. for each company:
        i. matchedCompanies = []
        
        
        We do this under the assumption that the first word of a company MUST be present. So we
        find each word in the headline that starts with the company's first letter. From there
        we compare each word starting from this word in the headline to each word in the
        company. We do a percentage matching.
        
        We also want to weigh each successive word less and less, and we want to have a scoring
        system where higher is better. This allows headlines like "qantas under fire for..."
        to score well with companies like "qantas airways limited". If Qantas is found, it should
        score super high, yet not allow the mismatch between "under" and "airlines" to drag it down.
        In the event two companies start with the same first word, then the second word matching would
        help distinguish.
        
        
        ii. For each word (W) in headline:
            1. If it doesn't start with the first letter of the company's first word, skip
            2. CompanyMatchScore = 0
            3. for i,C in Company Word List: (where C is the ith word in their name list)
            4.   if W+i exists
            5.     CompanyMatchScore += percentage_matching(W+i, C) * Weight (the first word should weight far more than latter words)
            
            
            We want to normalize this CompanyMatchScore to be out of 100% when done, as we'll later need
            to compare how different company names of different lengths performed. If we don't normalize
            then names with more words will score higher than those with less.
            
            
            6. normalizedScore = companyMatchScore / len(CompanyWordList) # Normalizes score out of 100%.
            7. matchedCompanies.append((company, normalizedScore))
            
        
        Before moving to the next company, we'll sort the matches to make later filtering easier. We'll
        then append these matches into a new column associated with the headlines ('matches'). We can limit
        the number of matches to keep to some top N (10? 5?) to prevent filling memory with huge vectors.
        
        
        iii. Sort matchedCompanies by their normalizedScore
        iv. df.loc[headlineIndex, 'matches'] = matchedCompanies[:10].


Now we have all our matches with their headlines, we can inspect how well it did and start keeping or discarding matches.
The thought is that some headlines won't have any companies associated so their match score should be low. We'll drop
matches that don't meet a threshold and retain the highest one above the threshold (for those that do meet the threshold).
This will leave some false matches behind (for example if "Bell Labs" is a company and a headline has "time to ring the bell"
in it). But once we match enough companies, we can probably keep only those companies who had more than N headlines matched 
to them for further analysis, allowing us to account for these false positives:
        

3. Ideally, the correct answer is the first element in each match. 
    a. If the first match is > some percentage threshold, keep it as the match
    b. else the headline had no matches and use NA. 
"""