In [None]:
import pandas as pd

In [None]:
# Read in the CSVs needed to make the join. `ticker` supplies the `company`
# names and `headlines` supplies the `headline_text` column that the matching
# cells below read. Paths are relative to the notebook's working directory.
ticker = pd.read_csv('data/asx-tickers.csv')
headlines = pd.read_csv('data/abcnews-date-text.csv')

In [None]:
# Preview the first two rows of the ticker table.
ticker.head(2)

In [None]:
# Preview the first two rows of the headlines table.
headlines.head(2)

In [None]:
# Split the DataFrame into chunks and process each chunk in a separate worker process (one per CPU core).

import numpy as np
from multiprocessing import cpu_count, Pool

# CPU count reported by the system; parallelize() uses it as both the number
# of chunks and the size of the process pool.
cores = cpu_count()
 
def parallelize(data, func):
    """Apply ``func`` to consecutive chunks of ``data`` in a process pool.

    ``data`` is cut into ``cores`` consecutive chunks (``cores`` is the
    module-level worker count), each chunk is passed to ``func`` in a worker
    process, and the per-chunk results are joined with ``pd.concat`` in
    chunk order.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or array-like
        Object to split along its first axis.
    func : callable
        Called once per chunk. It is sent to the worker processes, so it must
        be picklable, and it must return something ``pd.concat`` accepts.

    Returns
    -------
    pandas object
        Concatenation of the per-chunk results.

    Raises
    ------
    Exception
        Whatever ``func`` raises in a worker is re-raised here by
        ``pool.map``; the pool is still shut down in that case.
    """
    if hasattr(data, 'iloc'):
        # Split pandas objects by position so every chunk keeps its type and
        # index. The chunk boundaries are the ones np.array_split would pick,
        # but this avoids handing the DataFrame itself to NumPy, which goes
        # through the deprecated DataFrame.swapaxes in recent pandas.
        row_groups = np.array_split(np.arange(len(data)), cores)
        data_split = [data.iloc[rows] for rows in row_groups]
    else:
        data_split = np.array_split(data, cores)

    pool = Pool(cores)
    try:
        results = pool.map(func, data_split)
    finally:
        # Always release the worker processes, even when a chunk fails.
        pool.close()
        pool.join()
    return pd.concat(results)

In [None]:
# Attempt to Fuzzy Match up to FIRSTN records

from fuzzywuzzy import process, fuzz
from tqdm import tqdm

# Register tqdm's pandas integration so `progress_apply` is available; "HALP"
# is the label shown on the progress bar.
tqdm.pandas(desc="HALP")

# Cap on how many headline records the matching attempt covers.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Return the best fuzzy match for headline ``x`` among ``choices``.

    Parameters
    ----------
    x : str
        Headline text to look up. Anything that is not a string (for example
        the NaN pandas uses for an empty CSV field) is treated as "no match".
    choices : collection of str
        Candidate company names, forwarded to ``process.extractOne``.
    scorer : callable
        Similarity function forwarded to ``process.extractOne``.
    cutoff : int
        Forwarded as ``score_cutoff``.

    Returns
    -------
    The result of ``process.extractOne`` (per fuzzywuzzy's documentation, the
    best-scoring choice with its score, or ``None`` when nothing reaches the
    cutoff), or ``None`` when ``x`` is not a string.
    """
    if not isinstance(x, str):
        # Missing headlines cannot be matched; skip them instead of crashing.
        return None
    return process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy-match every headline in one chunk against the company names.

    Reads the module-level ``ticker`` table and applies ``fuzzy_match`` with
    ``fuzz.partial_ratio`` and a score cutoff of 80 to each value of the
    chunk's ``headline_text`` column, showing a tqdm progress bar.

    The chunk is processed in full. Limiting the run to the first ``FIRSTN``
    headlines is the caller's job: a label slice such as ``df.loc[:FIRSTN]``
    inside this function would take FIRSTN + 1 rows from the first chunk and
    nothing from later chunks, so only one worker would do any matching.

    Parameters
    ----------
    df : pandas.DataFrame
        One chunk of the headlines table; must have a ``headline_text`` column.

    Returns
    -------
    pandas.Series
        One ``fuzzy_match`` result per row, indexed like ``df``.
    """
    return df['headline_text'].progress_apply(
        fuzzy_match,
        args=(
            ticker.loc[:, 'company'],
            fuzz.partial_ratio,
            80
        )
    )

# Run in parallel (take the first FIRSTN rows *before* splitting into chunks)
# test = headlines.iloc[:FIRSTN].copy()
# matching_results = parallelize(test, appfun)
# test['match'] = matching_results
# test[test['match'].notnull()]

In [None]:
# Sandbox

import nltk
# Fetch the tagger resource used by nltk.tag.pos_tag.
nltk.download('averaged_perceptron_tagger')
# Part-of-speech tags that are kept when filtering a headline.
nouns = ['NNS', 'NNP', 'NNPS']
string = "qantas urged to update security in shadow of".split()
# Tag every token, then keep only the tokens whose tag is in `nouns`.
tags = nltk.tag.pos_tag(string)
filtered = " ".join(word for word, tag in tags if tag in nouns)
actual = 'QANTAS AIRWAYS LIMITED'.lower()
print(filtered, actual)
# Cell output: similarity between the filtered headline and the company name.
fuzz.ratio(filtered, actual)

In [None]:
## Attempt to Fuzzy Match only the nouns of a sentence, up to FIRSTN records.

from fuzzywuzzy import process, fuzz
from tqdm import tqdm
import nltk
# Fetch the tagger resource used by nltk.tag.pos_tag in fuzzy_match below.
nltk.download('averaged_perceptron_tagger')

# Register tqdm's pandas integration so `progress_apply` is available; "HALP"
# is the label shown on the progress bar.
tqdm.pandas(desc="HALP")

# Part-of-speech tags that fuzzy_match keeps when filtering a headline.
# NOTE(review): the singular common-noun tag 'NN' is not in this list --
# confirm that leaving it out is intended.
nouns = ['NNS', 'NNP', 'NNPS']

# Cap on how many headline records the matching attempt covers.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Fuzzy-match only the noun-tagged words of headline ``x``.

    The headline is split on whitespace and tagged with ``nltk.tag.pos_tag``;
    the words whose tag appears in the module-level ``nouns`` list are joined
    with spaces and looked up with ``process.extractOne``.

    Parameters
    ----------
    x : str
        Headline text. Non-string values (for example the NaN pandas uses for
        an empty CSV field) and blank headlines are treated as "no match".
    choices : collection of str
        Candidate company names, forwarded to ``process.extractOne``.
    scorer : callable
        Similarity function forwarded to ``process.extractOne``.
    cutoff : int
        Forwarded as ``score_cutoff``.

    Returns
    -------
    The result of ``process.extractOne`` for the filtered text, or ``None``
    when ``x`` is not a string, is blank, or contains no word with a tag in
    ``nouns``.
    """
    if not isinstance(x, str):
        # Missing headlines cannot be tagged; skip them instead of crashing.
        return None

    tokens = x.split()
    if not tokens:
        # Nothing to tag, so the filtered text would be empty anyway.
        return None

    tagged = nltk.tag.pos_tag(tokens)
    filtered = " ".join(word for word, tag in tagged if tag in nouns)
    if not filtered:
        return None
    return process.extractOne(filtered, choices=choices, scorer=scorer, score_cutoff=cutoff)
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy-match every headline in one chunk against lower-cased company names.

    Reads the module-level ``ticker`` table, lower-cases its ``company``
    column, and applies ``fuzzy_match`` with ``fuzz.partial_ratio`` and a
    score cutoff of 65 to each value of the chunk's ``headline_text`` column,
    showing a tqdm progress bar.

    The chunk is processed in full. Limiting the run to the first ``FIRSTN``
    headlines is done by the caller before the data is split: a label slice
    such as ``df.loc[:FIRSTN]`` inside this function would take FIRSTN + 1
    rows from the first chunk and nothing from later chunks, so only one
    worker would do any matching.

    Parameters
    ----------
    df : pandas.DataFrame
        One chunk of the headlines table; must have a ``headline_text`` column.

    Returns
    -------
    pandas.Series
        One ``fuzzy_match`` result per row, indexed like ``df``.
    """
    # Lower-case the names once per chunk; null names are dropped because
    # they cannot be compared with a headline.
    companies = ticker.loc[:, 'company'].str.lower().dropna()
    return df['headline_text'].progress_apply(
        fuzzy_match,
        args=(
            companies,
            fuzz.partial_ratio,
            65
        )
    )

# Run in parallel: take the first FIRSTN rows *before* splitting into chunks,
# so the work is spread over all workers and exactly FIRSTN rows are matched.
test = headlines.iloc[:FIRSTN].copy()
matching_results = parallelize(test, appfun)
test['match'] = matching_results
test[test['match'].notnull()]

In [None]:
## Custom Approach

"""
First we want to clean up the company column to make computing percentage of matching 
characters easier:

1. lowercase the entire company column
   a. remove punctuation (except spaces).
   b. Tokenize on spaces and produce a list of each word vectorized.
   
Then we want to look at each headline, first cleaning it so it can match the same as
the companies would (no punctuation).
   
2. for each headline:
    a. remove punctuation (except spaces).
    b. tokenize on spaces and produce a list of each word vectorized.
    
    
    With each cleaned headline, we now have a list of words inside it. For each company we
    also have a list of cleaned words. So now we want to try to score each company on how
    well its words matches the headline words.
    
    c. for each company:
        i. matchedCompanies = []
        
        
        We do this under the assumption that the first word of a company name MUST be present.
        So we find each word in the headline that starts with the same letter as the company's
        first word. From there we compare each successive headline word, starting from this one,
        to the corresponding word in the company name, computing a percentage match for each pair.
        
        We also want to weigh each successive word less and less, and we want to have a scoring
        system where higher is better. This allows headlines like "qantas under fire for..."
        to score well with companies like "qantas airways limited". If Qantas is found, it should
        score super high, yet not allow the mismatch between "under" and "airways" to drag it down.
        In the event two companies start with the same first word, then the second word matching would
        help distinguish.
        
        
        ii. For each word (W) in headline:
            1. If it doesn't start with the first letter of the company's first word, skip
            2. CompanyMatchScore = 0
            3. for i,C in Company Word List: (where C is the ith word in their name list)
            4.   if W+i exists
            5.     CompanyMatchScore += percentage_matching(W+i, C) * Weight (the first word should weigh far more than later words)
            
            
            We want to normalize this CompanyMatchScore to be out of 100% when done, as we'll later need
            to compare how different company names of different lengths performed. If we don't normalize
            then names with more words will score higher than those with less.
            
            
            6. normalizedScore = CompanyMatchScore / len(CompanyWordList) # Normalizes score out of 100%.
            7. matchedCompanies.append((company, normalizedScore))
            
        
        Before moving to the next company, we'll sort the matches to make later filtering easier. We'll
        then append these matches into a new column associated with the headlines ('matches'). We can limit
        the number of matches to keep to some top N (10? 5?) to prevent filling memory with huge vectors.
        
        
        iii. Sort matchedCompanies by their normalizedScore
        iv. df.loc[headlineIndex, 'matches'] = matchedCompanies[:10]


Now that we have all our matches with their headlines, we can inspect how well the matching did and start keeping or discarding matches.
The thought is that some headlines won't have any companies associated so their match score should be low. We'll drop
matches that don't meet a threshold and retain the highest one above the threshold (for those that do meet the threshold).
This will leave some false matches behind (for example if "Bell Labs" is a company and a headline has "time to ring the bell"
in it). But once we match enough companies, we can probably keep only those companies who had more than N headlines matched 
to them for further analysis, allowing us to account for these false positives:
        

3. Ideally, the correct answer is the first element in each match. 
    a. If the first match is > some percentage threshold, keep it as the match
    b. else the headline had no matches and use NA. 
"""