In [None]:
import pandas as pd

In [None]:
# Load the two tables that the fuzzy join works on: the ticker table first,
# then the headlines table.
ticker = pd.read_csv(filepath_or_buffer='data/asx-tickers.csv')
headlines = pd.read_csv(filepath_or_buffer='data/abcnews-date-text.csv')

In [None]:
# Preview the first two rows of the ticker table.
ticker.head(2)

In [None]:
# Preview the first two rows of the headlines table.
headlines.head(2)

In [None]:
# Split the DF into chunks, and process each in a separate worker process (one per CPU core)

import numpy as np
from multiprocessing import cpu_count, Pool

# Number of worker processes (and chunks) used by parallelize().
cores = cpu_count()


def parallelize(data, func):
    """Apply `func` to `cores` row-wise chunks of `data` in worker processes.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to split. Chunks are contiguous positional slices, so every
        chunk keeps the index labels of the rows it contains. Other
        array-likes are still split with ``np.array_split``.
    func : callable
        Called once per chunk inside a worker process, so it has to be
        picklable; its return values must be accepted by ``pd.concat``.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.
    """
    if hasattr(data, 'iloc'):
        # Slice positionally instead of handing the pandas object to
        # np.array_split: that route goes through DataFrame.swapaxes, which
        # pandas has deprecated. Chunk sizes follow np.array_split: the first
        # `extra` chunks receive one additional row.
        base, extra = divmod(len(data), cores)
        bounds = [0]
        for i in range(cores):
            bounds.append(bounds[-1] + base + (1 if i < extra else 0))
        data_split = [data.iloc[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
    else:
        data_split = np.array_split(data, cores)
    # The context manager tears the pool down even when `func` raises in a
    # worker; with explicit close()/join() after map() the cleanup was skipped
    # on error and the worker processes were left behind.
    with Pool(cores) as pool:
        results = pool.map(func, data_split)
    return pd.concat(results)

In [None]:
# Attempt to Fuzzy Match up to FIRSTN records

from fuzzywuzzy import process, fuzz
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used in appfun) is
# available; `desc` is the label passed to the progress bar.
tqdm.pandas(desc="HALP")

# Upper bound on the headline row labels that appfun selects for matching.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Look up the best fuzzy match for `x` among `choices`.

    Thin wrapper around ``process.extractOne``: `scorer` is forwarded as the
    scoring function and `cutoff` as ``score_cutoff``. The result of
    ``extractOne`` is returned unchanged (presumably None when nothing reaches
    the cutoff -- the notebook later filters matches with ``notnull()``).
    """
    options = dict(choices=choices, scorer=scorer, score_cutoff=cutoff)
    best = process.extractOne(x, **options)
    return best
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy-match the headlines of one chunk against the ticker company names.

    Only rows whose index label is below ``FIRSTN`` are processed; a chunk
    without such labels yields an empty Series.

    Parameters
    ----------
    df : pandas.DataFrame
        One split of the headlines frame with a ``headline_text`` column.
        Assumes integer row labels carried over from the full frame --
        TODO confirm if the index is ever changed.

    Returns
    -------
    pandas.Series
        The ``fuzzy_match`` result for every processed row, indexed by the
        row labels of `df`.
    """
    # `.loc[:FIRSTN]` is label-based and includes the stop label, so it
    # selected FIRSTN + 1 rows; the boolean mask keeps exactly labels < FIRSTN.
    selected = df.loc[df.index < FIRSTN, 'headline_text']
    return selected.progress_apply(
        fuzzy_match,
        args=(
            ticker.loc[:, 'company'],  # choices
            fuzz.partial_ratio,        # scorer
            80,                        # score cutoff
        ),
    )

# Run in parallel
# matching_results = parallelize(headlines, appfun)
# test = headlines[:FIRSTN].copy()
# test['match'] = matching_results
# test[test['match'].notnull()]

In [None]:
# Sandbox

import nltk

# Download the 'averaged_perceptron_tagger' resource before tagging.
nltk.download('averaged_perceptron_tagger')

# Part-of-speech tags that are kept when filtering a headline.
nouns = ['NNS', 'NNP', 'NNPS']

# Split a sample headline on whitespace and tag every token.
string = "qantas urged to update security in shadow of".split()
tags = nltk.tag.pos_tag(string)

# Keep only the tokens whose tag is listed in `nouns`, joined by spaces.
filtered = " ".join(word for word, tag in tags if tag in nouns)

# Lower-cased company name that the filtered headline is compared with.
actual = 'QANTAS AIRWAYS LIMITED'.lower()

print(filtered, actual)
fuzz.ratio(filtered, actual)

In [None]:
## Attempt to Fuzzy Match only the nouns of a sentence, up to FIRSTN records.

from fuzzywuzzy import process, fuzz
from tqdm import tqdm
import nltk
# Download the 'averaged_perceptron_tagger' NLTK resource (the sandbox cell
# issues the same download).
nltk.download('averaged_perceptron_tagger')

# Register tqdm with pandas so that `progress_apply` (used in appfun) is
# available; `desc` is the label passed to the progress bar.
tqdm.pandas(desc="HALP")

# Part-of-speech tags whose words are kept by fuzzy_match. In the Penn
# Treebank tagset: NNS = plural noun, NNP = proper noun, NNPS = plural proper
# noun. NOTE(review): the singular common-noun tag 'NN' is not listed --
# confirm that this is intentional.
nouns = ['NNS', 'NNP', 'NNPS']

# Upper bound on the headline row labels that appfun selects for matching.
FIRSTN = 1000

# Fuzzy match function
def fuzzy_match(x, choices, scorer, cutoff):
    """Fuzzy-match only the noun-tagged words of a headline against `choices`.

    The headline is split on whitespace and tagged with ``nltk.tag.pos_tag``;
    the words whose tag is listed in the module-level ``nouns`` are joined
    with spaces and passed to ``process.extractOne``.

    Parameters
    ----------
    x : str
        Headline text. Any non-string value (e.g. NaN for a missing cell)
        is treated as "no match".
    choices
        Candidates forwarded to ``process.extractOne``.
    scorer
        Scoring function forwarded to ``process.extractOne``.
    cutoff
        Forwarded to ``process.extractOne`` as ``score_cutoff``.

    Returns
    -------
    None when `x` is not a string or contains no word with a tag in
    ``nouns``; otherwise whatever ``process.extractOne`` returns.
    """
    # A missing headline is not a str and has no `.split()`; bail out instead
    # of raising AttributeError in the middle of a long apply.
    if not isinstance(x, str):
        return None
    tagged = nltk.tag.pos_tag(x.split())
    # Distinct loop names so the comprehension no longer shadows parameter `x`.
    filtered = " ".join(word for word, tag in tagged if tag in nouns)
    if not filtered:
        return None
    return process.extractOne(filtered, choices=choices, scorer=scorer, score_cutoff=cutoff)
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy-match the headlines of one chunk against lower-cased company names.

    Only rows whose index label is below ``FIRSTN`` are processed; a chunk
    without such labels yields an empty Series.

    Parameters
    ----------
    df : pandas.DataFrame
        One split of the headlines frame with a ``headline_text`` column.
        Assumes integer row labels carried over from the full frame --
        TODO confirm if the index is ever changed.

    Returns
    -------
    pandas.Series
        The ``fuzzy_match`` result for every processed row, indexed by the
        row labels of `df`.
    """
    # Lower-case the company names once per chunk; they are the match choices.
    companies_lower = ticker.loc[:, 'company'].map(lambda x: x.lower())
    # `.loc[:FIRSTN]` is label-based and includes the stop label, so it
    # selected FIRSTN + 1 rows; the boolean mask keeps exactly labels < FIRSTN.
    selected = df.loc[df.index < FIRSTN, 'headline_text']
    return selected.progress_apply(
        fuzzy_match,
        args=(
            companies_lower,      # choices
            fuzz.partial_ratio,   # scorer
            65,                   # score cutoff
        ),
    )

# Run in parallel
# Take the FIRSTN-row sample first and hand only that to the pool. Splitting
# the full frame meant that, whenever it is much larger than FIRSTN, every
# selected row sat in the first chunk: one worker did all the matching while
# the remaining chunks contributed nothing, and the whole frame was still
# shipped to the workers.
test = headlines[:FIRSTN].copy()
matching_results = parallelize(test, appfun)
test['match'] = matching_results
# Show only the headlines for which a match was returned.
test[test['match'].notnull()]