In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
from tqdm import tqdm_notebook, tqdm
import nltk
from pathlib import Path
import string

In [2]:
# Import data and merge CSVs
ticker = pd.read_csv('../data/asx-tickers.csv')

# The per-period scrapes are merged once and cached on disk; later runs just
# read the cached file back.
combined_headlines = Path('../data/scraped/combined_headlines.csv')
if not combined_headlines.is_file():
    scraped_parts = [
        '../data/scraped/2006_2010.csv',
        '../data/scraped/2010_2010.csv',
        '../data/scraped/2010_2016.csv',
        '../data/scraped/2016_2017.csv',
    ]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Row order and index handling are the same as the old append call.
    headlines = pd.concat([pd.read_csv(part) for part in scraped_parts])
    headlines.to_csv(combined_headlines)

# Always reload from disk so both code paths yield an identically shaped frame.
headlines = pd.read_csv(combined_headlines)

In [3]:
# Clean up the headlines (lightly) - drop excess characters and duplicates
# Strip surrounding quotes/spaces first and de-duplicate on the stripped text,
# so titles that differ only by that padding collapse into a single record.
# (Previously the stripped series was computed but never used.)
stripped = headlines.titles.str.strip('"\' ')
headlines_clean = (
    headlines
    .assign(titles=stripped)
    .drop_duplicates(subset='titles', keep='first')
)

In [4]:
# Drop records that don't have any company words
# Translation table mapping every ASCII punctuation character to None (deleted)
translator = str.maketrans(dict.fromkeys(string.punctuation))

def normalize(s):
    """Lower-case `s`, delete punctuation and return its whitespace-separated words."""
    without_punctuation = s.lower().translate(translator)
    return without_punctuation.split()

def incompwords(s):
    """Return True when `s` shares at least one normalized word with `compwords`.

    `compwords` is the module-level set of words built from the company names.
    """
    words_in_headline = set(normalize(s))
    return not words_in_headline.isdisjoint(compwords)

# Vocabulary: every normalized word that occurs in any company name
all_company_text = " ".join(ticker['company'].values)
compwords = set(normalize(all_company_text))

# Keep only the headlines that share at least one word with that vocabulary
has_company_word = headlines_clean.titles.apply(incompwords)
headlines_filtered = headlines_clean[has_company_word]

print("Headlines with company names: ",headlines_filtered.shape[0])
headlines_filtered.head(3)

Headlines with company names:  592608


Unnamed: 0.1,Unnamed: 0,date,titles
0,0,2006-01-01,Russia completes Ukraine gas cut-off
1,1,2006-01-01,High winds cause havoc
3,3,2006-01-01,Russia takes over G8


In [5]:
# Identify the Named Entities in each headline, associating them to their own
# column within the dataframe. Save this as CSV to prevent later processing needs.
# https://stackoverflow.com/questions/36255291/extract-city-names-from-text-using-python

# !!! This will only run if headline_entity_id.csv isn't in the data folder

# NLTK resources used by identify_entities() below (nltk is imported in the
# first cell):
#   punkt                       - nltk.sent_tokenize / nltk.word_tokenize
#   averaged_perceptron_tagger  - nltk.pos_tag (was missing; without it a
#                                 clean machine raises LookupError)
#   maxent_ne_chunker, words    - nltk.ne_chunk_sents
# NOTE(review): recent NLTK releases may use differently named resources
# (e.g. with a `_tab` / `_eng` suffix) - confirm against the installed version.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

def extract_entity_names(t):
    """Collect the text of every 'NE'-labelled subtree found under `t`.

    `t` is either a tree node (an iterable exposing a `label()` method) or a
    leaf. A node labelled 'NE' contributes one string: the first item of each
    of its children joined by spaces. Any other node is searched recursively.
    Objects without a `label` attribute contribute nothing.

    Returns a list of entity strings in traversal order.
    """
    if not getattr(t, 'label', None):
        # Leaves (e.g. (word, tag) tuples) carry no label.
        return []

    if t.label() == 'NE':
        return [' '.join(child[0] for child in t)]

    found = []
    for subtree in t:
        found.extend(extract_entity_names(subtree))
    return found

def identify_entities():
    """Tag every title in `headlines_filtered` with its named entities and save to CSV.

    Each title is sentence-split, word-tokenized, POS-tagged and chunked with
    NLTK's binary named-entity chunker. The entity strings found in a title
    are joined with ", " and stored in a new 'entities' column on a copy of
    `headlines_filtered`, which is written to ../data/headline_entity_id.csv.
    Returns nothing.
    """
    titles = headlines_filtered.titles.tolist()
    entity_column = []
    for title in tqdm(titles, total=len(titles), unit="headlines"):
        # Tokenize and POS-tag each sentence of the headline
        tagged = [
            nltk.pos_tag(nltk.word_tokenize(sentence))
            for sentence in nltk.sent_tokenize(title)
        ]
        # Flatten the entity names of all chunked sentences into one list
        found = [
            name
            for tree in nltk.ne_chunk_sents(tagged, binary=True)
            for name in extract_entity_names(tree)
        ]
        entity_column.append(", ".join(found))

    # Assign by position (.values) so the filtered frame's index is ignored
    annotated = headlines_filtered.copy()
    annotated['entities'] = pd.Series(entity_column).values
    annotated.to_csv("../data/headline_entity_id.csv")
    
# The entity CSV doubles as a cache: it is only rebuilt when it is missing.
entitymatch = Path('../data/headline_entity_id.csv')
if entitymatch.is_file():
    print("Entities already generated. Delete CSV to rebuild (if desired).")
else:
    # Build the entity matches
    identify_entities()

# Either way, work from the on-disk copy
headline_entities = pd.read_csv("../data/headline_entity_id.csv")
headline_entities.head(3)

[nltk_data] Downloading package punkt to /Users/Koss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/Koss/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/Koss/nltk_data...
[nltk_data]   Package words is already up-to-date!
Entities already generated. Delete CSV to rebuild (if desired).


Unnamed: 0.1,Unnamed: 0,date,titles,entities
0,0,2006-01-01,Russia completes Ukraine gas cut-off,"Russia, Ukraine"
1,1,2006-01-01,High winds cause havoc,
2,3,2006-01-01,Russia takes over G8,Russia


In [6]:
# Parallelization code: splits the data evenly amongst worker processes
from multiprocessing import cpu_count, Pool

# Number of worker processes (and data chunks) used by parallelize()
cores = cpu_count()

def parallelize(data, func):
    """Apply `func` to `cores` row-wise chunks of `data` in parallel.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to split by row position into `cores` contiguous chunks. The
        first ``len(data) % cores`` chunks hold one extra row.
    func : callable
        Called once per chunk in a worker process. It must be picklable and
        return a pandas object that `pd.concat` accepts.

    Returns
    -------
    The per-chunk results concatenated in chunk order.
    """
    # Slice with .iloc rather than np.array_split: the latter goes through
    # DataFrame.swapaxes, which pandas has deprecated.
    base, extra = divmod(len(data), cores)
    chunks = []
    start = 0
    for i in range(cores):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(data.iloc[start:stop])
        start = stop

    pool = Pool(cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the workers, even when func raises inside map().
        pool.close()
        pool.join()
    return pd.concat(results)

In [11]:
# Attempt to fuzzy match
# Upper row label that appfun passes to .loc when selecting entities to match
FIRSTN = 15000
# Register tqdm's `progress_apply` on pandas objects, labelled "Fuzzy Match"
tqdm_notebook().pandas(desc="Fuzzy Match")

def fuzzy_match(x, choices, scorer, cutoff):
    """Return the best fuzzy match for `x` among `choices`, or None.

    Empty strings and values that are not exactly of type `str` (such as the
    NaN of a missing cell) are skipped and yield None. Otherwise the result of
    `process.extractOne` is returned, called with `scorer` and with `cutoff`
    passed as `score_cutoff`.
    """
    unusable = x == '' or type(x) is not str
    if unusable:
        return None
    best = process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    return best
    

# Parallelization function, to run on each split of data
def appfun(df):
    """Fuzzy match the 'entities' of `df` (row labels up to FIRSTN) to company names.

    Company names from `ticker` are lower-cased and used as the choices. Each
    entity is scored with `fuzz.ratio` and a cutoff of 90. Returns the Series
    produced by `progress_apply`, one `fuzzy_match` result per selected row.
    """
    company_names = ticker.loc[:, 'company'].map(lambda name: name.lower())
    # .loc slicing is label-based and includes the end label FIRSTN
    entities = df.loc[:FIRSTN, 'entities']
    return entities.progress_apply(
        fuzzy_match,
        args=(company_names, fuzz.ratio, 90),
    )

# Run in parallel (pick one)
# Slice to the first FIRSTN rows *before* handing the frame to parallelize().
# appfun only keeps row labels <= FIRSTN, so splitting the full frame left
# nearly every worker with an empty slice while one worker did all the work.
matching_results = parallelize(headline_entities[:FIRSTN], appfun)

# Run in thread (pick one)
# matching_results = appfun(headline_entities)

# Compare
# Assigning the Series aligns on the index, so each match lands on its own row.
test = headline_entities[:FIRSTN].copy()
test['match'] = matching_results
test[test['match'].notnull()]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))








Unnamed: 0.1,Unnamed: 0,date,titles,entities,match
514,871,2006-01-05,Medical ID system aims to save lives,Medical ID,"(im medical ltd, 75, 1002)"
1297,2203,2006-01-11,Security tight for inaugural climate talks,Security,"(test security, 76, 1964)"
2702,4602,2006-01-23,Stockland backs Sunday trading,Stockland,"(stockland, 100, 1812)"
2917,4928,2006-01-24,Sydney Harbour commercial fishing halted,Sydney Harbour,"(sydney airport, 79, 1937)"
3401,5707,2006-01-27,Sydney Harbour fish ban puts spotlight on imp...,Sydney Harbour,"(sydney airport, 79, 1937)"
4444,7427,2006-02-05,Security measures stepped up for Games,Security,"(test security, 76, 1964)"
5728,9519,2006-02-15,BHP Billiton announces record half-year profit,BHP Billiton,"(bhp billiton limited, 75, 314)"
5785,9604,2006-02-15,BHP Billiton posts giant record,BHP Billiton,"(bhp billiton limited, 75, 314)"
5865,9722,2006-02-15,BHP Billiton posts record interim profit,BHP Billiton,"(bhp billiton limited, 75, 314)"
5896,9769,2006-02-16,Industrial action to halt QR trains,Industrial,"(industria reit, 75, 968)"
