In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
from tqdm import tqdm_notebook, tqdm
import nltk
from pathlib import Path
import string

In [None]:
# Import data and merge CSVs
ticker = pd.read_csv('../data/asx-tickers.csv')
combined_headlines = Path('../data/scraped/combined_headlines.csv')
if not combined_headlines.is_file():
    head_1 = pd.read_csv('../data/scraped/2006_2010.csv')
    head_2 = pd.read_csv('../data/scraped/2010_2010.csv')
    head_3 = pd.read_csv('../data/scraped/2010_2016.csv')
    head_4 = pd.read_csv('../data/scraped/2016_2017.csv')
    headlines = head_1.append([head_2, head_3, head_4])
    headlines.to_csv('../data/scraped/combined_headlines.csv')

headlines = pd.read_csv('../data/scraped/combined_headlines.csv')

In [None]:
# Clean up the headlines (lightly) - drop excess characters and duplicates
stripped = headlines.titles.apply(lambda x: x.strip('"\' ' ))
headlines_clean = headlines[~headlines.titles.duplicated(keep='first')]

In [None]:
# Drop records that don't have any company words
translator = str.maketrans('','',string.punctuation)

def normalize(s):
    return s.lower().translate(translator).split()

def incompwords(s):
    headwords = set(normalize(s))
    return len(headwords & compwords) > 0

compwords = set(normalize((" ".join(ticker['company'].values))))
headlines_filtered = headlines_clean[headlines_clean.titles.apply(incompwords)]

print("Headlines with company names: ",headlines_filtered.shape[0])
headlines_filtered.head(3)

In [None]:
# Identify the Named Entities in each headline, associating them to their own
# column within the dataframe. Save this as CSV to prevent later processing needs.
# https://stackoverflow.com/questions/36255291/extract-city-names-from-text-using-python

# !!! This will only run if headline_entity_id.csv isn't in the data folder

nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names

def identify_entities():
    f = headlines_filtered.titles.tolist()
    matches = []
    for line in tqdm(f, total=len(f), unit="headlines"):
        sentences = nltk.sent_tokenize(line)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        entities = []
        for tree in chunked_sentences:
            entities.extend(extract_entity_names(tree))
            
        if len(entities) == 0:
            matches.append(np.nan)
        else:
            matches.append(", ".join(entities))

    series = pd.Series(matches)
    out = headlines_filtered.copy()
    out['entities'] = series.values
    out = out.dropna()
    out.to_csv("../data/headline_entity_id.csv")
    
entitymatch = Path('../data/headline_entity_id.csv')
if not entitymatch.is_file():
    # Build the entity matches
    identify_entities()
else:
    print("Entities already generated. Delete CSV to rebuild (if desired).")  
headline_entities = pd.read_csv("../data/headline_entity_id.csv")

In [None]:
# Parallelization Code: Will split data evenly amoungst threads
from multiprocessing import cpu_count, Pool

cores = cpu_count()
 
def parallelize(data, func):
    data_split = np.array_split(data, cores)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [None]:
# Attempt to fuzzy match
tqdm().pandas(desc="Fuzzy Match")
FIRSTN = 15000

def fuzzy_match(x, choices, scorer, cutoff):
    if x == '' or type(x) != str:
        return
    return process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    

# Parallelization function, to run on each split of data
def appfun(df):
    return df.loc[:FIRSTN, 'entities'].progress_apply(
        fuzzy_match,
        args=(
            ticker.loc[:, 'company'].map(lambda x: x.title()), 
            fuzz.ratio,
            70
        )
    )

# Run in parallel (pick one)
# matching_results = parallelize(headline_entities, appfun)

# Run in thread (pick one)
# matching_results = appfun(headline_entities)

# Compare
# test = headline_entities[:FIRSTN].copy()
# test['match'] = matching_results
# test[test['match'].notnull()]

In [None]:
# Attempt the Position Based Algorithm
tick_df = ticker.copy();
tick_df['set'] = tick_df['company'].apply(lambda x: set(x.title().split(" ")))
tick_df['list'] = tick_df['company'].apply(lambda x: x.title().split(" "))

MIN_SCORE = 0.3
def findMatches(headline):
    matchedCompanies = []
    
    # Return if the headline has no entities.
    if headline['entities'] == '' or type(headline['entities']) != str:
        return None
    
    entity_set = set(headline['entities'].split(', '))
    
    # Iterate over sets of companies
    for _,company in tick_df.iterrows():
        matchingHeads = []
        companyMatchScore = 0
        company_set = company['set']
        
        shared = entity_set & company_set
        if len(shared) > 0:
            for word in shared:
                idx = company['list'].index(word)
                companyMatchScore += (1/(idx+1))
        normalizedScore = companyMatchScore / len(company_set)
        if normalizedScore >= MIN_SCORE:
            matchedCompanies.append((company['company'], normalizedScore))
            
                
    matchedCompanies.sort(key=lambda x: x[1], reverse=True)
    return matchedCompanies[:10]

# Run in parallel
tqdm.pandas(desc="Position Match")
def appfun(df):
    return df.progress_apply(findMatches, axis=1)
# matching_results = parallelize(headline_entities, appfun)
matching_results = appfun(headline_entities)
# Compare
test = headline_entities.copy()
test['match'] = matching_results
test['match'] = test['match'].dropna()
test.to_csv("../data/position_match.csv")

In [17]:
matches = pd.read_csv("../data/position_match.csv")
def fixthis(x):
    if x == '[]':
        return None
    else:
        return x
matches.match = matches.match.apply(fixthis)
matches.dropna()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,date,titles,entities,match
3,3,7,7,2006-01-01,Adelaide notch 4-2 win against Roar,"Adelaide, Roar","[('ADELAIDE BRIGHTON LIMITED', 0.3333333333333..."
5,5,9,9,2006-01-01,Adelaide trounce Roar to extend lead,Adelaide,"[('ADELAIDE BRIGHTON LIMITED', 0.3333333333333..."
8,8,13,13,2006-01-01,Sydney records record temperature,Sydney,"[('SYDNEY AIRPORT', 0.5)]"
9,9,15,15,2006-01-01,World welcomes 2006,World,"[('WORLD REACH LIMITED', 0.3333333333333333)]"
12,12,25,25,2006-01-01,Progress made against NSW fires,"Progress, NSW","[('PROGRESS 2010-1 TRUST', 0.3333333333333333)]"
36,36,69,69,2006-01-01,Adelaide end losing streak,Adelaide,"[('ADELAIDE BRIGHTON LIMITED', 0.3333333333333..."
37,37,71,71,2006-01-01,Riot task force to remain in Sydney,"Riot, Sydney","[('SYDNEY AIRPORT', 0.5)]"
52,52,96,96,2006-01-01,Blaze threatens western Vic towns,Blaze,"[('BLAZE INTERNATIONAL LIMITED', 0.33333333333..."
62,62,115,115,2006-01-02,Search called off for Adelaide gunman,"Search, Adelaide","[('ADELAIDE BRIGHTON LIMITED', 0.3333333333333..."
66,66,123,123,2006-01-02,Aussie wildcards culled in Adelaide,"Aussie, Adelaide","[('ADELAIDE BRIGHTON LIMITED', 0.3333333333333..."


In [3]:
fuzzy_matches = pd.read_csv('../data/fuzzy_match.csv')
fuzzy_matches

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,date,titles,entities,match
0,0,0,0,2006-01-01,Russia completes Ukraine gas cut-off,"Russia, Ukraine",
1,1,3,3,2006-01-01,Russia takes over G8,Russia,
2,2,4,4,2006-01-01,Turkish children tested for bird flu,Turkish,
3,3,7,7,2006-01-01,Adelaide notch 4-2 win against Roar,"Adelaide, Roar",
4,4,8,8,2006-01-01,"Famine a national disaster, Kenyan President ...","Famine, Kenyan",
5,5,9,9,2006-01-01,Adelaide trounce Roar to extend lead,Adelaide,
6,6,10,10,2006-01-01,Aust fire crews aid fight against New Caledon...,"Aust, New Caledonian",
7,7,11,11,2006-01-01,Man questioned over Valley stabbing,"Man, Valley",
8,8,13,13,2006-01-01,Sydney records record temperature,Sydney,
9,9,15,15,2006-01-01,World welcomes 2006,World,
