In [1]:
import pandas as pd
from fuzzywuzzy import process, fuzz
from tqdm import tqdm_notebook, tqdm
import nltk
from pathlib import Path
import string

In [2]:
# Import data and merge CSVs
ticker = pd.read_csv('../data/asx-tickers.csv')
head_1 = pd.read_csv('../data/scraped/2006_2010.csv')
head_2 = pd.read_csv('../data/scraped/2010_2010.csv')
head_3 = pd.read_csv('../data/scraped/2010_2016.csv')
head_4 = pd.read_csv('../data/scraped/2016_2017.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each file's original row labels.
headlines = pd.concat([head_1, head_2, head_3, head_4])

# Write the merged file once; later runs reuse the copy already on disk.
combined_headlines = Path('../data/scraped/combined_headlines.csv')
if not combined_headlines.is_file():
    headlines.to_csv(combined_headlines)

In [3]:
# Clean up the headlines (lightly) - drop excess characters and duplicates
# Strip surrounding quotes/spaces first, so headlines that differ only by that
# noise are recognised as duplicates and the cleaned text is what gets kept.
stripped = headlines.titles.str.strip('"\' ')
is_first_occurrence = ~stripped.duplicated(keep='first')
# Use raw arrays for the assignment and the mask: the merged frame can carry
# repeated index labels, so everything here is matched by position.
headlines_clean = headlines.assign(titles=stripped.values)[is_first_occurrence.values]

In [4]:
# Drop records that don't have any company words
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans(dict.fromkeys(string.punctuation))

def normalize(s):
    """Lower-case `s`, remove punctuation, and split it into a list of words."""
    lowered = s.lower()
    without_punctuation = lowered.translate(translator)
    return without_punctuation.split()

def incompwords(s):
    """Return True when headline `s` contains at least one word from `compwords`.

    Words are compared after `normalize` (lower-cased, punctuation removed).
    `compwords` is the module-level set of words taken from the company names.
    """
    headwords = set(normalize(s))
    return any(word in compwords for word in headwords)

# Every distinct (normalized) word that appears in any company name.
compwords = set(normalize(" ".join(ticker['company'].tolist())))

# Keep only the headlines that share at least one word with that vocabulary.
headlines_filtered = headlines_clean.loc[headlines_clean.titles.apply(incompwords)]

print("Headlines with company names: ", len(headlines_filtered))
headlines_filtered.head(3)

Headlines with company names:  592608


Unnamed: 0,date,titles
0,2006-01-01,Russia completes Ukraine gas cut-off
1,2006-01-01,High winds cause havoc
3,2006-01-01,Russia takes over G8


In [5]:
# Identify the named entities in each headline and store them in their own
# column of the dataframe. Save the result as CSV so this slow step does not have to be rerun.
# https://stackoverflow.com/questions/36255291/extract-city-names-from-text-using-python

# !!! This will only run if headline_entity_id.csv isn't in the data folder

import nltk
# Resources needed by the tokenizer, POS tagger and NE chunker used below.
nltk.download('punkt')
# nltk.pos_tag needs its tagger model; without it a fresh environment raises
# LookupError. NOTE(review): recent NLTK releases renamed some resources
# (e.g. '*_eng' / '*_tab' variants) - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_entity_names(t):
    """Recursively collect the text of every subtree labelled 'NE' under `t`.

    Nodes without a `label` attribute (plain leaves) contribute nothing. An
    'NE' node yields one string: the first item of each of its children joined
    by spaces. Any other labelled node is searched child by child.

    Returns a list of entity strings in traversal order.
    """
    # Leaves have no label, so there is nothing to extract from them.
    if not (hasattr(t, 'label') and t.label):
        return []

    if t.label() == 'NE':
        return [' '.join(child[0] for child in t)]

    found = []
    for subtree in t:
        found.extend(extract_entity_names(subtree))
    return found

def identify_entities(source=None, out_path="../data/headline_entity_id.csv"):
    """Tag named entities in every headline and save the annotated frame as CSV.

    Parameters
    ----------
    source : DataFrame, optional
        Frame with a `titles` column. Defaults to the notebook-level
        `headlines_filtered`.
    out_path : str or path-like, optional
        Destination CSV. Defaults to the notebook's entity cache file.

    Returns
    -------
    DataFrame
        A copy of `source` with an added `entities` column holding the
        entities found in each headline, joined by ", " (empty string if none).
    """
    if source is None:
        source = headlines_filtered

    titles = source.titles.tolist()
    matches = []
    for line in tqdm(titles, total=len(titles), unit="headlines"):
        # Sentence-split, tokenize and POS-tag each headline before chunking.
        tagged_sentences = [
            nltk.pos_tag(nltk.word_tokenize(sentence))
            for sentence in nltk.sent_tokenize(line)
        ]
        # binary=True labels every entity simply as 'NE', which is the label
        # extract_entity_names looks for.
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        entities = []
        for tree in chunked_sentences:
            entities.extend(extract_entity_names(tree))

        matches.append(", ".join(entities))

    out = source.copy()
    # Attach by position: one entry per headline, in the order iterated above.
    out['entities'] = pd.Series(matches).values
    out.to_csv(out_path)
    return out
    
# Entity extraction is slow, so the CSV it writes doubles as a cache.
entitymatch = Path('../data/headline_entity_id.csv')
if entitymatch.is_file():
    # Results are already on disk - skip the rebuild.
    pass
else:
    identify_entities()

[nltk_data] Downloading package punkt to /home/koss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/koss/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/koss/nltk_data...
[nltk_data]   Package words is already up-to-date!
