In [19]:
import pandas as pd
import spacy
import string
nlp = spacy.load('en_core_web_lg') # Need to install the corpus: space download n_core_web_lg 

In [20]:
ticker = pd.read_csv('../data/asx-tickers.csv')
headlines = pd.read_csv('../data/scraped/combined_headlines.csv')

In [21]:
# Clean up the headlines (lightly) - drop excess characters and duplicates
stripped = headlines.titles.apply(lambda x: x.strip('"\' ' ))
headlines_clean = headlines[~headlines.titles.duplicated(keep='first')]

# Drop records that don't have any company words
translator = str.maketrans('','',string.punctuation)

def normalize(s):
    return s.lower().translate(translator).split()

def incompwords(s):
    headwords = set(normalize(s))
    return len(headwords & compwords) > 0

compwords = set(normalize((" ".join(ticker['company'].values))))
headlines_filtered = headlines_clean[headlines_clean.titles.apply(incompwords)]

print("Headlines with company names: ",headlines_filtered.shape[0])
headlines_filtered.head(3)

Headlines with company names:  592608


Unnamed: 0.1,Unnamed: 0,date,titles
0,0,2006-01-01,Russia completes Ukraine gas cut-off
1,1,2006-01-01,High winds cause havoc
3,3,2006-01-01,Russia takes over G8


In [22]:
# Find the subject of a sentence and ORG entities in a sentence.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
sub_toks = [tok for tok in doc if (tok.dep_ == "nsubj") ]
print(sub_toks)
org_toks = [ent.text for ent in doc.ents if (ent.label_ == "ORG")]
print(org_toks)

[Apple]
['Apple']


In [30]:
# Apply Spacy to a dataframe
subjects = []
orgs = []

sample_test = headlines_filtered.head(1000)

for doc in nlp.pipe(sample_test['titles'].astype('unicode').values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        subjects.append([tok for tok in doc if (tok.dep_ == "nsubj")])
        orgs.append([ent.text for ent in doc.ents if (ent.label_ == "ORG")])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        subjects.append(None)
        orgs.append(None)

sample_test['subject'] = subjects
sample_test['orgs'] = orgs
sample_test.head(500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0.1,Unnamed: 0,date,titles,subject,orgs
0,0,2006-01-01,Russia completes Ukraine gas cut-off,[Russia],[]
1,1,2006-01-01,High winds cause havoc,[winds],[]
3,3,2006-01-01,Russia takes over G8,[Russia],[]
4,4,2006-01-01,Turkish children tested for bird flu,[children],[]
7,7,2006-01-01,Adelaide notch 4-2 win against Roar,[],[]
8,8,2006-01-01,"Famine a national disaster, Kenyan President ...",[President],[]
9,9,2006-01-01,Adelaide trounce Roar to extend lead,[],[]
10,10,2006-01-01,Aust fire crews aid fight against New Caledon...,[crews],[]
11,11,2006-01-01,Man questioned over Valley stabbing,[Man],[]
13,13,2006-01-01,Sydney records record temperature,[Sydney],[]


In [37]:
# Attempt to match subjects to companies
tick_df = ticker.copy();
companies = tick_df['company'].apply(lambda x: x.title()).tolist()

def find_most(word):
    max_sim = ('', 0)
    if type(word) is list:
        for token in word:
            for company in companies:
                company = nlp(company)
                sim = token.similarity(company)
                if sim > max_sim[1]:
                    max_sim = (company, sim)
    else:
        for company in companies:
            company = nlp(company)
            sim = word.similarity(company)
            if sim > max_sim[1]:
                max_sim = (company, sim)
    return max_sim
sample_test['subject_sim'] = sample_test['subject'].apply(find_most)
sample_test.head(500)

# # Compute Similarity to Subjects
# sub_sims = []
# for doc in nlp.pipe(sample_test['subject'].astype('unicode').values, batch_size=50, n_threads=3):
#     if doc.is_parsed:
#         max_sim = ('', 0)
#         for token in doc:
#             for company in companies:
#                 sim = token.similarity(company)
#                 if sim > max_sim[1]:
#                     max_sim = (company, sim)
#         sub_sums.append(max_sim[0])
#     else:
#         sub_sims.append(None)
# sample_test['subject_sim'] = sub_sims
# sample_test.head(500)

KeyboardInterrupt: 