## NER
- Named Entity Recognition

In [67]:
import pandas as pd
from cytoolz import take, concat
from random import sample
from collections import Counter

# pd.set_option('max_colwidth', 10)

#### read corpus of news articles

In [2]:
df = pd.read_msgpack('https://www.dropbox.com/s/0oqem0qa6oiz02r/articles.msg?dl=1')

In [3]:
len(df)

13400

#### Load spaCy parser

In [4]:
import spacy
from spacy import displacy

In [59]:
nlp = spacy.load('en', disable=[])

In [60]:
len_text = []
for i in df['text']:
    len_text.append(len(i))

len_text[len(i) == max(len_text)]

3565

In [61]:
doc = nlp(df['text'].iloc[3565])
# doc = nlp(df.loc[3565]['text']) 
# different???????

In [68]:
displacy.render(doc, style = 'ent', jupyter=True)

#### Apply spaCy NER to news corpus

In [69]:
%%time

df['doc'] = list(nlp.pipe(df['text']))

Wall time: 35min 54s


#### Aggregate news stories by week

In [70]:
df['date'] = pd.to_datetime(df['date'])
df['week'] = df['date'].dt.to_period('W')
df.groupby('week')['doc'].count()

week
1996-08-19/1996-08-25    600
1996-08-26/1996-09-01    700
1996-09-02/1996-09-08    700
1996-09-09/1996-09-15    700
1996-09-16/1996-09-22    700
1996-09-23/1996-09-29    700
1996-09-30/1996-10-06    700
1996-10-07/1996-10-13    700
1996-10-14/1996-10-20    700
1996-10-21/1996-10-27    700
1996-10-28/1996-11-03    700
1996-11-04/1996-11-10    700
1996-11-11/1996-11-17    700
1996-11-18/1996-11-24    700
1996-11-25/1996-12-01    700
1996-12-02/1996-12-08    700
1996-12-09/1996-12-15    700
1996-12-16/1996-12-22    700
1996-12-23/1996-12-29    700
1996-12-30/1997-01-05    200
Freq: W-SUN, Name: doc, dtype: int64

#### Find most often mentions PERSON names / week

In [77]:
def get_top_persons(docs, k=5):
    freq = Counter(filter(None, (e.orth_.strip()
                                for d in docs
                                for e in d.ents if e.label_ == 'PERSON')))
    return freq.most_common(k)

In [81]:
pd.set_option('max_colwidth', 20)

In [82]:
df.groupby('week')['doc'].apply(get_top_persons)

[('Clinton', 43), ('Lebed', 33), ('Netanyahu', 19), ('Yeltsin', 18), ('M4', 12)]
[('Clinton', 66), ('Lebed', 36), ('Yeltsin', 33), ('Arafat', 27), ('Netanyahu', 26)]
[('Clinton', 56), ('Yeltsin', 51), ('Okinawa', 29), ('Saddam', 27), ('Tyson', 26)]
[('Clinton', 68), ('Bossi', 38), ('Saddam', 36), ('Netanyahu', 30), ('Saddam Hussein', 25)]
[('Yeltsin', 36), ('Clinton', 33), ('Banharn', 31), ('Dole', 18), ('Perry', 18)]
[('Yeltsin', 70), ('Clinton', 54), ('Arafat', 41), ('Netanyahu', 37), ('Rubin', 26)]
[('Clinton', 91), ('Dole', 59), ('Netanyahu', 32), ('Abdullah', 31), ('Arafat', 28)]
[('Dole', 52), ('Clinton', 50), ('M4', 37), ('Arafat', 21), ('Netanyahu', 18)]
[('Clinton', 71), ('Dole', 62), ('Netanyahu', 26), ('Arafat', 26), ('Lebed', 23)]
[('Clinton', 45), ('Lee', 35), ('Chirac', 26), ('Avg', 16), ('Barzani', 15)]
[('Clinton', 64), ('Dole', 52), ('AGM', 28), ('Bill Clinton', 23), ('Rabin', 20)]
[('Clinton', 88), ('Bhutto', 32), ('Yeltsin', 31), ('Kohl', 22), ('Bill Clinton', 21)]
[

week
1996-08-19/1996-08-25    [(Clinton, 43), ...
1996-08-26/1996-09-01    [(Clinton, 66), ...
1996-09-02/1996-09-08    [(Clinton, 56), ...
1996-09-09/1996-09-15    [(Clinton, 68), ...
1996-09-16/1996-09-22    [(Yeltsin, 36), ...
1996-09-23/1996-09-29    [(Yeltsin, 70), ...
1996-09-30/1996-10-06    [(Clinton, 91), ...
1996-10-07/1996-10-13    [(Dole, 52), (Cl...
1996-10-14/1996-10-20    [(Clinton, 71), ...
1996-10-21/1996-10-27    [(Clinton, 45), ...
1996-10-28/1996-11-03    [(Clinton, 64), ...
1996-11-04/1996-11-10    [(Clinton, 88), ...
1996-11-11/1996-11-17    [(Castro, 38), (...
1996-11-18/1996-11-24    [(Clinton, 76), ...
1996-11-25/1996-12-01    [(Clinton, 31), ...
1996-12-02/1996-12-08    [(Greenspan, 53)...
1996-12-09/1996-12-15    [(AGM, 48), (Cli...
1996-12-16/1996-12-22    [(Clinton, 36), ...
1996-12-23/1996-12-29    [(Yeltsin, 40), ...
1996-12-30/1997-01-05    [(Netanyahu, 10)...
Freq: W-SUN, Name: doc, dtype: object

In [86]:
doc[-2], doc[-2].pos_, doc[-2].lemma_, doc[-2].is_stop

(., 'PUNCT', '.', False)

In [87]:
def get_terms(doc):                         # to spot ANN followed by V
    terms = []
    start = --1                             #start will be the token # where the sequnce(Adj.) started. will be from -1, that we haven't seen yet.
    for i in range(len(doc)):
        if doc[i].pos_ in ['NOUN', 'ADJ']:  # if it's N or adj,
            if start == -1:                 #no marked as a start of something yet,
                start = i
                #else: do nothing
        else:                               #if we are at the end? or at the start
            if start != -1 and i > start + 1 and doc[i-1].pos_ == 'NOUN' and not doc[i-1].is_stop:
                #and the preceding word is N
                terms.append(doc[start:i].lower_) #then we take the subpart and lowercase and added to terms
            start = -1
    if start != -1 and i > start + 1 and doc[i-1].is_stop:
        #we also need to check if we are at the end of the sentence
        terms.append(doc[start:].lower_)
    return terms

get_terms(doc)

['environment committee',
 'action programme',
 'water quality',
 'binding framework directive',
 'pronged strategy',
 'its dilemma',
 'action programme',
 'final text',
 'member states',
 'coherent overall policy',
 'water resources',
 'major objectives',
 'environmental committee',
 'overall policy',
 'overall strategy',
 'draft report',
 'its policies',
 'water protection',
 'water policy',
 'treaty references',
 'precautionary principle',
 'such substantive requirements',
 'emission standards',
 'quality objectives',
 'environment committee',
 'daughter directives',
 'water policy',
 'drinking water',
 'bathing waters']

In [88]:
%%time

df['doc'] = list(nlp.pipe(df['text']))
df['terms'] = df['doc'].apply(get_terms)
freq = Counter(concat(df['terms']))
freq.most_common(10)

Wall time: 37min 55s


##### Check out individual users' frequency words

In [96]:
print(freq.most_common(10))

[('last week', 861), ('last year', 735), ('next year', 572), ('central bank', 535), ('news conference', 477), ('interest rates', 400), ('first time', 363), ('last month', 362), ('first half', 344), ('third quarter', 308)]


In [90]:
top = df.groupby('author')['text'] \
        .count() \
        .sort_values(ascending=False)
top.head(10)

KeyError: 'author'

In [None]:
for name in top.head(10).index:
    #as we are interested in not numbers but who(index)
    print('---', name)
    subset = df['author'] == name
    freq = Counter(concat(df[subset]['terms']))
    # take the terms column out, then make all comments into 1, then count freq
    print(', '.join(t for t, f in freq.most_common(50)))
    # each of the term, freq pairs, I will take only the term
    

In [None]:
for name in top.head(10).index:
    print('---', name)
    subset = df['author'] == name
    freq = pd.DataFrame({'all':
                        Counter(concat(df['terms'])),
                        'user':
                        Counter(concat(df[subset]['terms']))})
    freq['pmi'] = np.log2((freq['user'] * np.sum(freq['all'])) / 
                         (freq['all'] * np.sum(freq['user'])))
    print('\n '.join(freq[freq['user']>5]
                  .sort_values('pmi', ascending=False)
                  .head(10)
                  .index))

#### Filter out some if not interested in, as if it's not real people

In [None]:
df = df[~df['author'].isin(['Author1', 'Author2'])]

<b> resorting after filter

In [None]:
top = df.groupby('author')['text'] \
        .count() \
        .sort_values(ascending=False)
top.head(10)

###### visualize posting frequency

In [None]:
top.plot()

In [None]:
top.head(50).plot.bar()

##### % of comments from top X authors

In [None]:
round((sum(top.head(50)) / sum(top) * 100), 3)

<b> % of comments from 1 posting users

In [None]:
round((sum(top == 1) / len(top) * 100)), 3)

<b> tokenize comments

In [None]:
T = TweetTokenizer(perserve_case = False,
                  reduce_len = True,
                  strip_handles = True)
df['tokens'] = df['text'].apply(T.tokenize)

##### Find keywords for top 10 authors