# Process articles
This notebook processes the text of the articles by forcing all words to lowercase and removing punctuation. It also ensures each article contains the word "terror", "terrorism", or "terrorist" at least once and removes those words to prevent clustering on them. It removes stopwords and other vocabulary associated with the New York Times website or otherwise deemed uninteresting to improve clustering. Finally, it tokenizes the articles by word and selects the adjectives and adverbs.

In [20]:
import re
import nltk
import string
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
def lower_alpha_num(corpus):
    # convert to lower case
    corpus = map(str.lower, corpus)
    
    # remove alpha-numerical words
    corpus = map(lambda x: re.sub(r"""\w*\d\w*""", '', x), corpus)
    return list(corpus)


def remove_punct(corpus):
    # regular expression to remove punctuation
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))

    corpus = map(lambda x: punc_re.sub(' ', x), corpus)
    return list(corpus)


def pos_tag(corpus):        
    return list(map(nltk.pos_tag, corpus))


def word_tokens(corpus):
    return list(map(word_tokenize, corpus))


def remove_sws(corpus):
    # stopword removal
    stop_words = stopwords.words('english')
    filter_fun = lambda x: list(filter(lambda x: x not in stop_words, x))

    corpus = generalize_fun(corpus, filter_fun)
    return list(corpus)


def get_adj(doc):
    out = [tag for tag in doc if tag[1][0] =='J' or tag[1][0] == 'R']
    return out


def generalize_fun(corpus, lambda_fun):
    # must handle a list of lists (tokenized docs) and also a simple list
    
    if isinstance(corpus[0], list):
        # list of lists
        corpus = map(lambda_fun, corpus)
    else:
        # single list
        corpus = lambda_fun(corpus)
        
    return list(corpus)


def remove_nyt_words(articles, words = ['story',
                                        'united',
                                        'states',
                                        'american',
                                        'america',
                                        'mr',
                                        'ms',
                                        'said',
                                        'main', 
                                        'continue', 
                                        'reading', 
                                        'advertisement',
                                        'new',
                                        'york',
                                        'times',
                                        'newsletter',
                                        'sletter',
                                        'sign up',
                                        "please verify you're not a robot by clicking the box",
                                        'invalid email address',
                                        "please re-enter",
                                        "you must select a",
                                        "to subscribe to",
                                        "receive occasional updates and special offers",
                                        "products and services",
                                        "thank you for subscribing",
                                        "an error has occurred",
                                        "please try again later",
                                        "view all new york"]):
    out = []
    for article in articles:
        for word in words:
            article = article.lower().replace(word,'')
        out.append(article)  
    
    return out 


def filter_for_terror_words(articles, words = ['terror','terrorism','terrorist']):
    
    out = []
    for article in articles:
        flag = 0
        for word in words:
            if word in article:
                article = article.lower().replace(word,'')
                flag = 1
        if flag == 1:       
            out.append(article)  
    
    return out 


def clean_corpus(corpus):
    corpus = filter_for_terror_words(corpus)
    corpus = remove_nyt_words(corpus)
    corpus = lower_alpha_num(corpus)
    corpus = remove_punct(corpus)
    
    return corpus

### Import nltk vocabulary

In [26]:
nltk_path = '/Users/Chris/ds/metis/notebooks/code_along/data/nltk_data'
nltk.data.path.insert(0, nltk_path)

### Load in articles

In [6]:
paths = ! ls ../data/pickle_jar/*_text.p

In [7]:
yearly_corpus = {}

for path in paths:
    year = path.split('/')[3].rstrip('_text.p')
    article_set = pickle.load(open(path,'rb'))
    yearly_corpus[year] = list(article_set.values())

### Clean articles

In [8]:
clean_yearly_corpus = {k: clean_corpus(v) for k,v in yearly_corpus.items()}
pickle.dump(clean_yearly_corpus,open('../data/pickle_jar/clean_yearly_corpus_1.p','wb'))

In [9]:
# example of cleaned article
clean_yearly_corpus['1993'][0][:200]

'federal investigators  yesterday that several men still at large were believed to be part of the ist group seized this week on the brink of carrying out a spectacular islamic radical plot to assassina'

### Tokenize articles

In [10]:
token_yearly_corpus = {k: word_tokens(v) for k,v in clean_yearly_corpus.items()}

In [11]:
token_yearly_corpus = {k: remove_sws(v) for k,v in token_yearly_corpus.items()}

In [13]:
# example of tokenized article
token_yearly_corpus['1993'][0][:10]

['federal',
 'investigators',
 'yesterday',
 'several',
 'men',
 'still',
 'large',
 'believed',
 'part',
 'ist']

### Parts-of-speech tagging

In [27]:
corpus_tagged = {k: pos_tag(v) for k,v in token_yearly_corpus.items()}
pickle.dump(corpus_tagged,open('../data/pickle_jar/tagged_corpus_1.p','wb'))

In [28]:
# example of tagged tokens
corpus_tagged['1993'][0][:5]

[('federal', 'JJ'),
 ('investigators', 'NNS'),
 ('yesterday', 'NN'),
 ('several', 'JJ'),
 ('men', 'NNS')]

### Filter for ajectives and adverbs

In [29]:
corpus_ad = {k: list(map(get_adj,v)) for k,v in corpus_tagged.items()}
pickle.dump(corpus_ad,open('../data/pickle_jar/corpus_ads_1.p','wb'))

In [30]:
# example of tagged adjectives and adverbs
corpus_ad['1993'][0][:5]

[('federal', 'JJ'),
 ('several', 'JJ'),
 ('still', 'RB'),
 ('large', 'JJ'),
 ('spectacular', 'JJ')]