In [1]:
import pandas as pd
import re
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv('docs_sub_name_id.csv')

In [4]:
df.head()

Unnamed: 0,uuid,site,author,url,title,text,published,crawled,num_sent,text_w_ids
0,414e35d6f910341a20b360cba572b056de381107,cnbc.com,,https://www.cnbc.com/2018/01/02/cramer-shares-...,Cramer shares his top energy stock picks as oi...,With oil prices fresh off their strongest star...,2018-01-03T01:57:00.000+02:00,2018-01-03T03:32:45.002+02:00,36,With oil prices fresh off their strongest star...
1,6cab1be4aea65a80cdce9042aa651fb101561d58,reuters.com,,https://www.reuters.com/article/us-usa-economy...,U.S. factory activity accelerates; constructio...,"January 3, 2018 / 3:13 PM / Updated 3 hours ag...",2018-01-03T17:39:00.000+02:00,2018-01-03T18:08:25.011+02:00,33,"January 3, 2018 / 3:13 PM / Updated 3 hours ag..."
2,24ff8669e44d62d574a009390d2240db504abef3,cnbc.com,,https://www.cnbc.com/2018/01/03/reuters-americ...,UPDATE 2-Brazil's Petrobras settles U.S. corru...,UPDATE 2-Brazil's Petrobras settles U.S. corru...,2018-01-03T17:07:00.000+02:00,2018-01-03T17:09:30.019+02:00,23,UPDATE 2-Brazil's ORG000010 settles U.S. corru...
3,da4033f2dcf9bd61dae912d2b198fadfcdc5236a,reuters.com,,https://www.reuters.com/article/us-global-econ...,Asian factories end robust 2017 on mixed note;...,"January 2, 2018 / 11:04 AM / Updated 6 hours a...",2018-01-02T12:18:00.000+02:00,2018-01-02T13:17:17.024+02:00,34,"January 2, 2018 / 11:04 AM / Updated 6 hours a..."
4,158f654e4bd62d8ac1860aa409b0a8a71afa05c3,reuters.com,,https://www.reuters.com/article/us-nigeria-pol...,Nigeria's Buhari re-election movement gathers ...,"January 3, 2018 / 5:01 PM / Updated 19 minutes...",2018-01-03T19:03:00.000+02:00,2018-01-03T19:32:37.006+02:00,18,"January 3, 2018 / 5:01 PM / Updated 19 minutes..."


### Performing initial text cleanup

In [5]:
month_clause = '(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)'

regex_templates = {
    # Date in the format"Month day, Year / time /'
    'reuters_date_time': '((('+ month_clause +'\s+\d{1,2},\s+\d{4})))\s/\s\d{1,2}:\d{1,2}\s\w{2}\s/',
    'another_reuters_regex': '(('+ month_clause +'\s+\d{1,2},\s+\d{4}))\s\/\s\d{1,2}:\d{1,2}\s\w{2}\s',
    # Everything before /PRNewswire/: "WHITE PLAINS, N.Y., May 23, 2018 /PRNewswire/ -- Bunge Limited (NYSE: BG) today 
    'cnbc_regex': '.*/PRNewswire/\s--\s',
    # "Updates N hours(minutes) ago"
    'updated_regex': '\s?(Updated )?\d{1,2} (hours|minutes|days) ago',
    # in N minutes(hours)
    'in_minutes': '\s?in\s\d{1,2}\s(minutes|hours|days)',
    # Everything before GLOBE NEWSWIRE
    'globe_newswire': '(.*)\(GLOBE NEWSWIRE\)\s--',
    # Another Date format - '13 April 2018 - '
    'date_regex': '\d{1,2}\s'+ month_clause +'\s\d{4}\s-\s',
    # Business Wire
    'business_wire': '(.*)--\(BUSINESS WIRE\)--',
    # COMMENTS at the beginning
    'comments': '\d{1,6}\sCOMMENTS\s',
    # Place + date at the beginning
    'place_date': '\A(.*),\s(('+ month_clause +'\s+\d{1,2},\s+\d{4}))\s-',
    # Updated an hour ago
    'hour_ago': '\A\s?Updated an hour ago',
    # N Min Read
    'min_read': '\s\d{1,2} Min Read',
    # time + in N minutes
    'time_in_minutes': '\s\d{1,2}\s\w{2}\s\/\sin\s\d{1,2}\sminutes',
    # by with time - By Phil Wahba 7:55 AM EST
    'by_time': '\A\s?By\s(.*)\s\d{1,2}:\d{1,2}\s\w{2}\s\w{3}',
    # by with date without the year with Reuters
    'by_without_year': '\A\s?By\s(.*)'+ month_clause +'\s+\d{1,2}\s\(Reuters\)\s-',
    # by with year 
    'by_with_year': '\A\s?By\s(.*)'+ month_clause +'\s+\d{1,2},\s\d{4}',
    # City, date (Reuters)
    'city_date_reuters': '\A\w+,\s'+ month_clause +'\s+\d{1,2}\s\(Reuters\)\s-',
    # a minute ago
    'minute_ago': '\A\s?a minute ago',
    # 10:30 AM ET Sun,
    'time': '\d{1,2}:\d{2}\s(\w{2}\s)*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)?(,\s)?',
    # another date
    'another_date': '(\d{1,2}\s)?('+ month_clause +')(\s+\d{1,2},)?\s+\d{2,4}',
    # phone format
    'phone_3': '(\d+(/|-){1}\d+(/|-){1}\d{2,4})',
    #phone with letter
    'phone_with_letter': '1-800-\d{3}-\w{4}',
    # just URL starting with HTTP(S)
    'url_regex' : 'https?:\/\/(www\.)?(\S+)',
    #View original content with multimedia: {URL}
    'orig_content_with_multimedia' : 'View original content with multimedia: https?:\/\/.*\n(.*)',
    # View original content
    'orig_content' : 'View original content: https?:\/\/.*\n(.*)',
    # Related Articles - URL
    'related' : 'Related Articles\nhttps?:\/\/.*',
    # About Us: contact details
    'about_us' : 'About Us:(\n(.*))+',
    # View on business wire
    'view_on_bw' : 'View source version on businesswire\.com :',
    # For further information contact:
    'for_further' : 'For further information contact:(\n(.*))+',
    # For more information visit:
    'for_more_info' : 'For more information, visit (https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # For additional information
    'for_additional' : 'For additional information,(\splease\s)?visit (https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # This press release features multimedia, view the full..
    'press_release' : 'This press release features multimedia\. View the full release here:\s(https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # For more information please
    'for_more_info_please' : 'For more information,(\splease)?\svisit\s(https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    #Email
    'email' : '\S*@\S*\s?',
    # Visit at or follow
    'visit_or_follow' : '((V|v)isit)(.*)(at)?follow us on(.*)',
    # For more information
    'for_more_info' : 'For more information[^,]+,\svisit[^.]+.',
    # Follow us on
    'follow_us_on' : 'Follow us on[^.]+.',
    # Follow us on Twitter
    'follow_us_on_twitter' : 'follow us on Twitter(:)?\s@[^\s]+',
    # TO learn more go to
    'go_to' : 'Like (U|u)s on Facebook',
    # LIke this story
    'like_this_story' : 'Like this story?(.*)',
    #phone
    'phone' : '(\+?\s?\d{1,3}\s)?(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})',
    #contact telephone email
    'contact_telephone_email' : 'Contact:(.*)?(Date:)?(.*)?((Tele)?phone)?(.*)?Email:',
    # Matches any URL
    'url_all' : '((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:".,<>?«»“”‘’]))'

}

In [6]:
def find_regex(text, template):
    res = re.sub(template,  ' ', text)
    return res

In [7]:
%%time
df['prep_text'] = df.text_w_ids
for temp in regex_templates:
    df.prep_text = df.prep_text.apply(lambda x: find_regex(x, regex_templates[temp]))

Wall time: 30min 25s


In [8]:
df.sample(10)

Unnamed: 0,uuid,site,author,url,title,text,published,crawled,num_sent,text_w_ids,prep_text
33429,7bfda8e267d1fc7474e6368294e4e74f01602a88,wsj.com,Ben DiPietro,https://blogs.wsj.com/riskandcompliance/2018/0...,Crisis of the Week: Africa Allegations Aggrava...,0 COMMENTS French businessman Vincent Bolloré ...,2018-05-08T14:01:00.000+03:00,2018-05-08T15:15:02.065+03:00,39,0 COMMENTS French businessman Vincent Bolloré ...,French businessman Vincent Bolloré in Paris o...
12199,d2499999b394c5b791e5909a3ab2af0db38ce560,cnbc.com,cnbc.com,http://www.cnbc.com/2018/02/16/business-wire-h...,Hearst Completes Acquisition of WCWG-TV (CW) i...,NEW YORK--(BUSINESS WIRE)-- Hearst today annou...,2018-02-16T19:35:00.000+02:00,2018-02-16T22:39:09.000+02:00,17,NEW YORK--(BUSINESS WIRE)-- Hearst today annou...,Hearst today announced it has completed its ...
38519,d4db62b19500ae4b6b5c05cb27d07ce8a5fd02fd,reuters.com,,https://uk.reuters.com/article/uk-soccer-cosmo...,Cosmos shooting for stars with $500 million plan,"May 24, 2018 / 12:04 AM / Updated an hour ago ...",2018-05-24T03:02:00.000+03:00,2018-05-24T03:20:51.011+03:00,30,"May 24, 2018 / 12:04 AM / Updated an hour ago ...",Updated an hour ago Soccer - Cosmos shooting...
8124,26d4afda208816364b94b9ebd8c375299d53017a,reuters.com,,https://uk.reuters.com/article/uk-soccer-engla...,Manchester City win to go 15 points clear but ...,"January 31, 2018 / 11:02 PM / Updated 9 minute...",2018-02-01T01:00:00.000+02:00,2018-02-01T01:17:54.014+02:00,22,"January 31, 2018 / 11:02 PM / Updated 9 minute...",Manchester City win to go 15 points clear a...
18997,3a0a419b50f174d0ea2d30e1192fe3365afdcd91,reuters.com,,https://uk.reuters.com/article/uk-britain-russ...,Britain and Russia brace for showdown as deadl...,"March 14, 2018 / 12:52 AM / Updated 43 minutes...",2018-03-14T02:51:00.000+02:00,2018-03-14T06:42:22.000+02:00,35,"March 14, 2018 / 12:52 AM / Updated 43 minutes...",Britain expels 23 Russian diplomats over ne...
2165,32417433bb2cc880c89e6c44405b0b8e8cb727ee,cnbc.com,cnbc.com,http://www.cnbc.com/2018/01/05/business-wire-a...,Affiliate of Sun Capital Agrees to Sell Robert...,"BOCA RATON, Fla.--(BUSINESS WIRE)-- Sun Capita...",2018-01-05T17:00:00.000+02:00,2018-01-05T17:53:36.011+02:00,15,"BOCA RATON, Fla.--(BUSINESS WIRE)-- Sun Capita...","Sun Capital Partners, Inc. (“Sun Capital”), ..."
4295,26d61ef04f772f0251162dc5eb1e5118ddb73dd3,reuters.com,,https://www.reuters.com/article/us-tennis-auso...,"Tennis: Last American standing, Sandgren stoke...",MELBOURNE (Reuters) - Those who believe in nom...,2018-01-20T12:52:00.000+02:00,2018-01-20T13:10:18.005+02:00,23,MELBOURNE (ORG000535) - Those who believe in n...,MELBOURNE (ORG000535) - Those who believe in n...
33030,5965f51c375ece2340e74ce18a901890f29ab598,reuters.com,,https://uk.reuters.com/article/uk-soccer-world...,Soccer - Van Marwijk springs Karacic surprise ...,"May 7, 2018 / 1:19 AM / Updated 3 hours ago So...",2018-05-07T04:17:00.000+03:00,2018-05-07T06:18:58.001+03:00,15,"May 7, 2018 / 1:19 AM / Updated 3 hours ago So...",Soccer - Van Marwijk springs Karacic surpri...
10234,59b855d7e45ce228d360d4d478f510381f08eeda,reuters.com,,https://www.reuters.com/article/us-usa-congres...,U.S. budget deals grants $1.5 billion for opio...,"February 9, 2018 / 4:56 PM / Updated 19 minute...",2018-02-09T18:55:00.000+02:00,2018-02-09T19:07:48.005+02:00,23,"February 9, 2018 / 4:56 PM / Updated 19 minute...",U.S. budget deal grants $1.5 billion for dr...
14363,8ddc13b7eabe6976e0ff2632a9fc70dfc8d801b1,reuters.com,,https://in.reuters.com/article/usa-oil-citgo-e...,Exclusive: U.S. investors seek to acquire Russ...,WASHINGTON (Reuters) - A group of U.S. investo...,2018-02-26T21:16:00.000+02:00,2018-02-26T21:42:17.015+02:00,31,WASHINGTON (ORG000009) - A group of U.S. inves...,WASHINGTON (ORG000009) - A group of U.S. inves...


In [9]:
def cleanup(text):
    """
    1. remove extra text patterns
    2. remove actra white spaces (tabs, new lines, spaces, etc.)
    3. bring text to lower case    
    """
    text = " ".join(text.split())
    text = text.lower()
    return text

In [10]:
%%time
df['prep_text'] = df.prep_text.apply(cleanup)

Wall time: 3.75 s


### Tokenizing sentences and adding them to the dataframe

In [11]:
df['prep_text'] = df.prep_text.apply(sent_tokenize)

### Removing punctuation

In [13]:
def remove_all_punct(sentences):
    """
    removes all punctuation except $
    """
    new_sents = []
    for sent in sentences:
        punct="\"|\';,[]:?!=%#()—\\/~*+<>@^{}-.“”‘’"
        for char in punct:
            sent = sent.replace(char,' ')
        new_sents.append(sent)
    return new_sents

def remove_some_punct(sentences):
    """
    removes all punctuation except $ ! " % “”
    """
    new_sents = []
    for sent in sentences:
        punct="|;,\'-[]:?=—#\(\)\\\/~*+<>@^‘’"
        for char in punct:
            sent = sent.replace(char,' ')
        new_sents.append(sent)
    return new_sents

In [14]:
df['prep_text'] = df.prep_text.apply(remove_all_punct)

### Removing stopwords

In [16]:
stop_words = set(stopwords.words('english'))

In [17]:
def remove_stopwords(sentences):
    new_sents = []
    for sent in sentences:
        sent_without_stopwords = [word for word in sent.split() if word not in stop_words]
        new_sents.append(" ".join(sent_without_stopwords))
    return new_sents

In [18]:
%%time
df['prep_text'] = df.prep_text.apply(remove_stopwords)

Wall time: 3.89 s


### Lemmatization

In [22]:
def lemmatize_spacy(sentences):
    new_sents = []
    for sent in sentences:
        sent = nlp(sent)
        lemmatized_sent = []
        for token in sent:
            lemmatized_sent.append(token.lemma_)
        new_sents.append(" ".join(lemmatized_sent))
    return new_sents

lemmatizer = WordNetLemmatizer()
def lemmatize_nltk(sentences):
    new_sents = []
    for sent in sentences:
        token_words=word_tokenize(sent)
        token_words
        lemmatized_sent=[]
        for word in token_words:
            lemmatized_sent.append(lemmatizer.lemmatize(word))
        new_sents.append(" ".join(lemmatized_sent))
    return new_sents

In [23]:
%%time
df['prep_text'] = df.prep_text.apply(lemmatize_spacy)

Wall time: 2h 15min 40s


In [33]:
df.to_csv('docs_preprocessed.csv',index=False)