In [2]:
import collections  # 1.5
import multiprocessing as mp  # 1.2
import re  # 1.3

In [3]:
# 1.3 Split the text into individual words with regular expressions
# Sample text for the tokenisation experiments. Adjacent string literals are
# concatenated by the parser, so this is a single str split per sentence.
corpus = (
    "Andy is a data scientist. "
    "Andy's boss, Megan, was looking for him, but Andy was out to lunch. "
    "Megan texted Andy, 'How's the deadline coming along?'"
)

In [5]:
# Naive tokenisation: split on runs of whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. 'scientist.').
whitespace_tokens = corpus.split()
print(whitespace_tokens)

['Andy', 'is', 'a', 'data', 'scientist.', "Andy's", 'boss,', 'Megan,', 'was', 'looking', 'for', 'him,', 'but', 'Andy', 'was', 'out', 'to', 'lunch.', 'Megan', 'texted', 'Andy,', "'How's", 'the', 'deadline', 'coming', "along?'"]


In [6]:
# Strip punctuation characters, then split on whitespace.
# Open questions: what is the full universe of punctuation, and how should
# possessives / contractions such as 's be handled?
punctuation = ".',?"
# NOTE: this rebinds `corpus`, so every later cell that reads `corpus` sees
# the punctuation-free text rather than the original sentence.
corpus = ''.join(ch for ch in corpus if ch not in punctuation)

print(corpus.split())

['Andy', 'is', 'a', 'data', 'scientist', 'Andys', 'boss', 'Megan', 'was', 'looking', 'for', 'him', 'but', 'Andy', 'was', 'out', 'to', 'lunch', 'Megan', 'texted', 'Andy', 'Hows', 'the', 'deadline', 'coming', 'along']


In [7]:
# Regex-based split.
# Raw string pattern: \W matches one non-word character and + means
# "one or more", so each run of separators acts as a single split point.
word_regex = r'\W+'
split_corpus = re.compile(word_regex).split(corpus)
print(split_corpus)

['Andy', 'is', 'a', 'data', 'scientist', 'Andys', 'boss', 'Megan', 'was', 'looking', 'for', 'him', 'but', 'Andy', 'was', 'out', 'to', 'lunch', 'Megan', 'texted', 'Andy', 'Hows', 'the', 'deadline', 'coming', 'along']


In [8]:
# A better regex: match the words themselves instead of the separators.
# Two alternatives, tried in this order:
#   \w[\w']*\w  a word character, then any mix of word characters and
#               apostrophes, then a closing word character
#   \w          a lone word character (e.g. "a")
word_regex_improved = r"(\w[\w']*\w|\w)"
word_matcher = re.compile(word_regex_improved)
improved_tokens = word_matcher.findall(corpus)
print(improved_tokens)

['Andy', 'is', 'a', 'data', 'scientist', 'Andys', 'boss', 'Megan', 'was', 'looking', 'for', 'him', 'but', 'Andy', 'was', 'out', 'to', 'lunch', 'Megan', 'texted', 'Andy', 'Hows', 'the', 'deadline', 'coming', 'along']


## 1.4 Converting words into lists of lowercase tokens


In [13]:
# Compiled once, at definition time, rather than on every call: the function
# is invoked once per line of the input file.
# Pattern: a word character, any run of word characters or apostrophes, and a
# closing word character -- or, failing that, a single word character.
_WORD_MATCHER = re.compile(r"(\w[\w']*\w|\w)")


def split_into_words(line):
    """Return the word tokens found in ``line``, in order of appearance.

    Apostrophes are kept only when they sit inside a token (``"Andy's"``);
    leading/trailing apostrophes and all other punctuation are dropped.
    Case is left untouched.

    Args:
        line: The text to tokenise.

    Returns:
        A list of token strings; empty if ``line`` contains no word
        characters.
    """
    return _WORD_MATCHER.findall(line)

# Build the lower-cased token list in one pass. Iterating over the file
# object yields one line at a time, so the whole file is never held in memory
# as a single string.
with open("natural-language-data.txt") as f:
    processed_corpus = [
        word.lower()
        for line in f
        for word in split_into_words(line)
    ]

processed_corpus

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'an',
 'area',
 'of',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'natural',
 'languages',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'fruitfully',
 'process',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data',
 'challenges',
 'in',
 'natural',
 'language',
 'processing',
 'frequently',
 'involve',
 'speech',
 'recognition',
 'natural',
 'language',
 'understanding',
 'and',
 'natural',
 'language',
 'generation',
 'the',
 'history',
 'of',
 'natural',
 'language',
 'processing',
 'generally',
 'started',
 'in',
 'the',
 '1950s',
 'although',
 'work',
 'can',
 'be',
 'found',
 'from',
 'earlier',
 'periods',
 'in',
 '1950',
 'alan',
 'turing',
 'published',
 'an',
 'article',
 'titled',
 'computing',
 'machinery',
 'and',
 'intelligence',
 'which',
 'proposed',
 'what',
 'is',
 '


## 1.5 Removing uncommon words and stop words

In [14]:
# Tally how often each token occurs, before any stop words are removed.
word_counts = collections.Counter()
word_counts.update(processed_corpus)
word_counts

Counter({'natural': 13,
         'language': 16,
         'processing': 9,
         'nlp': 1,
         'is': 5,
         'an': 3,
         'area': 1,
         'of': 35,
         'computer': 2,
         'science': 1,
         'and': 18,
         'artificial': 1,
         'intelligence': 3,
         'concerned': 1,
         'with': 5,
         'the': 40,
         'interactions': 1,
         'between': 2,
         'computers': 2,
         'human': 3,
         'languages': 2,
         'in': 17,
         'particular': 1,
         'how': 1,
         'to': 15,
         'program': 1,
         'fruitfully': 1,
         'process': 1,
         'large': 1,
         'amounts': 2,
         'data': 9,
         'challenges': 1,
         'frequently': 1,
         'involve': 1,
         'speech': 3,
         'recognition': 2,
         'understanding': 1,
         'generation': 1,
         'history': 1,
         'generally': 3,
         'started': 1,
         '1950s': 1,
         'although': 1,
         

In [15]:
# Stop words to drop from the corpus, kept as a set for fast membership
# tests. Listed alphabetically, grouped by initial letter.
stop_words = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by',
    'can',
    'did', 'do', 'does', 'doing', 'don', 'down', 'during',
    'each',
    'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself',
    'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'just',
    'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not', 'now',
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'out', 'over', 'own',
    's', 'same', 'she', 'should', 'so', 'some', 'such',
    't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up',
    'very',
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
    'whom', 'why', 'will', 'with',
    'you', 'your', 'yours', 'yourself', 'yourselves',
}

# Find the least common elements. most_common() returns (word, count) pairs
# ordered from most to least frequent, so the reversed slice [:-10:-1] picks
# the last nine pairs, rarest first.
uncommon_words = word_counts.most_common()[:-10:-1]

# The corpus holds plain strings, so compare against the words only.
# Testing `w not in uncommon_words` directly compares a str with
# (word, count) tuples, which never match, so nothing would be filtered out.
uncommon_vocab = {word for word, _count in uncommon_words}

# Single pass: drop stop words and the least common words.
processed_corpus = [
    w for w in processed_corpus
    if w not in stop_words and w not in uncommon_vocab
]

In [16]:
# Display the tokens that remain after the filtering step in the previous cell.
processed_corpus

['natural',
 'language',
 'processing',
 'nlp',
 'area',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concerned',
 'interactions',
 'computers',
 'human',
 'natural',
 'languages',
 'particular',
 'program',
 'computers',
 'fruitfully',
 'process',
 'large',
 'amounts',
 'natural',
 'language',
 'data',
 'challenges',
 'natural',
 'language',
 'processing',
 'frequently',
 'involve',
 'speech',
 'recognition',
 'natural',
 'language',
 'understanding',
 'natural',
 'language',
 'generation',
 'history',
 'natural',
 'language',
 'processing',
 'generally',
 'started',
 '1950s',
 'although',
 'work',
 'found',
 'earlier',
 'periods',
 '1950',
 'alan',
 'turing',
 'published',
 'article',
 'titled',
 'computing',
 'machinery',
 'intelligence',
 'proposed',
 'called',
 'turing',
 'test',
 'criterion',
 'intelligence',
 'georgetown',
 'experiment',
 '1954',
 'involved',
 'fully',
 'automatic',
 'translation',
 'sixty',
 'russian',
 'sentences',
 'english',
 'authors',
 'claim