## Stop-word and punctuation removal

#### Stop-words

In [1]:
from nltk.corpus import stopwords

# Load NLTK's built-in list of English stop-words.
stop_list = stopwords.words('english')

# Display the list.
stop_list

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u"you're",
 u"you've",
 u"you'll",
 u"you'd",
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u"she's",
 u'her',
 u'hers',
 u'herself',
 u'it',
 u"it's",
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u"that'll",
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 ...]

In [2]:
len(stop_list)

179

#### Punctuation

In [5]:
from string import punctuation

punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [7]:
# Add each punctuation character only if it is not already in the list, so
# re-running this cell does not append the punctuation a second time.
stop_list += [p for p in punctuation if p not in stop_list]

len(stop_list)

211

In [8]:
stop_list

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u"you're",
 u"you've",
 u"you'll",
 u"you'd",
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u"she's",
 u'her',
 u'hers',
 u'herself',
 u'it',
 u"it's",
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u"that'll",
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 ...]

#### Adding custom words

In [9]:
# Only append words that are not already present, so re-running this cell
# does not add duplicates.
custom_words = ['rt', 'via']  # custom list
stop_list += [w for w in custom_words if w not in stop_list]

len(stop_list)

213

#### Fun with Unicode

Unicode categories: https://en.wikipedia.org/wiki/Unicode_character_property#General_Category

Punctuation categories all start with `P` (for example `Pd` for dash punctuation and `Po` for other punctuation).

In [12]:
from unicodedata import category

In [15]:
# Had to change the following examples by adding a u prefix to the string literals.
# Python 2 needs the prefix to treat a literal as unicode. Python 3 string literals
# are always unicode, so the prefix is unnecessary there; it is still accepted
# (Python 3.3+), so these examples run on both.
category(u'A')  # Letter, uppercase

'Lu'

In [18]:
# Using the builtin unicode() function also works!
# (Python 2 only: unicode() does not exist in Python 3, where str is already unicode.)
category(unicode(u'A'))

'Lu'

In [19]:
category(u'a')  # Letter, lowercase

'Ll'

In [20]:
category(u'.')  # Punctuation, other

'Po'

In [21]:
# Unicode dash characters; each is a different character from the ASCII hyphen-minus.
dashes = ['‒', '–', '—', '―', '⁓']  # https://en.wikipedia.org/wiki/Dash#Common_dashes

# The plain ASCII '-' is not one of them, so an exact-match lookup misses it.
'-' in dashes

False

In [26]:
# Here I needed a different fix. In Python 2 the plain string literals in
# `dashes` are UTF-8 encoded byte strings, so unicode(d) produced this error:
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)
# So they have to be decoded *from* UTF-8 into unicode first. In Python 3 string
# literals are already unicode text (and have no .decode() method), so only
# byte strings are decoded below.
# It seems you can also change the default codec globally:
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
# but that hack affects every implicit conversion in the process, and
# sys.setdefaultencoding does not exist in Python 3, where UTF-8 is the default.

for d in dashes:
    dash_char = d.decode('utf-8') if isinstance(d, bytes) else d
    print(category(dash_char))

Pd
Pd
Pd
Pd
Po


In [31]:
def is_unicode_punct(token):
    """Return True if every character of `token` is Unicode punctuation.

    A character counts as punctuation when its Unicode general category
    starts with 'P' (Pd, Po, Ps, ...).

    Args:
        token: text (unicode) or a UTF-8 encoded byte string. Byte strings,
            which is what plain string literals are in Python 2, are
            decoded first.

    Returns:
        True for non-empty tokens made up only of punctuation characters
        (e.g. '.', '...', '—'); False for everything else, including the
        empty string, undecodable bytes and non-string values.
    """
    if isinstance(token, bytes):
        # Python 2 `str` / Python 3 `bytes`: decode before inspecting characters.
        try:
            token = token.decode('utf-8')
        except UnicodeDecodeError:
            return False
    try:
        # category() accepts exactly one character, so check them one by one;
        # this lets multi-character punctuation tokens such as '...' match too.
        return len(token) > 0 and all(category(ch).startswith('P') for ch in token)
    except TypeError:
        # Not a string-like value (no len(), or items that are not single characters).
        return False

In [28]:
is_unicode_punct('A')

False

In [29]:
is_unicode_punct('.')

True

In [30]:
is_unicode_punct('HELLOOO')

False

#### Putting everything together

In [32]:
from nltk.tokenize import word_tokenize

# Sample sentence used to demonstrate the filtering below.
text = """Python is a widely used high-level programming
language for general-purpose programming,
created by Guido van Rossum and first released in 1991."""
# text from https://en.wikipedia.org/wiki/Python_(programming_language)

# Split the text into tokens; sentence punctuation such as ',' and '.'
# comes out as separate tokens.
tokens = word_tokenize(text)

tokens

['Python',
 'is',
 'a',
 'widely',
 'used',
 'high-level',
 'programming',
 'language',
 'for',
 'general-purpose',
 'programming',
 ',',
 'created',
 'by',
 'Guido',
 'van',
 'Rossum',
 'and',
 'first',
 'released',
 'in',
 '1991',
 '.']

In [34]:
# Compare case-insensitively: the stop-words are stored in lowercase, so a
# capitalised token such as 'The' would otherwise slip through the filter.
# A set makes each membership test constant-time instead of a list scan.
stop_set = set(stop_list)
tokens_no_stop = [t for t in tokens if t.lower() not in stop_set and not is_unicode_punct(t)]

tokens_no_stop

['Python',
 'widely',
 'used',
 'high-level',
 'programming',
 'language',
 'general-purpose',
 'programming',
 'created',
 'Guido',
 'van',
 'Rossum',
 'first',
 'released',
 '1991']

In [41]:
# My example: exclude all words that start with a capital letter
def starts_with_uppercase(token):
    """Return True if the first character of `token` is an uppercase letter ('Lu')."""
    if isinstance(token, bytes):
        # Decode the whole token before taking its first character: slicing the
        # first *byte* of a UTF-8 byte string can cut a multi-byte character in half.
        token = token.decode('utf-8')
    return len(token) > 0 and category(token[0]) == 'Lu'

[t for t in tokens if not starts_with_uppercase(t)]

['is',
 'a',
 'widely',
 'used',
 'high-level',
 'programming',
 'language',
 'for',
 'general-purpose',
 'programming',
 ',',
 'created',
 'by',
 'van',
 'and',
 'first',
 'released',
 'in',
 '1991',
 '.']