# Stopwords
Stopwords are non-informative words that we want to remove from a text before analysing it. Luckily, NLTK has a ready-made list of such words that we can use to preprocess text... but is this enough?

In [1]:
import re

from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Load the raw text of the first Reuters document filed under the 'crude' category.
crude_ids = reuters.fileids(categories='crude')
article = reuters.raw(fileids=crude_ids[0])

In [3]:
# Print the raw article text so we can see what we are working with.
print(article)

JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
  in the fiscal year ended March

In [4]:
# Split the article into sentences and keep the second one (index 1)
# as the running example for the rest of the notebook.
article_sentences = sent_tokenize(article)
sentence = article_sentences[1]
print(sentence)

MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.


In [14]:
# Tokenize the sentence; as the output shows, punctuation marks come back
# as separate tokens alongside the words and numbers.
words = word_tokenize(sentence)
print(words)

['MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.']


## NLTK Stopwords

In [20]:
# NLTK's built-in list of English stopwords.
# The entries shown are lowercase, so tokens should be lowercased before
# being compared against this list.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [38]:
# Stopword lists exist for other languages too: first ten French entries.
stopwords.words('french')[:10]

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle']

In [45]:
# First ten German stopwords.
stopwords.words('german')[:10]

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an']

In [53]:
# Number of entries returned when no language is given.
# NOTE(review): presumably this spans every language list in the corpus — confirm.
len(stopwords.words())

6800

## Examples of set, lower and sub

In [69]:
# Set from a string: one element per distinct character.
# Case matters ('P' and 'p' are both present) and the space counts as a character.
print(set('Python is perfect'))

{'y', 'o', 'n', 'f', 't', 'h', 'i', 'e', 'r', ' ', 'p', 'P', 'c', 's'}


In [70]:
# Set from a tuple: the repeated 'u' collapses to a single element.
print(set(('a', 'e', 'i', 'o', 'u', 'u', 'u')))


{'o', 'i', 'e', 'u', 'a'}


In [71]:
# Set from a list: the repeated 'a' collapses to a single element.
print(set(['a','a','a','a', 'e', 'i', 'o', 'u']))

{'o', 'i', 'e', 'u', 'a'}


In [66]:
# Set from a range: contains the integers 0 through 14.
print(set(range(15)))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}


In [72]:
# str.lower() returns a lowercased copy of the string.
('Python').lower()

'python'

In [83]:
# Replace every white-space character with the number 9.
# The pattern is written as a raw string so that the backslash in \s reaches
# the regex engine untouched instead of being parsed as an (invalid) Python
# string escape, which triggers a warning on current Python versions.
txt = "The rain in Spain"
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [84]:
# Replace only the first 2 occurrences.
# `count` is passed by keyword because passing it positionally is deprecated
# in recent Python versions; the raw string avoids the invalid-escape warning
# for the backslash in \s.
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


## Stopwords (continued)

In [54]:
# Build a set from the English stopword list so membership tests are fast.
# Nothing is lowercased here; the tokens are lowercased later, when they are
# compared against this set.
sw = set(stopwords.words('english'))
print(sw)

{'whom', "hasn't", 'have', "haven't", 'they', 'who', 'a', 'why', 've', 't', 'will', 'is', 'few', 'from', 'if', 'needn', 'out', "you'll", 're', 'under', 'should', 'such', 'y', 'above', 'hasn', 'weren', 'ma', 'am', 'further', 'them', 'aren', 'don', 'as', 'this', 'for', 'were', 'now', 'i', 'about', 'once', 'than', 'has', 'through', "hadn't", 'theirs', 'does', 'wouldn', 'when', 'their', 'doing', "couldn't", 'same', 'yourselves', 'did', 'and', "mightn't", 'yourself', 'hers', 'shan', 'her', "won't", 'there', 'being', "shouldn't", 'nor', "didn't", 'no', 'our', "it's", 'up', 'be', 'themselves', 'between', 'below', 'him', 'against', 'll', 'it', 'had', 'more', 'the', 'doesn', 'shouldn', 'he', "mustn't", 'you', 'an', 'having', 'hadn', 'won', 'again', 'isn', 'each', "don't", 'at', 'any', 'how', 'ourselves', 'was', 'been', 'so', "that'll", 'by', 'ours', 'own', 'do', 'before', "shan't", "wouldn't", 'what', "isn't", 'are', 'over', "needn't", 'mustn', 'into', 'she', 'off', 'couldn', "wasn't", 'his', '

In [55]:
# Lowercase every token once, then keep only those that are not stopwords.
lowered_words = [word.lower() for word in words]
first_result = [word for word in lowered_words if word not in sw]
print(first_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', 'mln', 'kilolitres', '(', 'kl', ')', '600', 'mln', ',', 'said', '.']


In [56]:
# Original tokens, shown again for comparison with the filtered result.
print(words)

['MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.']


In [9]:
# We can define our own stopwords to add to the default NLTK ones.
sw_addon = {'said', 'mln', 'kilolitres','kl'}
# Merge the two sets once up front: calling sw.union(sw_addon) inside the
# comprehension's condition would rebuild the combined set for every token.
sw_extended = sw | sw_addon
second_result = [word.lower() for word in words if word.lower() not in sw_extended]

In [10]:
# Print the tokens left after removing both the NLTK and the custom stopwords.
print(second_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', '(', ')', '600', ',', '.']


In [29]:
# Original tokens, shown again for comparison with the filtered result.
print(words)

['MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.']


## Examples of regex functions

In [79]:
# Check whether the string starts with "The" and ends with "Spain".
# re.search returns a match object on success and None otherwise.
txt = "The rain in Spain"
x = re.search("^The.*Spain$", txt)
print("YES! We have a match!" if x else "No match")

YES! We have a match!


In [80]:
# Locate the first white-space character.
# Raw string: keeps the backslash in \s from being parsed as an (invalid)
# Python string escape.
x = re.search(r"\s", txt)

print("The first white-space character is located in position:", x.start())

The first white-space character is located in position: 3


In [81]:
# findall returns a list with every match of the pattern (here "ai" occurs twice).
x = re.findall("ai", txt)
print(x)

['ai', 'ai']


## Getting Rid of Non-Alpha Characters: Regex

In [11]:
# Import regular expressions library
import re

In [27]:
# Display the repr of the sentence so the embedded newline characters are visible.
sentence

'MITI is expected to lower the projection for primary energy\n  supplies in the year 2000 to 550 mln kilolitres (kl) from 600\n  mln, they said.'

In [12]:
# Remove every character that is neither an ASCII letter nor a space:
# digits, punctuation and newlines are all dropped, spaces are kept.
regex = re.compile("[^a-zA-Z ]")
re_clean = re.sub(regex, '', sentence)
print(re_clean)

MITI is expected to lower the projection for primary energy  supplies in the year  to  mln kilolitres kl from   mln they said


In [13]:
# Tokenize re_clean, convert to lower case, and remove stop words.
re_words = word_tokenize(re_clean)
# Build the combined stopword set once instead of once per token.
all_stopwords = sw.union(sw_addon)
re_result = [word.lower() for word in re_words if word.lower() not in all_stopwords]

# Print result
print(re_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year']


In [68]:
# Remove every character that is neither a digit nor a space,
# leaving only the numbers (and the original spacing) behind.
regex2 = re.compile("[^0-9 ]")
re_clean2 = re.sub(regex2, '', sentence)
print(re_clean2)

               2000  550     600    
