In [1]:
!pip install nltk
!pip install numpy matplotlib
!pip install pandas
!pip install wordcloud



In [None]:
# Download only the NLTK resources this notebook uses, instead of the very
# large 'all' collection.
import nltk

NLTK_RESOURCES = [
  'punkt',      # sentence / word tokenizer models
  'punkt_tab',  # tokenizer tables looked up by newer NLTK releases
  'stopwords',  # stop word lists
  'wordnet',    # lexical database behind WordNetLemmatizer
  'omw-1.4',    # extra WordNet data that some NLTK releases expect
]
for resource in NLTK_RESOURCES:
  nltk.download(resource)

# Overview

## 1. Regular Expression

## 2. Basics about NLTK
1. Tokenization

  1.   Sentence tokenization
  2.   Word tokenization

2. Filtering Stop Words  
3.   Stemming
4.   Lemmatizing


## 3. Example of Analyzing Text

## 4. Practice



# Regular Expression in Python

In this section, we will briefly go through basic regular expressions. For more details, please refer to https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial

In [None]:
import re

## Disjunctions

In [None]:
# [wW] accepts either case for the first letter only.
# re.match succeeds only when the pattern matches at the start of the string.
pattern = '[wW]oodchuck'
sequence = ['Woodchuck', 'woodchuck', 'wOodchuck']

for word in sequence:
  print(word)
  print('Match!' if re.match(pattern, word) else 'Not Match!')


In [None]:
# One digit, then one lowercase letter, then one uppercase letter.
pattern = '[0-9][a-z][A-Z]'
sequence = ['1sP', 'ssP', '7CS']

for word in sequence:
  matched = re.match(pattern, word) is not None
  print(word)
  print('Match!' if matched else 'Not Match!')


## Negation in Disjunction

In [None]:
# [^a-z] matches any single character that is NOT a lowercase letter.
pattern = '[0-9][^a-z][A-Z]'
sequence = ['1sP', 'ssP', '7CS']

for word in sequence:
  verdict = 'Match!' if re.match(pattern, word) else 'Not Match!'
  print(word)
  print(verdict)

## More Disjunction

In [None]:
# '|' separates two alternatives; either one may match.
pattern = '[gG]roundhog|[Ww]oodchuck'
sequence = ['groundhog', 'woodchuck', 'wOOdchuck']

for word in sequence:
  verdict = 'Match!' if re.match(pattern, word) else 'Not Match!'
  print(word, verdict, sep='\n')

## Regular Expression with `. ^ $ + * ?`


`.` - A period. Matches any single character except the newline character.

In [None]:
# re.search scans through the string and returns a match object for the first
# location where the pattern matches.
# .group() returns the text that was matched; here that is 'Cookie'.
match = re.search(r'Co.k.e', 'Co\noie Cookie cookie')
match.group()

`^` - A caret. Matches the start of the string.

In [None]:
# The string begins with "Eat", so the anchored pattern matches.
match = re.search(r'^Eat', "Eat cake!")
match.group()

In [None]:
# The string starts with "Let's", so '^eat' finds nothing and re.search
# returns None. Guard before calling .group(), which would otherwise raise
# AttributeError on None and stop a "Run All".
match = re.search(r'^eat', "Let's eat cake!")
match.group() if match else 'No match'

`$` - A dollar sign. Matches the end of the string.

In [None]:
# Only the lowercase "cake" at the very end of the string satisfies 'cake$'.
match = re.search(r'cake$', "Cake! Let's eat cake")
match.group()

In [None]:
# "cake" is not at the end of this string, so re.search returns None.
# Guard before calling .group(), which would otherwise raise AttributeError.
match = re.search(r'cake$', "Let's get some cake on our way home!")
match.group() if match else 'No match'

`+` - Checks if the preceding character appears one or more times starting from that position.

In [None]:
# 'o+' consumes the whole run of o's.
match = re.search(r'Co+kie', 'Cooookie')
match.group()

`*` - Checks if the preceding character appears zero or more times starting from that position.

In [None]:
# 'a*' matches zero a's here, and 'o*' matches the two o's.
match = re.search(r'Ca*o*kie', 'Cookie')
match.group()

`?` - Checks if the preceding character appears exactly zero or one time starting from that position.

In [None]:
# 'u?' matches zero occurrences of 'u' here.
match = re.search(r'Colou?r', 'Color')
match.group()

In [None]:
# 'u?' matches exactly one 'u' here.
match = re.search(r'Colou?r', 'Colour')
match.group()

In [None]:
# 'u?' allows at most one 'u', so "Colouur" does not match and re.search
# returns None. Guard before calling .group(), which would otherwise raise
# AttributeError.
match = re.search(r'Colou?r', 'Colouur')
match.group() if match else 'No match'

# Basics about NLTK

## Tokenization

Tokenization is the first step in turning unstructured data into structured data, which is easier to analyze.

Through tokenization, you could split up text by word or by sentence, which could allow you to work with smaller pieces of text.






### Sentence tokenization


In [None]:
# import relevant parts of NLTK

from nltk.tokenize import sent_tokenize

In [None]:
example_string = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
"""

In [None]:
# Tokenizing example_string by sentence gives you a list of three strings that are sentences:
sentences = sent_tokenize(example_string)

In [None]:
sentences

### Word tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
words = word_tokenize(sentences[1])

In [None]:
words

## Filtering Stop Words 

Stop words are a set of commonly used words in a language. Very common words like 'in', 'is', and 'an' are often used as stop words since they don’t add a lot of meaning and information to a text in and of themselves. In many cases like topic extraction, you would like to ignore and filter them out of your text when you’re processing it.

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
stop_words
# You can always design your own stop_words list depending on the task
# for example, you might want to filter out punctuations
# import string
# string.punctuation

In [None]:
words

In [None]:
# Keep the lower-cased form of every token whose case-folded form is not a stop word.
filtered_list = [word.lower() for word in words if word.casefold() not in stop_words]

In [None]:
filtered_list

## Stemming

Stemming is a text processing task in which you reduce words to their root, which is the core part of a word. For example, the words “helping” and “helper” share the root “help.” Stemming allows you to zero in on the basic meaning of a word rather than all the details of how it’s being used. 

In [None]:
# Create a Snowball stemmer for English
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

# there are also other types of stemmers in NLTK, such as the Porter stemmer
# from nltk.stem import PorterStemmer
# stemmer = PorterStemmer()

In [None]:
string_for_stemming = """The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do."""

In [None]:
words = word_tokenize(string_for_stemming)

In [None]:
words

In [None]:
# Stem every token, keeping the original order.
stemmed_words = [stemmer.stem(word) for word in words]

In [None]:
stemmed_words

## Lemmatizing

Like stemming, lemmatizing reduces words to their core meaning, but it will give you a complete English word that makes sense on its own instead of just a fragment of a word like 'discoveri'.

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [None]:
lemmatizer.lemmatize("discoveries")

In [None]:
string_for_lemmatizing = """The students like NLP experiments."""

In [None]:
words = word_tokenize(string_for_lemmatizing)

In [None]:
words

In [None]:
# Lemmatize every token, keeping the original order.
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [None]:
lemmatized_words

# Example of Analyzing Text

1. Building the word dictionary, 
2. Visualizing the lengths of newsgroups messages,
3. Visualizing the word frequency distribution.



In [None]:
# Importing all the related packages
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

In [None]:
# This example uses the 20-Newsgroups dataset (http://qwone.com/~jason/20Newsgroups/).
# This version of the dataset contains about 11k newsgroups posts from 20 different topics.
# It is downloaded from the URL below.
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'

raw_data = pd.read_json(NEWSGROUPS_URL)
# List the distinct topic names found in the data
print(raw_data['target_names'].unique())

In [None]:
raw_data

In [None]:
# Use the newsgroup posts labelled rec.sport.baseball as the running example.
# The two columns are fetched once, then rows are looked up by i exactly as in
# an explicit index loop, so the order of `text` is unchanged.
target_names = raw_data['target_names']
contents = raw_data['content']
text = [
  contents[i]
  for i in range(len(contents))
  if target_names[i] == 'rec.sport.baseball'
]

In [None]:
len(text)

In [None]:
text[0]

## Preprocessing and Tokenization

In [None]:
# Tokenizing
tokenized_text = []
for sentence in text:
  tokenized_text.append(word_tokenize(sentence))

In [None]:
tokenized_text[0]

## Filtering Stop Words and Punctuations, and Lemmatizing

In [None]:
# Define the stop word set
stop_words = stopwords.words("english")
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'line', "'s", "n't", "'d"])

In [None]:
# Define the punctuation set
# string.punctuation already contains '*', '/', '\', '_' and '-', so appending
# them again would only add duplicate characters.
punctuations = string.punctuation

In [None]:
# Define the lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Build one list of cleaned tokens per post:
#   - drop tokens containing anything other than letters
#     (word.isalpha() is True only if the string contains letters only),
#   - lower-case and lemmatize the rest,
#   - drop lemmas that are stop words.
# A set gives constant-time membership tests, and each word is lemmatized once.
stop_word_set = set(stop_words)

filtered_text = []
for sent in tokenized_text:
  filtered_list = []
  for word in sent:
    if not word.isalpha():
      continue
    lemma = lemmatizer.lemmatize(word.lower())
    if lemma not in stop_word_set:
      filtered_list.append(lemma)
  filtered_text.append(filtered_list)

In [None]:
filtered_text[0]

## Building Word Dictionary

This step will build the word dictionary and then you could use the dictionary to transform text into ids or one-hot vectors for future tasks like classification

In [None]:
# Build the word dictionary
word2id = {}
id2word = {}

word_id = 0
for sent in filtered_text:
  for word in sent:
    if word not in word2id:
      word2id[word] = word_id
      id2word[word_id] = word
      word_id += 1

In [None]:
# the number of different word
len(word2id)

In [None]:
word2id['tell']

In [None]:
# representing text in ids
word_ids = []
for word in filtered_text[10]:
  word_ids.append(word2id[word])

In [None]:
word_ids

In [None]:
# translating ids back to text
sentence = []
for word_id in word_ids:
  sentence.append(id2word[word_id])

In [None]:
sentence

## Visualizing News Lengths

In [None]:
# Number of kept tokens in each post
news_length = [len(sent) for sent in filtered_text]

In [None]:
# Average length: mean of the values in news_length
sum(news_length)/len(news_length)

In [None]:
# visualizing the lengths distribution
# The axes are labelled and titled so the figure can be read on its own;
# plt.show() keeps the raw return value of hist() out of the cell output.
fig, ax = plt.subplots()
ax.hist(news_length)
ax.set_xlabel('Number of tokens per message (after filtering)')
ax.set_ylabel('Number of messages')
ax.set_title('Distribution of message lengths')
plt.show()

## Visualizing the Word Frequency

In [None]:
from collections import Counter

# Flatten the per-post token lists into one list, then count each word.
words = [word for sent in filtered_text for word in sent]
word_counts = Counter(words)

In [None]:
# checking the 20 most common words in the counted messages
# each entry is a (word, count) pair, ordered from most to least frequent
word_counts.most_common(20)

In [None]:
# Checking the 20 least common words in the counted messages
# most_common() sorts from most to least frequent, so the last 20 entries are the rarest
word_counts.most_common(len(word_counts))[-20:]

In [None]:
# Plot the word cloud image
from wordcloud import WordCloud
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Size each word by its exact count in `words`. Joining the tokens into one
# string and calling generate() would make WordCloud tokenize the text again
# and apply its own stop word and collocation handling on top of the
# preprocessing already done above.
wordcloud.generate_from_frequencies(Counter(words))
wordcloud.to_image()

# Practice

Let's take a look at news that belongs to other categories (e.g., sci.med).

1. How many distinct words are there in messages that belong to other categories (e.g., sci.med)?
2. What are the average lengths of those messages? Are the length distributions different from those of the baseball messages?
3. What are the most common and least common words in messages that belong to other categories? Are they different from those in the baseball messages?



In [None]:
# Collect the content of every post labelled sci.med
med_text = [
  raw_data['content'][i]
  for i in range(len(raw_data['content']))
  if raw_data['target_names'][i] == 'sci.med'
]

In [None]:
len(med_text)