# LIS 875 Text Mining: Week 05 (processing text data)
* Python String and text basics
* Regular Expression
* Using Spacy for text normalization & parsing

## Python String and Text Basics

In [1]:
# define a text variable
# either ' or " are fine

text = "Welcome to LIS 875!"
text

'Welcome to LIS 875!'

In [2]:
# a string is just a sequence of characters (letters, digits, whitespace, punctuation, ...)
# list() walks over the string and collects each character into a list
characters = list(text)
characters

['W',
 'e',
 'l',
 'c',
 'o',
 'm',
 'e',
 ' ',
 't',
 'o',
 ' ',
 'L',
 'I',
 'S',
 ' ',
 '8',
 '7',
 '5',
 '!']

In [3]:
# lowercase
text.lower()

'welcome to lis 875!'

In [4]:
# uppercase
text.upper()

'WELCOME TO LIS 875!'

In [5]:
# How to do title case? I can't remember! Let's just Google it!
text.title()

'Welcome To Lis 875!'

In [6]:
# Tip: code completion helps a lot (at least you don't need to remember every function name).
# If you are using a plain Jupyter notebook, you may need to press Tab to activate the suggestions.

# In Google Colab, typing a . after text activates the suggestions.
# Appending ? to a name (as below) shows its documentation.
text.title?

In [None]:
# Tip: use ? for documentation

In [7]:
# concatenate strings
'Welcome' + 'to' + 'LIS' + '501' + '!'

'WelcometoLIS501!'

In [8]:
# concatenate a list of strings with a separator
' '.join(['Welcome', 'to', 'LIS', '875', '!'])

'Welcome to LIS 875 !'

In [9]:
# repeat a string
'abc'*3

'abcabcabc'

In [10]:
text

'Welcome to LIS 875!'

In [11]:
# concat string and other objects -- casting them to string first (otherwise it throws an exception)
text + str(12)

'Welcome to LIS 875!12'

In [12]:
# length of a string (how many characters)
print(text)
len(text)

Welcome to LIS 875!


19

In [13]:
# get characters from a text
text[7]

' '

In [14]:
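# string slicing: text[start:end] gives the characters from index start up to, but not including, end
# (valid indices run from -len(text) to len(text)-1)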
text[0:3]

'Wel'

In [15]:
# a negative index counts from the end: -3 is equivalent to len(text) - 3, so text[0:-3] drops the last 3 characters
text[0:-3]

'Welcome to LIS 8'

In [16]:
# split text by a separator
text.split()

['Welcome', 'to', 'LIS', '875!']

## In-class Exercise 1

In [17]:
name = 'Peter Pan'

In [18]:
# Class Exercise: get first & last name initials
# For example, if name = 'Peter Pan', you should get 'PP'

# step 1: split it
# you should aim to get [ 'First', 'Last' ]
name.split()

['Peter', 'Pan']

In [19]:
# step 2: get the first letter of each part
# you should aim to get [ 'F', 'L' ]
[n[0] for n in name.split()]

['P', 'P']

In [20]:
# step 3: join the first letters into a single string
# for a name like 'First Last' the result is 'FL'

''.join(part[0] for part in name.split())

'PP'

## Regular Expression

Just some basic examples; a more complete tutorial can be found at: https://www.w3schools.com/python/python_regex.asp

In [21]:
import re

text = "Welcome to LIS 875!"

# [A-Z] matches any single uppercase letter
# [A-Z]+ matches one or more consecutive uppercase letters
# find all substrings that consist entirely of uppercase letters
re.findall("[A-Z]+", text)

['W', 'LIS']

In [22]:
# [0-9] matches any single digit
# [0-9]+ matches one or more consecutive digits
# find all substrings that consist entirely of digits

re.findall("[0-9]+", text)

['875']

In [23]:
# . matches any character (except a newline, by default)
# find all substrings that start with a 'W' and end with an 'o'; "greedy" match: matches as much text as possible

print(text)
re.findall("W.+o", text)

Welcome to LIS 875!


['Welcome to']

In [24]:
# find all substrings that start with a 'W' and end with an 'o'; "reluctant" (non-greedy) match: matches as little text as possible

re.findall("W.+?o", text)

['Welco']

In [25]:
# \s matches any whitespace character
# split on runs of one or more whitespace characters
# (the pattern is a raw string, r"...", so Python does not try to interpret \s as a string escape)
re.split(r"\s+", text)

['Welcome', 'to', 'LIS', '875!']

In [26]:
# replace each run of one or more whitespace characters with [WHITESPACE]
# (the pattern is a raw string, r"...", so Python does not try to interpret \s as a string escape)

re.sub(r"\s+", '[WHITESPACE]', text)

'Welcome[WHITESPACE]to[WHITESPACE]LIS[WHITESPACE]875!'

## Using Spacy for Text Normalization and Parsing
- Sentence segmentation, tokenization, stop words removal, lemmatization (stemming)
- POS tagging, NP chunking, Named Entity Recognition
- https://spacy.io/

In [1]:
# make sure the required python packages are installed

# install nltk (we'll use 3.6.7 in Fall 2022)
!pip install nltk==3.6.7 --upgrade

# install spacy (we'll use 3.2.1 in Fall 2022)
!pip install spacy==3.2.1 --upgrade

# download the spacy en_core_web_sm model (3.2.0 version)
!python -m spacy download en_core_web_sm-3.2.0 --direct

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 18.5 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.4.0
    Uninstalling en-core-web-sm-3.4.0:
      Successfully uninstalled en-core-web-sm-3.4.0
Successfully installed en-core-web-sm-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
#import spacy and en_core_web_sm
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

In [3]:
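# raw text (note: from here on, `text` is the spaCy Doc returned by nlp(raw), no longer a plain Python string)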
raw = "Netanyahu's visit was cut short by reports late Sunday that a rocket was fired from Gaza into central Israel, wounding at least seven people. Following criticism from political opponents over what they consider the prime minister's unclear stance toward the militant political group, Israel responded with a series of strikes into Gaza against Hamas, which largely governs the contested strip. President Donald Trump tacitly endorsed the strike following his meetings with Netanyahu, calling the Hamas attack \"despicable.\""

text = nlp(raw)

In [4]:
# printing the parsed document shows its original text
print(text)

Netanyahu's visit was cut short by reports late Sunday that a rocket was fired from Gaza into central Israel, wounding at least seven people. Following criticism from political opponents over what they consider the prime minister's unclear stance toward the militant political group, Israel responded with a series of strikes into Gaza against Hamas, which largely governs the contested strip. President Donald Trump tacitly endorsed the strike following his meetings with Netanyahu, calling the Hamas attack "despicable."


In [5]:
# sentence segmentation
sentences = list(text.sents)
sentences[0]

Netanyahu's visit was cut short by reports late Sunday that a rocket was fired from Gaza into central Israel, wounding at least seven people.

In [6]:
# tokenization

[token.text for token in sentences[0]] # tokens in the first sentence

['Netanyahu',
 "'s",
 'visit',
 'was',
 'cut',
 'short',
 'by',
 'reports',
 'late',
 'Sunday',
 'that',
 'a',
 'rocket',
 'was',
 'fired',
 'from',
 'Gaza',
 'into',
 'central',
 'Israel',
 ',',
 'wounding',
 'at',
 'least',
 'seven',
 'people',
 '.']

In [7]:
# In case you are not familiar with list comprehensions, here are two equivalent ways
# to collect the text of every token in every sentence.

# 1) nested for loops
tokens = []
for sent in text.sents:
    for token in sent:
        tokens.append(token.text)

# 2) list comprehension (the for clauses appear in the same order as the nested loops above)
tokens_from_comprehension = [token.text for sent in text.sents for token in sent]

# sanity check: both approaches must produce exactly the same list
assert tokens == tokens_from_comprehension

tokens_from_comprehension

['Netanyahu',
 "'s",
 'visit',
 'was',
 'cut',
 'short',
 'by',
 'reports',
 'late',
 'Sunday',
 'that',
 'a',
 'rocket',
 'was',
 'fired',
 'from',
 'Gaza',
 'into',
 'central',
 'Israel',
 ',',
 'wounding',
 'at',
 'least',
 'seven',
 'people',
 '.',
 'Following',
 'criticism',
 'from',
 'political',
 'opponents',
 'over',
 'what',
 'they',
 'consider',
 'the',
 'prime',
 'minister',
 "'s",
 'unclear',
 'stance',
 'toward',
 'the',
 'militant',
 'political',
 'group',
 ',',
 'Israel',
 'responded',
 'with',
 'a',
 'series',
 'of',
 'strikes',
 'into',
 'Gaza',
 'against',
 'Hamas',
 ',',
 'which',
 'largely',
 'governs',
 'the',
 'contested',
 'strip',
 '.',
 'President',
 'Donald',
 'Trump',
 'tacitly',
 'endorsed',
 'the',
 'strike',
 'following',
 'his',
 'meetings',
 'with',
 'Netanyahu',
 ',',
 'calling',
 'the',
 'Hamas',
 'attack',
 '"',
 'despicable',
 '.',
 '"']

In [8]:
# all tokens in the text

[token.text for token in text]

['Netanyahu',
 "'s",
 'visit',
 'was',
 'cut',
 'short',
 'by',
 'reports',
 'late',
 'Sunday',
 'that',
 'a',
 'rocket',
 'was',
 'fired',
 'from',
 'Gaza',
 'into',
 'central',
 'Israel',
 ',',
 'wounding',
 'at',
 'least',
 'seven',
 'people',
 '.',
 'Following',
 'criticism',
 'from',
 'political',
 'opponents',
 'over',
 'what',
 'they',
 'consider',
 'the',
 'prime',
 'minister',
 "'s",
 'unclear',
 'stance',
 'toward',
 'the',
 'militant',
 'political',
 'group',
 ',',
 'Israel',
 'responded',
 'with',
 'a',
 'series',
 'of',
 'strikes',
 'into',
 'Gaza',
 'against',
 'Hamas',
 ',',
 'which',
 'largely',
 'governs',
 'the',
 'contested',
 'strip',
 '.',
 'President',
 'Donald',
 'Trump',
 'tacitly',
 'endorsed',
 'the',
 'strike',
 'following',
 'his',
 'meetings',
 'with',
 'Netanyahu',
 ',',
 'calling',
 'the',
 'Hamas',
 'attack',
 '"',
 'despicable',
 '.',
 '"']

In [9]:
# Lemmatization: token 13 of the first sentence is the second 'was'; its lemma (base form) is 'be'
sentences[0][13].lemma_

'be'

In [10]:
text

Netanyahu's visit was cut short by reports late Sunday that a rocket was fired from Gaza into central Israel, wounding at least seven people. Following criticism from political opponents over what they consider the prime minister's unclear stance toward the militant political group, Israel responded with a series of strikes into Gaza against Hamas, which largely governs the contested strip. President Donald Trump tacitly endorsed the strike following his meetings with Netanyahu, calling the Hamas attack "despicable."

In [11]:
# a list of tokens in the first sentence and if they are stop words
[ (token.text, token.is_stop) for token in sentences[0]]

[('Netanyahu', False),
 ("'s", True),
 ('visit', False),
 ('was', True),
 ('cut', False),
 ('short', False),
 ('by', True),
 ('reports', False),
 ('late', False),
 ('Sunday', False),
 ('that', True),
 ('a', True),
 ('rocket', False),
 ('was', True),
 ('fired', False),
 ('from', True),
 ('Gaza', False),
 ('into', True),
 ('central', False),
 ('Israel', False),
 (',', False),
 ('wounding', False),
 ('at', True),
 ('least', True),
 ('seven', False),
 ('people', False),
 ('.', False)]

In [12]:
# only listing the tokens that are not stop words in sentences[0]

[token.text for token in sentences[0] if not token.is_stop]

['Netanyahu',
 'visit',
 'cut',
 'short',
 'reports',
 'late',
 'Sunday',
 'rocket',
 'fired',
 'Gaza',
 'central',
 'Israel',
 ',',
 'wounding',
 'seven',
 'people',
 '.']

In [13]:
# only listing the tokens that are not stop words or punctuation
[token.text for token in sentences[0] if not token.is_stop and not token.is_punct]

['Netanyahu',
 'visit',
 'cut',
 'short',
 'reports',
 'late',
 'Sunday',
 'rocket',
 'fired',
 'Gaza',
 'central',
 'Israel',
 'wounding',
 'seven',
 'people']

In [14]:
# list the lowercased tokens and lemma
[ (token.text, token.lower_, token.lemma_) for token in sentences[0] if not token.is_stop and not token.is_punct]

[('Netanyahu', 'netanyahu', 'Netanyahu'),
 ('visit', 'visit', 'visit'),
 ('cut', 'cut', 'cut'),
 ('short', 'short', 'short'),
 ('reports', 'reports', 'report'),
 ('late', 'late', 'late'),
 ('Sunday', 'sunday', 'Sunday'),
 ('rocket', 'rocket', 'rocket'),
 ('fired', 'fired', 'fire'),
 ('Gaza', 'gaza', 'Gaza'),
 ('central', 'central', 'central'),
 ('Israel', 'israel', 'Israel'),
 ('wounding', 'wounding', 'wound'),
 ('seven', 'seven', 'seven'),
 ('people', 'people', 'people')]

In [15]:
# just in case you are interested in using Porter stemming (from NLTK) next to spaCy's lemmas

# import only the name that is used, instead of `import *`, so it is clear where it comes from
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# (original text, lowercased text, spaCy lemma, Porter stem) for each token of the second sentence
# that is neither a stop word nor punctuation
[ (token.text, token.lower_, token.lemma_, porter.stem(token.text)) for token in sentences[1] if not token.is_stop and not token.is_punct]

[('Following', 'following', 'follow', 'follow'),
 ('criticism', 'criticism', 'criticism', 'critic'),
 ('political', 'political', 'political', 'polit'),
 ('opponents', 'opponents', 'opponent', 'oppon'),
 ('consider', 'consider', 'consider', 'consid'),
 ('prime', 'prime', 'prime', 'prime'),
 ('minister', 'minister', 'minister', 'minist'),
 ('unclear', 'unclear', 'unclear', 'unclear'),
 ('stance', 'stance', 'stance', 'stanc'),
 ('militant', 'militant', 'militant', 'milit'),
 ('political', 'political', 'political', 'polit'),
 ('group', 'group', 'group', 'group'),
 ('Israel', 'israel', 'Israel', 'israel'),
 ('responded', 'responded', 'respond', 'respond'),
 ('series', 'series', 'series', 'seri'),
 ('strikes', 'strikes', 'strike', 'strike'),
 ('Gaza', 'gaza', 'Gaza', 'gaza'),
 ('Hamas', 'hamas', 'Hamas', 'hama'),
 ('largely', 'largely', 'largely', 'larg'),
 ('governs', 'governs', 'govern', 'govern'),
 ('contested', 'contested', 'contest', 'contest'),
 ('strip', 'strip', 'strip', 'strip')]

In [16]:
# print each token and its part-of-speech (POS) tags in sentences[0]

[(token.text, token.tag_) for token in sentences[0]]

[('Netanyahu', 'NNP'),
 ("'s", 'POS'),
 ('visit', 'NN'),
 ('was', 'VBD'),
 ('cut', 'VBN'),
 ('short', 'JJ'),
 ('by', 'IN'),
 ('reports', 'NNS'),
 ('late', 'JJ'),
 ('Sunday', 'NNP'),
 ('that', 'IN'),
 ('a', 'DT'),
 ('rocket', 'NN'),
 ('was', 'VBD'),
 ('fired', 'VBN'),
 ('from', 'IN'),
 ('Gaza', 'NNP'),
 ('into', 'IN'),
 ('central', 'JJ'),
 ('Israel', 'NNP'),
 (',', ','),
 ('wounding', 'VBG'),
 ('at', 'IN'),
 ('least', 'RBS'),
 ('seven', 'CD'),
 ('people', 'NNS'),
 ('.', '.')]

In [17]:
# list each noun phrase (NP) in sentences[0] with its starting and ending token index
# (the end index is exclusive: "Netanyahu's visit" covers tokens 0, 1 and 2)

[(chunk.text, chunk.start, chunk.end) for chunk in sentences[0].noun_chunks]

[("Netanyahu's visit", 0, 3),
 ('reports', 7, 8),
 ('a rocket', 11, 13),
 ('Gaza', 16, 17),
 ('central Israel', 18, 20),
 ('at least seven people', 22, 26)]

In [18]:
# print each entity, its entity type, and its starting & ending token index in sentences[0] (the end index is exclusive)
# GPE stands for Geo-Political Entity (countries, cities, states)
# CARDINAL marks numerals that do not fall under another entity type

[(ent.text, ent.label_, ent.start, ent.end ) for ent in sentences[0].ents]

[('Netanyahu', 'PERSON', 0, 1),
 ('late Sunday', 'DATE', 8, 10),
 ('Gaza', 'GPE', 16, 17),
 ('Israel', 'GPE', 19, 20),
 ('at least seven', 'CARDINAL', 22, 25)]

In [19]:
# you can also get the IOB-style entity tags for each token
# (B = beginning of an entity, I = inside an entity, O = outside any entity)
[(token.text, token.ent_iob_, token.ent_type_) for token in sentences[0]]

[('Netanyahu', 'B', 'PERSON'),
 ("'s", 'O', ''),
 ('visit', 'O', ''),
 ('was', 'O', ''),
 ('cut', 'O', ''),
 ('short', 'O', ''),
 ('by', 'O', ''),
 ('reports', 'O', ''),
 ('late', 'B', 'DATE'),
 ('Sunday', 'I', 'DATE'),
 ('that', 'O', ''),
 ('a', 'O', ''),
 ('rocket', 'O', ''),
 ('was', 'O', ''),
 ('fired', 'O', ''),
 ('from', 'O', ''),
 ('Gaza', 'B', 'GPE'),
 ('into', 'O', ''),
 ('central', 'O', ''),
 ('Israel', 'B', 'GPE'),
 (',', 'O', ''),
 ('wounding', 'O', ''),
 ('at', 'B', 'CARDINAL'),
 ('least', 'I', 'CARDINAL'),
 ('seven', 'I', 'CARDINAL'),
 ('people', 'O', ''),
 ('.', 'O', '')]

## In-class Exercise 2
* Counting the most frequent nouns (NN, NNP, NNS, or NNPS) in the provided text.

In [20]:
raw = "Netanyahu's visit was cut short by reports late Sunday that a rocket was fired from Gaza into central Israel, wounding at least seven people. Following criticism from political opponents over what they consider the prime minister's unclear stance toward the militant political group, Israel responded with a series of strikes into Gaza against Hamas, which largely governs the contested strip. President Donald Trump tacitly endorsed the strike following his meetings with Netanyahu, calling the Hamas attack \"despicable.\""

text = nlp(raw)

In [21]:
from collections import Counter

# your solution

# 1. write a list comprehension that keeps the lowercased text of every token tagged as a noun
# 2. pass that list to Counter(...) and call .most_common() to rank the nouns by frequency
# note: counting lowercased text keeps inflected forms apart ('strike' and 'strikes' are counted separately)

NOUN_TAGS = ('NN', 'NNP', 'NNS', 'NNPS')
noun_texts = [t.lower_ for t in text if t.tag_ in NOUN_TAGS]
Counter(noun_texts).most_common()

[('netanyahu', 2),
 ('gaza', 2),
 ('israel', 2),
 ('hamas', 2),
 ('visit', 1),
 ('reports', 1),
 ('sunday', 1),
 ('rocket', 1),
 ('people', 1),
 ('criticism', 1),
 ('opponents', 1),
 ('minister', 1),
 ('stance', 1),
 ('group', 1),
 ('series', 1),
 ('strikes', 1),
 ('strip', 1),
 ('president', 1),
 ('donald', 1),
 ('trump', 1),
 ('strike', 1),
 ('meetings', 1),
 ('attack', 1)]