# Tokenization

## Sentence tokenization

In [3]:
# One-time setup: uncomment and run to download the NLTK 'punkt' tokenizer data.
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jiwanhwang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from nltk import sent_tokenize

# Sample passage taken from Harvard Business Review, Jan-Feb 2020.
# It is assembled from adjacent string literals (one per sentence) instead of
# backslash continuations inside a single literal; the resulting string is the same.
# NOTE: the misspellings inside the text are left untouched because the
# recorded outputs of this notebook contain them.
text_sample = (
    'Our research focuses on a new method for assessing and measuring organizationl culture. '
    'We used big data processing to mine the ubiquitous digital traces of culture in electronic communications. '
    'By studying the language employees use in these communications, we can measure how culture actually influences thier thoughts and behavioral at work.'
)

# Split the passage into a list of sentence strings.
sentences = sent_tokenize(text_sample)
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['Our research focuses on a new method for assessing and measuring organizationl culture.', 'We used big data processing to mine the ubiquitous digital traces of culture in electronic communications.', 'By studying the language employees use in these communications, we can measure how culture actually influences thier thoughts and behavioral at work.']


## Word tokenization
### Splits on whitespace and punctuation ('.', ',', etc.)

In [5]:
from nltk import word_tokenize

# Tokenize a single sentence into a flat list of tokens; punctuation such as
# the final '.' comes back as its own token.
sentence = "Our research focuses on a new method for assessing and measuring organizationl culture."
words = word_tokenize(sentence)
# Show the container type and token count, then the tokens themselves.
print(type(words), len(words))
print(words)

<class 'list'> 14
['Our', 'research', 'focuses', 'on', 'a', 'new', 'method', 'for', 'assessing', 'and', 'measuring', 'organizationl', 'culture', '.']


## Combined (sent + word tokenization)

In [6]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    """Tokenize ``text`` into words, grouped by sentence.

    The text is first split into sentences with ``sent_tokenize``; each
    sentence is then passed to ``word_tokenize``.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list with one entry per sentence, each entry being the result of
        ``word_tokenize`` for that sentence.
    """
    return list(map(word_tokenize, sent_tokenize(text)))

In [7]:
# Tokenize the whole sample passage: the result holds one token list per sentence.
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['Our', 'research', 'focuses', 'on', 'a', 'new', 'method', 'for', 'assessing', 'and', 'measuring', 'organizationl', 'culture', '.'], ['We', 'used', 'big', 'data', 'processing', 'to', 'mine', 'the', 'ubiquitous', 'digital', 'traces', 'of', 'culture', 'in', 'electronic', 'communications', '.'], ['By', 'studying', 'the', 'language', 'employees', 'use', 'in', 'these', 'communications', ',', 'we', 'can', 'measure', 'how', 'culture', 'actually', 'influences', 'thier', 'thoughts', 'and', 'behavioral', 'at', 'work', '.']]


# Stop words removal

In [8]:
# One-time setup: uncomment and run to download the NLTK 'stopwords' corpus
# (the call needs the name `nltk` to be imported first).
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiwanhwang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Import `nltk` in this cell: the earlier cells only use `from nltk import ...`,
# which does not bind the name `nltk`, so a fresh kernel would raise NameError.
import nltk

# Load the English stop-word list once and reuse it for both prints.
english_stopwords = nltk.corpus.stopwords.words('english')
print('Number of stop words (English):', len(english_stopwords))
print(english_stopwords[:20])

Number of stop words (English): 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [11]:
# Import `nltk` here so the cell runs on its own; `from nltk import ...` in
# other cells does not bind the name `nltk`.
import nltk

stopwords = nltk.corpus.stopwords.words('english')
# A set gives constant-time membership tests instead of scanning the list
# once per token.
stopword_set = set(stopwords)

# Lower-case every token and keep the ones that are not stop words.
# Punctuation tokens are not in the stop-word list, so they are kept.
# Comprehension variables do not leak into the notebook namespace, so names
# such as `sentence` defined by other cells are left untouched.
all_tokens = [
    [
        lowered
        for lowered in (token.lower() for token in sentence_tokens)
        if lowered not in stopword_set
    ]
    for sentence_tokens in word_tokens
]

print(all_tokens)

[['research', 'focuses', 'new', 'method', 'assessing', 'measuring', 'organizationl', 'culture', '.'], ['used', 'big', 'data', 'processing', 'mine', 'ubiquitous', 'digital', 'traces', 'culture', 'electronic', 'communications', '.'], ['studying', 'language', 'employees', 'use', 'communications', ',', 'measure', 'culture', 'actually', 'influences', 'thier', 'thoughts', 'behavioral', 'work', '.']]


# Stemming & Lemmatization

## Stemmer

In [12]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Stem one family of inflected forms per line so the results can be compared
# side by side (the recorded output shows the comparative/superlative pairs
# are not reduced to a common stem).
print(*map(stemmer.stem, ('working', 'works', 'worked')))
print(*map(stemmer.stem, ('amusing', 'amuses', 'amused')))
print(*map(stemmer.stem, ('happier', 'happiest')))
print(*map(stemmer.stem, ('fancier', 'fanciest')))

work work work
amus amus amus
happy happiest
fant fanciest


## Lemmatizer

In [14]:
# One-time setup: uncomment and run to download the NLTK 'wordnet' corpus
# (the call needs the name `nltk` to be imported first).
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jiwanhwang/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [15]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Lemmatize one family of forms per line, passing the part-of-speech tag
# explicitly ('v' for the verb forms, 'a' for the adjective forms).
print(*(lemma.lemmatize(form, 'v') for form in ('amusing', 'amuses', 'amused')))
print(*(lemma.lemmatize(form, 'a') for form in ('happier', 'happiest')))
print(*(lemma.lemmatize(form, 'a') for form in ('fancier', 'fanciest')))

amuse amuse amuse
happy happy
fancy fancy


# Bag of Words

### CountVectorizer and TfidfVectorizer

## Sparse Matrix

### COO (Coordinate)

In [19]:
# Practice example: a small 2x3 dense matrix with three non-zero entries.
import numpy as np

dense = np.array(
    [
        [3, 0, 1],
        [0, 2, 0],
    ]
)

In [20]:
from scipy import sparse

# COO format: every non-zero value is stored with its (row, column) coordinate.
data = np.array([3, 1, 2])  # non-zero values

row_pos = np.array([0, 0, 1])  # row index of each value in `data`
col_pos = np.array([0, 2, 1])  # column index of each value in `data`

# Pass the shape explicitly: when it is omitted, the shape is inferred from
# the largest row/column index, which would silently drop trailing all-zero
# rows or columns.
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)), shape=(2, 3))

In [21]:
# Expand the COO matrix back into a dense array to inspect the result.
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

### CSR (Compressed Sparse Row)

In [22]:
from scipy import sparse

# Dense 6x6 reference matrix that both sparse matrices below must reproduce.
dense2 = np.array([[0,0,1,0,0,5],
                   [1,4,0,3,2,5],
                   [0,6,0,3,0,0],
                   [2,0,0,0,0,0],
                   [0,0,0,7,0,8],
                   [1,0,0,0,0,0]])

# Non-zero values of dense2, listed row by row from left to right.
data2 = np.array([1,5,1,4,3,2,5,6,3,2,7,8,1])

# COO: one (row, column) coordinate per value in data2.
# NOTE: row_pos, col_pos and sparse_coo rebind the names used by the earlier
# COO cell, so they refer to the 6x6 example from here on.
row_pos = np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col_pos = np.array([2,5,0,1,3,4,5,1,3,0,3,5,0])

# The shape is passed explicitly so it cannot be mis-inferred from the largest
# index when trailing rows or columns are entirely zero.
sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)), shape=dense2.shape)

# CSR replaces the per-value row indices with row pointers: row i owns
# data2[row_pos_ind[i]:row_pos_ind[i + 1]], so there is one entry per row
# plus a final entry equal to the number of stored values.
row_pos_ind = np.array([0,2,7,9,10,12,13])

## CSR
sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind), shape=dense2.shape)

# Both hand-built representations must round-trip to the dense reference.
assert (sparse_coo.toarray() == dense2).all()
assert (sparse_csr.toarray() == dense2).all()

In [24]:
# Print both matrices as dense arrays so the COO and CSR results can be
# compared directly.
print("COO")
print(sparse_coo.toarray())
print("CSR")
print(sparse_csr.toarray())

COO
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
CSR
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


#### Practical use: build sparse matrices directly from a dense array

In [25]:
# Dense 6x6 input matrix for the direct conversions below.
dense3 = np.array([
    [0, 0, 1, 0, 0, 5],
    [1, 4, 0, 3, 2, 5],
    [0, 6, 0, 3, 0, 0],
    [2, 0, 0, 0, 0, 0],
    [0, 0, 0, 7, 0, 8],
    [1, 0, 0, 0, 0, 0],
])

# The constructors accept the dense array directly, so the value and index
# arrays do not have to be written out by hand.
csr = sparse.csr_matrix(dense3)
coo = sparse.coo_matrix(dense3)

In [26]:
# Printing the sparse matrix lists each stored entry as "(row, col) value".
print(csr)

  (0, 2)	1
  (0, 5)	5
  (1, 0)	1
  (1, 1)	4
  (1, 3)	3
  (1, 4)	2
  (1, 5)	5
  (2, 1)	6
  (2, 3)	3
  (3, 0)	2
  (4, 3)	7
  (4, 5)	8
  (5, 0)	1
