# NLP with SKL

## Vectorize Documents (without scikit-learn)

In [1]:
import re

### Format Documents

In [2]:
# make some documents to work with
# each string is one document; the stray punctuation (',', ';', '+', '...', '-')
# is left in on purpose so the formatting step has something to strip

X = [
    'the scikit-learn, is great',
    'much better API for; the NLP than the spark MLlib',
    'we are+ learning NLP in the sci-kit',
    'is my... punctuation, is. terrible;'
]

X

['the scikit-learn, is great',
 'much better API for; the NLP than the spark MLlib',
 'we are+ learning NLP in the sci-kit',
 'is my... punctuation, is. terrible;']

In [3]:
# compile scikit-learn like formatter regex
# the negated class matches any single character that is NOT a word character,
# whitespace, or an apostrophe -- i.e. the punctuation to be removed

formatter_pattern = re.compile(r'[^\w\s\']')

In [4]:
# show scikit-learn like formatter in action:
# delete every character the formatter matches, then lower-case the document

X_formatted = [
    formatter_pattern.sub('', raw_document).lower()
    for raw_document in X
]
X_formatted

['the scikitlearn is great',
 'much better api for the nlp than the spark mllib',
 'we are learning nlp in the scikit',
 'is my punctuation is terrible']

### Tokenize Formatted Documents

In [5]:
# compile scikit-learn like tokenizer regex
# a token is a run of two or more word characters delimited by word boundaries,
# so single-character words are not matched

tokenizer_pattern = re.compile(r'(?u)\b\w\w+\b')

In [6]:
# show scikit-learn like tokenization of the formatted documents:
# each document becomes the list of tokens the tokenizer regex finds in it

X_tokenized = list(map(tokenizer_pattern.findall, X_formatted))
X_tokenized

[['the', 'scikitlearn', 'is', 'great'],
 ['much',
  'better',
  'api',
  'for',
  'the',
  'nlp',
  'than',
  'the',
  'spark',
  'mllib'],
 ['we', 'are', 'learning', 'nlp', 'in', 'the', 'scikit'],
 ['is', 'my', 'punctuation', 'is', 'terrible']]

### Hash tokenized documents

In [7]:
# make a function to hash documents

def hash_tokenized(tokenized_documents):
    """Encode tokenized documents as lists of integer token ids.

    Despite the name, no hash function is involved: each distinct token is
    given the next unused integer id, starting at 0, in order of first
    appearance across all documents. Two different tokens therefore never
    share an id.

    Parameters
    ----------
    tokenized_documents : iterable of iterable
        One sequence of tokens per document. Tokens are used as dict keys,
        so they must be hashable.

    Returns
    -------
    hashed_documents : list of list of int
        The documents with every token replaced by its id, in the same order.
    vocabulary : dict
        Mapping of token -> id, in order of first appearance.
    max_idx : int
        The largest id assigned; -1 when no token was seen at all.
    """
    vocabulary = {}
    hashed_documents = []

    for document in tokenized_documents:
        # setdefault returns the id already stored for a known token; for a new
        # token it stores and returns len(vocabulary), which is evaluated before
        # the insertion and is therefore the next unused id.
        hashed_documents.append(
            [vocabulary.setdefault(token, len(vocabulary)) for token in document]
        )

    # ids run 0..len(vocabulary)-1, so the largest one is len - 1
    max_idx = len(vocabulary) - 1

    return hashed_documents, vocabulary, max_idx

In [8]:
# hash the tokenized documents
# unpacks, in order: the id-encoded documents, the token -> id vocabulary,
# and the largest id that was assigned
X_hashed, hashing_vocabulary, max_idx = hash_tokenized(X_tokenized)

In [9]:
X_hashed

[[0, 1, 2, 3],
 [4, 5, 6, 7, 0, 8, 9, 0, 10, 11],
 [12, 13, 14, 8, 15, 0, 16],
 [2, 17, 18, 2, 19]]

In [10]:
hashing_vocabulary

{'the': 0,
 'scikitlearn': 1,
 'is': 2,
 'great': 3,
 'much': 4,
 'better': 5,
 'api': 6,
 'for': 7,
 'nlp': 8,
 'than': 9,
 'spark': 10,
 'mllib': 11,
 'we': 12,
 'are': 13,
 'learning': 14,
 'in': 15,
 'scikit': 16,
 'my': 17,
 'punctuation': 18,
 'terrible': 19}

In [11]:
max_idx

19

### Convert hashed documents to matrix

In [12]:
def hashed_to_matrix(hashed_documents, max_idx):
    """Turn id-encoded documents into a matrix of per-document id counts.

    Parameters
    ----------
    hashed_documents : iterable of iterable of int
        One sequence of token ids per document.
    max_idx : int
        Largest token id; every row gets ``max_idx + 1`` columns.

    Returns
    -------
    list of list of int
        One row per document, where column ``j`` holds the number of times
        id ``j`` occurs in that document.
    """
    rows = []

    for token_ids in hashed_documents:
        # a fresh zero-filled row for each document, one slot per possible id
        counts = [0] * (max_idx + 1)

        for token_id in token_ids:
            counts[token_id] += 1

        rows.append(counts)

    return rows

In [13]:
# one row per document, one column per token id (0..max_idx); each cell is the
# number of times that id occurs in the document
hashed_to_matrix(X_hashed, max_idx)

[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]

In [14]:
hashing_vocabulary

{'the': 0,
 'scikitlearn': 1,
 'is': 2,
 'great': 3,
 'much': 4,
 'better': 5,
 'api': 6,
 'for': 7,
 'nlp': 8,
 'than': 9,
 'spark': 10,
 'mllib': 11,
 'we': 12,
 'are': 13,
 'learning': 14,
 'in': 15,
 'scikit': 16,
 'my': 17,
 'punctuation': 18,
 'terrible': 19}