# NLP with SKL

## Vectorize Documents (without scikit-learn)

In [1]:
import re

### Format Documents

In [2]:
# make some documents to work with

X = [
    'the scikit-learn, is great',
    'much better API for; the NLP than the spark MLlib',
    'we are+ learning NLP in the scikit-learn',
    'is my... punctuation, is. terrible;'
]

X

['the scikit-learn, is great',
 'much better API for; the NLP than the spark MLlib',
 'we are+ learning NLP in the scikit-learn',
 'is my... punctuation, is. terrible;']

In [3]:
# compile scikit-learn like formatter regex
# The character class matches any single character that is NOT a word
# character (\w), whitespace (\s) or an apostrophe. Substituting matches
# with '' therefore strips punctuation, including hyphens inside words.

formatter_pattern = re.compile(r'[^\w\s\']')

In [4]:
# show scikit-learn like formatter in action:
# drop every character matched by the formatter regex, then lowercase

X_formatted = [
    formatter_pattern.sub('', raw_text).lower()
    for raw_text in X
]
X_formatted

['the scikitlearn is great',
 'much better api for the nlp than the spark mllib',
 'we are learning nlp in the scikitlearn',
 'is my punctuation is terrible']

### Tokenize Formatted Documents

In [5]:
# compile scikit-learn like tokenizer regex
# Same pattern string that CountVectorizer reports as its `token_pattern`
# in the repr further down: a token is a run of two or more word characters
# delimited by word boundaries, so single-character words are not matched.

tokenizer_pattern = re.compile(r'(?u)\b\w\w+\b')

In [6]:
# show scikit-learn like tokenization of the formatted documents:
# each document becomes the list of all non-overlapping regex matches

X_tokenized = [tokenizer_pattern.findall(document) for document in X_formatted]
X_tokenized

[['the', 'scikitlearn', 'is', 'great'],
 ['much',
  'better',
  'api',
  'for',
  'the',
  'nlp',
  'than',
  'the',
  'spark',
  'mllib'],
 ['we', 'are', 'learning', 'nlp', 'in', 'the', 'scikitlearn'],
 ['is', 'my', 'punctuation', 'is', 'terrible']]

### Hash tokenized documents

In [7]:
# make a function to hash documents

def hash_tokenized(tokenized_documents):
    """Encode every token as an integer id assigned in order of first appearance.

    Despite the name, no hash function is involved: the first distinct token
    seen gets id 0, the next new token gets id 1, and so on. A token that
    repeats (within a document or across documents) reuses its existing id.

    Parameters
    ----------
    tokenized_documents : iterable of iterable of hashable
        One inner iterable of tokens per document.

    Returns
    -------
    hashed_documents : list of list of int
        The documents with each token replaced by its id, order preserved.
    vocabulary : dict
        Mapping from token to id.
    max_idx : int
        Largest id handed out; -1 when no token was seen at all.
    """
    vocabulary = {}
    hashed_documents = []

    for document in tokenized_documents:
        # `len(vocabulary)` is evaluated before `setdefault` inserts, so an
        # unseen token receives the next free id and a known token keeps its own.
        hashed_documents.append(
            [vocabulary.setdefault(token, len(vocabulary)) for token in document]
        )

    max_idx = len(vocabulary) - 1

    return hashed_documents, vocabulary, max_idx

In [8]:
# hash the tokenized documents
X_hashed, hashing_vocabulary, max_idx = hash_tokenized(X_tokenized)

In [9]:
X_hashed

[[0, 1, 2, 3],
 [4, 5, 6, 7, 0, 8, 9, 0, 10, 11],
 [12, 13, 14, 8, 15, 0, 1],
 [2, 16, 17, 2, 18]]

In [10]:
hashing_vocabulary

{'the': 0,
 'scikitlearn': 1,
 'is': 2,
 'great': 3,
 'much': 4,
 'better': 5,
 'api': 6,
 'for': 7,
 'nlp': 8,
 'than': 9,
 'spark': 10,
 'mllib': 11,
 'we': 12,
 'are': 13,
 'learning': 14,
 'in': 15,
 'my': 16,
 'punctuation': 17,
 'terrible': 18}

In [11]:
max_idx

18

### Convert hashed documents to matrix

In [12]:
def hashed_to_matrix(hashed_documents, max_idx):
    """Build a dense count matrix (list of rows) from id-encoded documents.

    Each row has ``max_idx + 1`` columns; column ``j`` of row ``i`` holds the
    number of times id ``j`` occurs in document ``i``.

    Parameters
    ----------
    hashed_documents : iterable of iterable of int
        One inner iterable of token ids per document.
    max_idx : int
        Largest id expected; fixes the row width.

    Returns
    -------
    list of list of int
        One count row per document, in input order.
    """
    width = max_idx + 1

    def count_ids(token_ids):
        # start from an all-zero row and bump the column of every id seen
        counts = [0] * width
        for token_id in token_ids:
            counts[token_id] += 1
        return counts

    return [count_ids(token_ids) for token_ids in hashed_documents]

In [13]:
hashed_to_matrix(X_hashed, max_idx)

[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]

In [14]:
hashing_vocabulary

{'the': 0,
 'scikitlearn': 1,
 'is': 2,
 'great': 3,
 'much': 4,
 'better': 5,
 'api': 6,
 'for': 7,
 'nlp': 8,
 'than': 9,
 'spark': 10,
 'mllib': 11,
 'we': 12,
 'are': 13,
 'learning': 14,
 'in': 15,
 'my': 16,
 'punctuation': 17,
 'terrible': 18}

## Vectorize Documents (with scikit-learn)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# create an instance of the CountVectorizer class

cv = CountVectorizer()

In [17]:
# fit it

cv.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
# see the vocabulary

cv.vocabulary_

{'the': 18,
 'scikit': 14,
 'learn': 7,
 'is': 6,
 'great': 4,
 'much': 10,
 'better': 2,
 'api': 0,
 'for': 3,
 'nlp': 12,
 'than': 17,
 'spark': 15,
 'mllib': 9,
 'we': 19,
 'are': 1,
 'learning': 8,
 'in': 5,
 'my': 11,
 'punctuation': 13,
 'terrible': 16}

In [19]:
# see the sparse matrix

X_sparse = cv.transform(X)
print(X_sparse)

  (0, 4)	1
  (0, 6)	1
  (0, 7)	1
  (0, 14)	1
  (0, 18)	1
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 9)	1
  (1, 10)	1
  (1, 12)	1
  (1, 15)	1
  (1, 17)	1
  (1, 18)	2
  (2, 1)	1
  (2, 5)	1
  (2, 7)	1
  (2, 8)	1
  (2, 12)	1
  (2, 14)	1
  (2, 18)	1
  (2, 19)	1
  (3, 6)	2
  (3, 11)	1
  (3, 13)	1
  (3, 16)	1


In [20]:
# see the dense matrix

X_dense = X_sparse.todense()
X_dense

matrix([[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 2, 0],
        [0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]])

## Vectorize Documents Ignoring Stop Words (with scikit-learn)

In [21]:
# see the english stopwords
# Import the frozenset from `sklearn.feature_extraction.text`: the
# `sklearn.feature_extraction.stop_words` module was deprecated in
# scikit-learn 0.22 and removed in 0.24, while this name is exposed by
# `text` in both old and current releases.

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

frozenset({'when', 'becoming', 'was', 'out', 'anyhow', 'ie', 'has', 'throughout', 'because', 'between', 'together', 'former', 'ours', 'so', 'while', 'she', 'third', 'each', 'ltd', 'become', 'will', 'seems', 'own', 'whither', 'we', 'who', 'someone', 'besides', 'three', 'made', 'see', 'herself', 'anything', 'since', 'in', 'off', 'noone', 'us', 'couldnt', 'cant', 'keep', 'them', 'neither', 'down', 'before', 'amoungst', 'wherein', 'myself', 'anywhere', 'very', 'next', 'five', 'least', 'therein', 'describe', 'serious', 'name', 'further', 'what', 'behind', 'if', 'thin', 'my', 'sometimes', 'inc', 'anyway', 'fill', 'hundred', 'where', 'how', 'con', 'among', 'eleven', 'seem', 'eg', 'thereby', 'of', 'well', 'de', 'often', 'never', 'forty', 'along', 'can', 'her', 'without', 'otherwise', 'two', 'therefore', 'somewhere', 'about', 'however', 'top', 'such', 'against', 'rather', 're', 'mostly', 'your', 'above', 'most', 'our', 'by', 'that', 'mine', 'moreover', 'system', 'beforehand', 'sometime', 'been'

In [22]:
# create an instance of the CountVectorizer class, have it ignore the english stop words

cv = CountVectorizer(stop_words='english')

In [23]:
# fit it

cv.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [24]:
# see the vocabulary

cv.vocabulary_

{'scikit': 8,
 'learn': 3,
 'great': 2,
 'better': 1,
 'api': 0,
 'nlp': 6,
 'spark': 9,
 'mllib': 5,
 'learning': 4,
 'punctuation': 7,
 'terrible': 10}

In [25]:
# see the sparse matrix

X_sparse = cv.transform(X)
print(X_sparse)

  (0, 2)	1
  (0, 3)	1
  (0, 8)	1
  (1, 0)	1
  (1, 1)	1
  (1, 5)	1
  (1, 6)	1
  (1, 9)	1
  (2, 3)	1
  (2, 4)	1
  (2, 6)	1
  (2, 8)	1
  (3, 7)	1
  (3, 10)	1


In [26]:
# see the dense matrix

X_dense = X_sparse.todense()
X_dense

matrix([[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0],
        [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]])

## Vectorize Documents with TF-IDF Ignoring Stop Words (with scikit-learn)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
# create an instance of the TfidfVectorizer class, have it ignore the english stop words

tfidf = TfidfVectorizer(
    stop_words='english',
)

In [29]:
# do fit and transform at the same time and show the sparse matrix

X_sparse = tfidf.fit_transform(X)
print(X_sparse)

  (0, 8)	0.5264054336099155
  (0, 3)	0.5264054336099155
  (0, 2)	0.6676785446095399
  (1, 1)	0.4651619335222394
  (1, 0)	0.4651619335222394
  (1, 6)	0.3667390112974172
  (1, 9)	0.4651619335222394
  (1, 5)	0.4651619335222394
  (2, 8)	0.4658085493691629
  (2, 3)	0.4658085493691629
  (2, 6)	0.4658085493691629
  (2, 4)	0.5908190806023349
  (3, 7)	0.7071067811865476
  (3, 10)	0.7071067811865476


In [30]:
# see the vocabulary

tfidf.vocabulary_

{'scikit': 8,
 'learn': 3,
 'great': 2,
 'better': 1,
 'api': 0,
 'nlp': 6,
 'spark': 9,
 'mllib': 5,
 'learning': 4,
 'punctuation': 7,
 'terrible': 10}

In [31]:
# see the dense matrix

X_dense = X_sparse.todense()
X_dense

matrix([[0.        , 0.        , 0.66767854, 0.52640543, 0.        ,
         0.        , 0.        , 0.        , 0.52640543, 0.        ,
         0.        ],
        [0.46516193, 0.46516193, 0.        , 0.        , 0.        ,
         0.46516193, 0.36673901, 0.        , 0.        , 0.46516193,
         0.        ],
        [0.        , 0.        , 0.        , 0.46580855, 0.59081908,
         0.        , 0.46580855, 0.        , 0.46580855, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.70710678, 0.        , 0.        ,
         0.70710678]])