# Feature Engineering on Text Data

In [2]:
import numpy as np
import pandas as pd
import re
import nltk

# Toy corpus: six short sentences that serve as the feature variable
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])

# Label variable: one category per sentence, in corpus order (weather or animals)
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']

# One row per document, pairing the raw text with its category
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


In [4]:
# Tokenizer instance; normalize_document() further down calls wpt.tokenize() on each document
wpt = nltk.WordPunctTokenizer()

In [6]:
# English stop-word list that normalize_document() uses to filter tokens.
# The NLTK 'stopwords' corpus has to exist locally before it can be read; on a
# fresh environment the lookup raises LookupError, so fetch the corpus on
# demand and retry instead of failing.
try:
    stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = nltk.corpus.stopwords.words('english')

In [19]:
# Download the NLTK 'stopwords' corpus so that the stop-word list below can be loaded
nltk.download('stopwords')

# Tokenizer and English stop-word list; both are read as module-level names by normalize_document()
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one raw document into a cleaned, space-joined token string.

    Steps, in order: delete every character outside the class
    ``[a-zA-Z0-9\\s]``, lower-case the text, trim surrounding whitespace,
    tokenize with the module-level ``wpt`` tokenizer, drop tokens that appear
    in the module-level ``stop_words`` list, and re-join what is left with
    single spaces.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The normalized document.
    """
    # remove special characters
    # The flag has to be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I positionally capped the substitution
    # at two replacements and never applied the flag at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    # lower case
    doc = doc.lower()
    # remove leading/trailing whitespace
    doc = doc.strip()

    # tokenize document
    tokens = wpt.tokenize(doc)

    # filter stop words out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)

    return doc

# np.vectorize(...) wraps normalize_document so it can be called on a whole
# numpy array of documents, element by element, without writing an explicit loop
normalize_corpus = np.vectorize(normalize_document)

# After normalization each document is lower-cased,
# special symbols have been removed and
# stop words (words which carry little meaning, like articles, pronouns, etc.) have been filtered out.
norm_corpus = normalize_corpus(corpus)
norm_corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

### Bag of Words Model

It **vectorizes** features from unstructured text. The **dimension or size of each vector is N** where **N** indicates **all possible distinct words across the corpus of documents**. Each document once transformed is a numeric vector of size N where the **values or weights** in the vector indicate the **frequency of each word** in that specific document.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based vectorizer for the Bag of Words model
cv = CountVectorizer(min_df=0., max_df=1.)
# Learn the vocabulary from the normalized corpus and build the dense
# document-by-term count matrix in a single chained step
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [22]:
# Label the count-matrix columns with the vocabulary terms learned by the vectorizer
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn releases in
# favour of get_feature_names_out() — confirm which version this notebook targets
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


### Bag of N-Grams Model

The Bag of Words model treats a single **word** as a **feature**, whereas the Bag of N-grams model treats a **phrase or a collection of words** as a **feature**.
An n-gram is basically a **collection of word tokens** from a text document such that these tokens are **contiguous and occur in a sequence**. **Bi-grams** indicate n-grams of order **2** (two words), **Tri-grams** indicate n-grams of order **3** (three words), and so on.

In [24]:
# Bi-gram vectorizer: ngram_range=(2, 2) restricts the features to two-word sequences
bv = CountVectorizer(ngram_range=(2, 2))
# Learn the bi-gram vocabulary from the normalized corpus and build the dense
# document-by-bigram count matrix
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()

# Show the matrix as a table whose columns are labelled with the learned bi-grams
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


### TF-IDF Model (Term Frequency-Inverse Document Frequency)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with inverse-document-frequency weighting switched on explicitly
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
# Fit on the normalized corpus and densify the resulting matrix in one chained step
tv_matrix = tv.fit_transform(norm_corpus).toarray()

# Display the weights rounded to two decimals, one column per vocabulary term
vocab = tv.get_feature_names()
pd.DataFrame(tv_matrix.round(2), columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


### Document Similarity

Document similarity is the **process of using a distance or similarity-based metric** that can be used **to identify how similar a text document** is with another document **based on features extracted from the documents** like **bag of words or tf-idf**.<br>
 - Feature vectors whose angle is close to $0^\circ$ (cosine similarity near 1) are **similar** to each other.<br>
 - Feature vectors whose angle is close to $90^\circ$ (cosine similarity near 0) are **not similar** to each other.<br>
 - Feature vectors whose angle is close to $180^\circ$ (cosine similarity near -1) are **completely opposite** to each other.<br>


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of all documents;
# row i / column j compares document i with document j
similarity_matrix = cosine_similarity(tv_matrix)
# Wrap the matrix in a DataFrame for display and for the clustering step that follows
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.185447,0.807539,0.0
1,0.753128,1.0,0.0,0.139665,0.608181,0.0
2,0.0,0.0,1.0,0.784362,0.0,0.839987
3,0.185447,0.139665,0.784362,1.0,0.109653,0.933779
4,0.807539,0.608181,0.0,0.109653,1.0,0.0
5,0.0,0.0,0.839987,0.933779,0.0,1.0


**Observations:**<br>
- Documents 0, 1 and 4 are similar to one another.
- Documents 2, 3 and 5 are similar to one another.

Let us verify these observations with the **K-means clustering** algorithm.

In [28]:
from sklearn.cluster import KMeans

# Split the documents into two clusters, using each document's row of the
# pairwise cosine-similarity matrix as its feature vector.
# random_state pins the centroid initialisation so the cluster labels are
# reproducible between runs; n_init is spelled out so the number of restarts
# does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=2, n_init=10, random_state=42)
# Only the fitted labels are used below, so fit() is sufficient
km.fit(similarity_df)
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])
# Put the cluster assignment next to the original documents and their categories
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,1
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animals,0
3,The brown fox is quick and the blue dog is lazy!,animals,0
4,The sky is very blue and the sky is very beaut...,weather,1
5,The dog is lazy but the brown fox is quick!,animals,0


### Topic Models

Besides document terms, phrases and similarities, **we can also use some summarization techniques to extract topic or concept-based features from text documents**. The idea of topic models revolves around the process of extracting key themes or concepts from a corpus of documents which are represented as topics. **Each topic** can be represented **as a bag or collection of words/terms** from the document corpus. Together, these terms signify a specific topic, theme or a concept and each topic can be easily distinguished from other topics by virtue of the semantic meaning conveyed by these terms. These concepts can range from simple facts and statements to opinions and outlook. Topic models are extremely useful in summarizing large corpus of text documents to extract and depict key concepts. They are also useful in extracting features from text data that capture latent patterns in the data.<br>
There are many techniques for Topic modelling:<br>
- Latent Semantic Indexing (LSI)<br>
- Latent Dirichlet Allocation (LDA)<br>
Here we apply the LDA model to the corpus introduced above:

In [34]:
# Method I - Document Vs Topic analysis
from sklearn.decomposition import LatentDirichletAllocation

# LDA model with two topics (the older parameter name n_topics was renamed to n_components);
# random_state is fixed so repeated runs give the same result
lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
# Document-topic matrix: one row per document, one column per topic
# NOTE(review): the model is fitted on TF-IDF weights (tv_matrix); LDA is usually
# fitted on raw term counts (cv_matrix) — confirm this choice is intentional
dt_matrix = lda.fit_transform(tv_matrix)
# Label the two topic columns T1 and T2 for display
features = pd.DataFrame(dt_matrix, columns = ['T1', 'T2'])
features

Unnamed: 0,T1,T2
0,0.190548,0.809452
1,0.176804,0.823196
2,0.846184,0.153816
3,0.814863,0.185137
4,0.180516,0.819484
5,0.839172,0.160828


**Observations:**
- Documents 2, 3 and 5 are alike: each has its highest weight on topic T1.
- Documents 0, 1 and 4 are alike: each has its highest weight on topic T2.

The fragment above used the Document vs Topic matrix.<br>
As an alternative method, we will now use the Topic vs Term matrix:

In [35]:
# Method II - Topic Vs Term analysis
# Each row of lda.components_ carries one topic's weight for every vocabulary term
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    # keep only the (term, weight) pairs weighted above 0.6 ...
    topic = [pair for pair in zip(vocab, topic_weights) if pair[1] > 0.6]
    # ... and rank them from heaviest to lightest (stable, so ties keep vocabulary order)
    topic.sort(key=lambda pair: -pair[1])
    print(topic)
    print()

[('brown', 1.7273638692668465), ('dog', 1.7273638692668465), ('fox', 1.7273638692668465), ('lazy', 1.7273638692668465), ('quick', 1.7273638692668465), ('jumps', 1.0328325272484777), ('blue', 0.7731573162915626)]

[('sky', 2.264386643135622), ('beautiful', 1.9068269319456903), ('blue', 1.7996282104933266), ('love', 1.148127242397004), ('today', 1.0068251160429935)]



**Observations:**
- **First set of terms** clearly shows they are relevant to **animals**
- **Second set of terms** shows that they are relevant to **weather**

### Word Embeddings

In [66]:
# Prerequisite - pip install gensim

from gensim.models import word2vec

# Split every normalized document into a list of word tokens; Word2Vec is trained on these token lists
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10        # Word vector dimensionality
window_context = 10      # Context window size
min_word_count = 1       # Minimum word count
sample = 1e-3            # Downsample setting for frequent words

# Get the distributed representational vectors for words
# NOTE(review): the `size=` keyword belongs to the gensim 3.x API (gensim 4 calls it
# `vector_size`) — confirm the gensim version this notebook is pinned to
# NOTE(review): no seed is passed, so the learned vectors may differ between runs — confirm acceptable
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count = min_word_count,
                          sample=sample)
# Inspect the learned vector for a single word
w2v_model.wv['sky']

array([-0.04296592,  0.04610303, -0.00349819,  0.01232337,  0.01463256,
        0.04618132,  0.00159843,  0.03548915, -0.02565535,  0.02186544],
      dtype=float32)

In [68]:
# A document contains several words and each word has its own vector, so a
# document is represented by the average of its word vectors
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean word vector of the in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object
        Either a trained model exposing its word vectors as ``model.wv`` or
        an object that is itself indexable by word.
    vocabulary : container of str
        Words for which a vector is available; other tokens are skipped.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of the
        tokens is in ``vocabulary``.
    """
    # Look vectors up through model.wv, the same accessor the rest of this
    # notebook uses; indexing the model object directly is the deprecated
    # form. Fall back to the object itself when it has no `wv` attribute.
    vectors = getattr(model, 'wv', model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, vectors[word])

    # Skip the division when no token was found, so the zero vector is returned as-is
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : object
        Trained model; its known words are read from ``model.wv.index2word``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array holding the result of ``average_word_vectors`` for every document.
    """
    # Build the set of known words once so each membership test is cheap
    known_words = set(model.wv.index2word)

    doc_vectors = []
    for tokens in corpus:
        doc_vectors.append(
            average_word_vectors(tokens, model, known_words, num_features)
        )
    return np.array(doc_vectors)

In [69]:
# Turn every tokenized document into a single vector of length feature_size
# by averaging the Word2Vec vectors of its words
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
# One row per document, one column per embedding dimension
pd.DataFrame(w2v_feature_array)

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.022518,0.033792,0.024871,-0.008696,0.019062,0.023257,0.014872,0.015568,-0.030283,-0.010652
1,-0.007232,0.03364,0.007581,-0.007391,0.010677,0.01578,0.018046,0.022193,-0.018556,-0.01599
2,0.003368,-0.012638,-0.01703,0.005165,-0.006179,0.000409,-0.00655,-0.014589,-0.004247,0.005427
3,-0.006003,-0.006422,-0.007285,0.009889,0.005843,0.001301,0.000114,-0.019806,-0.002681,-0.005059
4,-0.024648,0.021392,0.019133,-0.005128,0.015988,0.020289,0.014106,0.020981,-0.029839,-0.001036
5,-0.001276,-0.009193,-0.016383,0.015352,0.002522,-0.006311,-0.008738,-0.02417,0.004879,-0.002574


In [70]:
from sklearn.cluster import AffinityPropagation

# Cluster the averaged document vectors; no cluster count is passed to the estimator
ap = AffinityPropagation().fit(w2v_feature_array)
# Single-column frame holding the cluster assigned to each document
cluster_labels = pd.DataFrame({'ClusterLabel': ap.labels_})
# Show the cluster assignment next to the original documents and their categories
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1
