# Feature Engineering
## Textual Data

This notebook showcases some of the common methods for feature extraction and engineering on textual data.

## Important Imports

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

# pandas display data frames as tables
from IPython.display import display, HTML

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

---

## Prepare a Sample Corpus

In [2]:
# Category label for each document, in the same order as the corpus below.
labels = ['picnic', 'picnic', 'animals', 'animals', 'picnic', 'animals']

# Toy corpus: six short sentences held in a NumPy array.
corpus = np.array([
    'pack my box with five dozen liquor jugs.',
    'pack my box',
    'the quick brown fox jumps over the lazy dog.',
    'the brown fox is quick and the blue dog is lazy',
    'pack my box with five dozen liquor jugs and biscuits',
    'the dog is lazy but the brown fox is quick',
])

# Pair every document with its label; `columns` pins the column order explicitly.
corpus_df = pd.DataFrame(
    {'document': corpus, 'category': labels},
    columns=['document', 'category'],
)
corpus_df

Unnamed: 0,document,category
0,pack my box with five dozen liquor jugs.,picnic
1,pack my box,picnic
2,the quick brown fox jumps over the lazy dog.,animals
3,the brown fox is quick and the blue dog is lazy,animals
4,pack my box with five dozen liquor jugs and bi...,picnic
5,the dog is lazy but the brown fox is quick,animals


## Bag of Words

In [3]:
# Bag of Words: one row per document, one column per vocabulary term,
# each cell holding the number of times the term occurs in that document.
# Float min_df / max_df are document-frequency proportions, so 0. and 1.
# together keep every term.
cv = CountVectorizer(min_df=0., max_df=1.)

# fit_transform returns a sparse matrix; densify it so it can be shown as a table.
cv_matrix = cv.fit_transform(corpus_df.document).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement. list(...) keeps `vocab` a plain list as before.
vocab = list(cv.get_feature_names_out())

pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,and,biscuits,blue,box,brown,but,dog,dozen,five,fox,...,jugs,jumps,lazy,liquor,my,over,pack,quick,the,with
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,1,1,0,1,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,0,0,0,0,1,0,1,0,0,1,...,0,1,1,0,0,1,0,1,2,0
3,1,0,1,0,1,0,1,0,0,1,...,0,0,1,0,0,0,0,1,2,0
4,1,1,0,1,0,0,0,1,1,0,...,1,0,0,1,1,0,1,0,0,1
5,0,0,0,0,1,1,1,0,0,1,...,0,0,1,0,0,0,0,1,2,0


## TF-IDF 

In [4]:
# TF-IDF: same document-by-term layout as the count matrix, but each cell is
# a term-frequency / inverse-document-frequency weight instead of a raw count.
# Float min_df / max_df are document-frequency proportions, so no term is dropped.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)

# fit_transform returns a sparse matrix; densify it so it can be shown as a table.
tv_matrix = tv.fit_transform(corpus_df.document).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement. list(...) keeps `vocab` a plain list as before.
vocab = list(tv.get_feature_names_out())

# Round for display only; tv_matrix itself keeps full precision.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,and,biscuits,blue,box,brown,but,dog,dozen,five,fox,...,jugs,jumps,lazy,liquor,my,over,pack,quick,the,with
0,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.37,0.37,0.0,...,0.37,0.0,0.0,0.37,0.32,0.0,0.32,0.0,0.0,0.37
1,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.58,0.0,0.58,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.28,0.0,0.28,0.0,0.0,0.28,...,0.0,0.4,0.28,0.0,0.0,0.4,0.0,0.28,0.55,0.0
3,0.28,0.0,0.34,0.0,0.24,0.0,0.24,0.0,0.0,0.24,...,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.24,0.47,0.0
4,0.32,0.39,0.0,0.27,0.0,0.0,0.0,0.32,0.32,0.0,...,0.32,0.0,0.0,0.32,0.27,0.0,0.27,0.0,0.0,0.32
5,0.0,0.0,0.0,0.0,0.24,0.35,0.24,0.0,0.0,0.24,...,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.24,0.49,0.0


## N-Gram Vectorizer

In [5]:
# Bag of N-grams: ngram_range=(2, 2) restricts the vocabulary to bigrams
# (pairs of adjacent tokens), so each column counts one two-word sequence.
bv = CountVectorizer(ngram_range=(2,2))

# fit_transform returns a sparse matrix; densify it so it can be shown as a table.
bv_matrix = bv.fit_transform(corpus_df.document).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement. list(...) keeps `vocab` a plain list as before.
vocab = list(bv.get_feature_names_out())

pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,and biscuits,and the,blue dog,box with,brown fox,but the,dog is,dozen liquor,five dozen,fox is,...,over the,pack my,quick and,quick brown,the blue,the brown,the dog,the lazy,the quick,with five
0,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,0
3,0,1,1,0,1,0,1,0,0,1,...,0,0,1,0,1,1,0,0,0,0
4,1,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1
5,0,0,0,0,1,1,1,0,0,1,...,0,0,0,0,0,1,1,0,0,0
