# Feature Extraction

The `sklearn.feature_extraction` module extracts features from datasets in formats such as text and images, and produces a representation supported by machine learning algorithms.

In [57]:
# SKL-FE1
# Demonstrate one-hot (one-of-K) encoding of territory names with DictVectorizer.
# String-valued entries ('territory') become one indicator column per distinct value,
# while numeric entries ('sales') are carried through as a single column.
from sklearn.feature_extraction import DictVectorizer    # import namespace

region_sales = [
    {'territory': 'Northeast', 'sales': 41},       # one record per unique sales territory
    {'territory': 'SouthEast', 'sales': 8},
    {'territory': 'Central', 'sales': 16},
    {'territory': 'West', 'sales': 11},
    {'territory': 'Europe', 'sales': 43},
    {'territory': 'Asia', 'sales': 122},           # Python permits trailing comma
]

vec = DictVectorizer()                                   # create feature extraction instance
encoded = vec.fit_transform(region_sales)                # learn the feature columns and encode the records
print('vector output: ')
print(encoded.toarray())                                 # convert to a dense array for printing
print()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist() # plain list keeps the printed format unchanged
else:
    feature_names = vec.get_feature_names()
print('feature names: ', feature_names)

vector output: 
[[  41.    0.    0.    0.    1.    0.    0.]
 [   8.    0.    0.    0.    0.    1.    0.]
 [  16.    0.    1.    0.    0.    0.    0.]
 [  11.    0.    0.    0.    0.    0.    1.]
 [  43.    0.    0.    1.    0.    0.    0.]
 [ 122.    1.    0.    0.    0.    0.    0.]]

feature names:  ['sales', 'territory=Asia', 'territory=Central', 'territory=Europe', 'territory=Northeast', 'territory=SouthEast', 'territory=West']


In [59]:
# SKL-FE2
# feature hasher
# high-speed, low-memory vectorizer (does not support an inverse transform)
from sklearn.feature_extraction import FeatureHasher

# Named `hasher` rather than `hash` so the built-in hash() function is not
# shadowed for the rest of the notebook session.
hasher = FeatureHasher(n_features=10, input_type='string')   # each sample is an iterable of string tokens
data = [['Asia', 'Europe', 'Asia'],     # Note: 'Asia' appears twice (see transform output)
        ['Asia', 'Northeast']]
output = hasher.transform(data)         # transform() is called directly; no fit step is used here
output.toarray()                        # dense view: one row per sample, n_features columns

array([[ 2.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [37]:
# SKL-FE3
# CountVectorizer defaults
# CountVectorizer implements both tokenization and occurrence counting in a single class.
# Constructing it with no arguments and leaving the instance as the cell's last
# expression makes the notebook display its repr.
# NOTE(review): newer scikit-learn releases appear to show only non-default parameters
# in the repr; vectorizer.get_params() should list every default -- confirm against
# the installed version.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# SKL-FE4
# Train a CountVectorizer on a few lyric lines, then:
#   - dump the document/term count matrix
#   - print the learned feature names (one per matrix column)
#   - look up the column index of a single feature
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

documents = [
    'De do do do, de da da da',
    'Is all I want to say to you',
    'De do do do, de da da da',
    'Their innocence will pull me through',
]
X = vectorizer.fit_transform(documents)      # learn the vocabulary and count occurrences per document
print('vectorizer transform: ')
print(X.toarray())                           # dense view: one row per document, one column per feature
print()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()   # plain list keeps the printed format unchanged
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print()
# vocabulary_ maps each term to its column index; .get() returns None for unknown terms
print('get "will" feature index: ',vectorizer.vocabulary_.get('will'))

vectorizer transform: 
[[0 3 2 3 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 1 0 0 1 0 0 2 1 0 1]
 [0 3 2 3 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 1 1 0 1 1 0 0 1 0]]

['all', 'da', 'de', 'do', 'innocence', 'is', 'me', 'pull', 'say', 'their', 'through', 'to', 'want', 'will', 'you']

get "will" feature index:  13


In [71]:
# SKL-FE5
# Fit a CountVectorizer on the lyric lines, then encode a phrase it has never seen.

from sklearn.feature_extraction.text import CountVectorizer

chorus = 'De do do do, de da da da'          # this line occurs twice in the training documents
documents = [
    chorus,
    'Is all I want to say to you',
    chorus,
    'Their innocence will pull me through',
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)      # builds the vocabulary from the training documents

# run a new phrase through the trained data
# (Note: 'to' and 'their' are identified, as they appear in the training)
new_phrase = "Poets priests and politicians, Have words to thank for their positions"
new_counts = vectorizer.transform([new_phrase])
new_counts.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [1]:
# SKL-FE6
# Count single words (1-grams) together with adjacent word pairs (2-grams).
# The vectorized output is larger than the 1-gram-only case,
# but it preserves some of the local ordering information.

from sklearn.feature_extraction.text import CountVectorizer

chorus = 'De do do do, de da da da'
documents = [
    chorus,
    'Is all I want to say to you',
    chorus,
    'Their innocence will pull me through',
]

bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),          # extract 1 and 2 word combinations
    token_pattern=r'\b\w+\b',    # \b is a zero-width word boundary; \w+ is a run of word characters,
                                 # so one-letter tokens such as 'I' are kept
    min_df=1,
)
bigram_counts = bigram_vectorizer.fit_transform(documents)
X_2 = bigram_counts.toarray()
print('vectorized array: ',X_2)

# Look up the column index of a few 1-grams and 2-grams.
lookups = (
    'do',
    'de do',
    'da da',
    'innocence will',   # word prior to 'will'
    'will pull',        # word after 'will'
)
for phrase in lookups:
    print('Index of "{}": '.format(phrase), bigram_vectorizer.vocabulary_.get(phrase))

vectorized array:  [[0 0 3 2 2 1 1 3 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 2 1 1 1 1 0 0 1]
 [0 0 3 2 2 1 1 3 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0]]
Index of "do":  7
Index of "de do":  6
Index of "da da":  3
Index of "innocence will":  13
Index of "will pull":  31
