## `feature_extraction` in sklearn

### Vectorizing dictionaries 

In [1]:
from sklearn.feature_extraction import DictVectorizer

In [2]:
# DictVectorizer converts a list of feature dicts into a numeric matrix:
# string-valued features are one-hot encoded, numeric ones are passed through.
vecObject = DictVectorizer()

In [3]:
# Toy records, one dict per observation: a categorical `city` and a numeric
# `temperature`. Built from (city, temperature) pairs to keep the data compact.
dictionaries = [
    {'city': city_name, 'temperature': degrees}
    for city_name, degrees in (
        ('melbourne', 32),
        ('melbourne', 32),
        ('melbourne', 30),
        ('brisbane', 26),
        ('tasmania', 12),
        ('tasmania', 14),
        ('brisbane', 23),
    )
]

In [4]:
import pandas

  return f(*args, **kwds)
  return f(*args, **kwds)


In [5]:
# Tabular view of the raw (un-encoded) records.
# NOTE(review): this frame is never displayed, and `data` is rebound to the
# encoded frame in a later cell.
data = pandas.DataFrame(dictionaries)

In [6]:
# Learn the feature -> column mapping and encode the records in one step.
# The result is a SciPy sparse matrix; the bare name below displays its summary.
matrix = vecObject.fit_transform(dictionaries)
matrix

<7x4 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [7]:
# Densify only for inspection; the sparse form is what fit_transform returns.
matrix.toarray()

array([[ 0.,  1.,  0., 32.],
       [ 0.,  1.,  0., 32.],
       [ 0.,  1.,  0., 30.],
       [ 1.,  0.,  0., 26.],
       [ 0.,  0.,  1., 12.],
       [ 0.,  0.,  1., 14.],
       [ 1.,  0.,  0., 23.]])

In [8]:
# Column labels in matrix order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it and
# returns an ndarray, so .tolist() keeps the display a plain list of str.
vecObject.get_feature_names_out().tolist()

['city=brisbane', 'city=melbourne', 'city=tasmania', 'temperature']

In [9]:
# Label the encoded columns with the vectorizer's feature names.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
data = pandas.DataFrame(matrix.toarray(), columns=vecObject.get_feature_names_out())

In [10]:
# Show the encoded frame: one indicator column per city plus the raw temperature.
data

Unnamed: 0,city=brisbane,city=melbourne,city=tasmania,temperature
0,0.0,1.0,0.0,32.0
1,0.0,1.0,0.0,32.0
2,0.0,1.0,0.0,30.0
3,1.0,0.0,0.0,26.0
4,0.0,0.0,1.0,12.0
5,0.0,0.0,1.0,14.0
6,1.0,0.0,0.0,23.0


### Text extraction

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Four short documents used by every text-vectorizer example below.
corpus = [
    "today is a sunny day",
    "the sun is out today",
    "the sun is massive",
    "how hot is the sun?",
]

In [13]:
# Bag-of-words term counter, constructed with default settings.
vectorizer = CountVectorizer()

In [14]:
# Build the vocabulary from the corpus and count term occurrences per document.
# Rebinds `matrix` (previously the DictVectorizer output) to a sparse count matrix.
matrix = vectorizer.fit_transform(corpus)
matrix

<4x10 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [15]:
# Dense view for inspection: one row per document, one column per vocabulary term.
matrix.toarray()

array([[1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 1],
       [0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [16]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it and
# returns an ndarray, so .tolist() keeps the display a plain list of str.
vectorizer.get_feature_names_out().tolist()

['day', 'hot', 'how', 'is', 'massive', 'out', 'sun', 'sunny', 'the', 'today']

In [17]:
# Term-count table with vocabulary terms as column labels.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
data = pandas.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names_out())

In [18]:
# Show the per-document term counts.
data

Unnamed: 0,day,hot,how,is,massive,out,sun,sunny,the,today
0,1,0,0,1,0,0,0,1,0,1
1,0,0,0,1,0,1,1,0,1,1
2,0,0,0,1,1,0,1,0,1,0
3,0,1,1,1,0,0,1,0,1,0


### TFIDF Vectorizer (without normalization)

In [19]:
# Disable row normalization so the raw tf * idf weights are visible.
# The documented values for `norm` are 'l1', 'l2' or None. `False` only worked
# on old releases because it is falsy, and scikit-learn >= 1.2 rejects it during
# parameter validation. None means "no normalization" on every version.
tvectorizer = TfidfVectorizer(norm=None)

In [25]:
# Fit on the corpus and densify immediately; the bare name displays the weights.
matrix2 = tvectorizer.fit_transform(corpus).toarray()
matrix2

array([[1.91629073, 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 1.91629073, 0.        , 1.51082562],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        1.91629073, 1.22314355, 0.        , 1.22314355, 1.51082562],
       [0.        , 0.        , 0.        , 1.        , 1.91629073,
        0.        , 1.22314355, 0.        , 1.22314355, 0.        ],
       [0.        , 1.91629073, 1.91629073, 1.        , 0.        ,
        0.        , 1.22314355, 0.        , 1.22314355, 0.        ]])

In [22]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it and
# returns an ndarray, so .tolist() keeps the display a plain list of str.
tvectorizer.get_feature_names_out().tolist()

['day', 'hot', 'how', 'is', 'massive', 'out', 'sun', 'sunny', 'the', 'today']

In [26]:
# Un-normalized TF-IDF weights with vocabulary terms as column labels.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
data = pandas.DataFrame(matrix2, columns=tvectorizer.get_feature_names_out())

data

Unnamed: 0,day,hot,how,is,massive,out,sun,sunny,the,today
0,1.916291,0.0,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,1.510826
1,0.0,0.0,0.0,1.0,0.0,1.916291,1.223144,0.0,1.223144,1.510826
2,0.0,0.0,0.0,1.0,1.916291,0.0,1.223144,0.0,1.223144,0.0
3,0.0,1.916291,1.916291,1.0,0.0,0.0,1.223144,0.0,1.223144,0.0


### TFIDF Vectorizer (with normalization)
- L2 normalization (`norm='l2'`) is used by default, so each document's row vector is scaled to unit Euclidean length.

In [33]:
# Default construction; this rebinds `tvectorizer`, replacing the un-normalized
# vectorizer from the previous section. The bare name displays its configuration.
tvectorizer = TfidfVectorizer()
tvectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [34]:
# Refit with the default settings and densify; rebinds `matrix2` to the
# normalized weights.
matrix2 = tvectorizer.fit_transform(corpus).toarray()
matrix2

array([[0.58783765, 0.        , 0.        , 0.30675807, 0.        ,
        0.        , 0.        , 0.58783765, 0.        , 0.46345796],
       [0.        , 0.        , 0.        , 0.31707032, 0.        ,
        0.60759891, 0.38782252, 0.        , 0.38782252, 0.47903796],
       [0.        , 0.        , 0.        , 0.3612126 , 0.69218835,
        0.        , 0.44181486, 0.        , 0.44181486, 0.        ],
       [0.        , 0.56914364, 0.56914364, 0.29700276, 0.        ,
        0.        , 0.36327702, 0.        , 0.36327702, 0.        ]])

In [35]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it and
# returns an ndarray, so .tolist() keeps the display a plain list of str.
tvectorizer.get_feature_names_out().tolist()

['day', 'hot', 'how', 'is', 'massive', 'out', 'sun', 'sunny', 'the', 'today']

In [36]:
# Normalized TF-IDF weights with vocabulary terms as column labels.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
data = pandas.DataFrame(matrix2, columns=tvectorizer.get_feature_names_out())

data

Unnamed: 0,day,hot,how,is,massive,out,sun,sunny,the,today
0,0.587838,0.0,0.0,0.306758,0.0,0.0,0.0,0.587838,0.0,0.463458
1,0.0,0.0,0.0,0.31707,0.0,0.607599,0.387823,0.0,0.387823,0.479038
2,0.0,0.0,0.0,0.361213,0.692188,0.0,0.441815,0.0,0.441815,0.0
3,0.0,0.569144,0.569144,0.297003,0.0,0.0,0.363277,0.0,0.363277,0.0
