#### Features are represented in 
* categorical data, text, images.
* Derived features
* Imputation of missing data
* The mechanism of converting data into right format is known as vectorization

##### Categorical Features

In [16]:
# Toy housing records: two numeric fields (price, rooms) and one
# categorical field (neighborhood) used in the encoding examples below.
data = [
    dict(price=850000, rooms=4, neighborhood="Queen Anne"),
    dict(price=700000, rooms=3, neighborhood="Fremont"),
    dict(price=650000, rooms=3, neighborhood="Wallingford"),
    dict(price=600000, rooms=2, neighborhood="Fremont"),
]

In [17]:
import pandas as pd

In [18]:
# Build a DataFrame from the list of record dicts (one row per dict,
# one column per key).
pdata = pd.DataFrame(data)

In [19]:
# Display the frame; in the captured run the columns come out in
# alphabetical order (neighborhood, price, rooms).
pdata

Unnamed: 0,neighborhood,price,rooms
0,Queen Anne,850000,4
1,Fremont,700000,3
2,Wallingford,650000,3
3,Fremont,600000,2


In [20]:
from sklearn.feature_extraction import DictVectorizer

In [21]:
# sparse=False makes transform() return a dense NumPy array instead of a
# sparse matrix; dtype=int keeps the encoded values as integers.
vec = DictVectorizer(sparse=False, dtype=int)

In [22]:
# Learn the feature-name -> column mapping from the record dicts.
vec.fit(data)

DictVectorizer(dtype=<class 'int'>, separator='=', sort=True, sparse=False)

In [23]:
# Encode the records: 'neighborhood' becomes one indicator (one-hot) column
# per distinct value, while 'price' and 'rooms' pass through as numbers.
vec.transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [24]:
# Print the column names, in the same order as the columns of the array
# returned by vec.transform(data).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# Converting each entry to str keeps the output a plain list of strings on
# every version (get_feature_names_out() returns a NumPy array).
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
[str(name) for name in feature_names]

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [10]:
# NOTE(review): this repeats the earlier vec.transform(data) cell and carries
# a stale execution count (In [10]); the output is identical, so the cell is a
# candidate for removal.
vec.transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

### Text Features
* Convert text into numerical values
* Higher frequency of a word in text for conveying some information

In [2]:
# Toy three-document corpus for the word-count example.
sample = [
    "problem of evil",
    "evil queen of evil evil",
    "horizon of problem evil",
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Note: this rebinds `vec`, replacing the DictVectorizer used above.
vec = CountVectorizer()
# Learn the vocabulary from the corpus and return the document-term
# count matrix in one step.
X = vec.fit_transform(sample)

In [3]:
# Dense view of the counts: one row per document, one column per
# vocabulary word (column order matches the feature names listed below).
X.toarray()

array([[1, 0, 1, 1, 0],
       [3, 0, 1, 0, 1],
       [1, 1, 1, 1, 0]], dtype=int64)

In [17]:
# Vocabulary words, in the same order as the columns of X.toarray().
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# Converting each entry to str keeps the output a plain list of strings.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
[str(name) for name in feature_names]

['evil', 'horizon', 'of', 'problem', 'queen']

### TF-IDF - Term frequency & Inverse document frequency
* corpus - collection of entire documents
* This algorithm tries to find the words more common in one doc compared to entire corpus

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Extract unigrams, bigrams and trigrams (n-grams of length 1 through 3).
# ngram_range must be a (min_n, max_n) tuple: recent scikit-learn releases
# validate the parameter type and reject a list such as [1, 3].
vec = TfidfVectorizer(ngram_range=(1, 3))

In [16]:
# Toy three-document corpus for the TF-IDF / n-gram example
# (replaces the earlier `sample`).
sample = [
    "problem of good",
    "evil queen of not good",
    "horizon of problem not good",
]

In [17]:
# Learn the n-gram vocabulary from the corpus and compute the TF-IDF matrix
# in one step (rebinds `X`, replacing the count matrix from the earlier example).
X = vec.fit_transform(sample)

In [18]:
# Dense view of the TF-IDF weights: one row per document, one column per
# n-gram (the captured output below is truncated).
X.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.28561676,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.28561676,
         0.48359121,  0.        ,  0.        ,  0.        ,  0.        ,
         0.36778358,  0.        ,  0.        ,  0.48359121,  0.48359121,
         0.        ,  0.        ,  0.        ],
       [ 0.31855448,  0.31855448,  0.31855448,  0.18814341,  0.        ,
         0.        ,  0.        ,  0.2422689 ,  0.2422689 ,  0.18814341,
         0.        ,  0.31855448,  0.31855448,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.31855448,  0.31855448,  0.31855448],
       [ 0.        ,  0.        ,  0.        ,  0.19230198,  0.32559555,
         0.32559555,  0.32559555,  0.24762381,  0.24762381,  0.19230198,
         0.        ,  0.        ,  0.        ,  0.32559555,  0.32559555,
         0.24762381,  0.32559555,  0.32559555,  0.        ,  0.        ,
         0.        ,  0.    

In [19]:
# N-grams, in the same order as the columns of X.toarray().
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# Converting each entry to str keeps the output a plain list of strings.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
[str(name) for name in feature_names]

['evil',
 'evil queen',
 'evil queen of',
 'good',
 'horizon',
 'horizon of',
 'horizon of problem',
 'not',
 'not good',
 'of',
 'of good',
 'of not',
 'of not good',
 'of problem',
 'of problem not',
 'problem',
 'problem not',
 'problem not good',
 'problem of',
 'problem of good',
 'queen',
 'queen of',
 'queen of not']

In [33]:
from sklearn.preprocessing import LabelEncoder 
# Encoder that maps each distinct label to an integer code.
le=LabelEncoder()

In [35]:
# Add an integer-coded copy of 'neighborhood' as column 'ne' (mutates pdata
# in place). In the captured output the codes are Fremont=0, Queen Anne=1,
# Wallingford=2.
# NOTE(review): integer codes suggest an ordering that a nominal feature does
# not have; the one-hot encoding shown earlier avoids that - confirm intent.
pdata['ne'] = le.fit_transform(pdata.neighborhood)

In [36]:
# Display the frame with the new 'ne' column appended.
pdata

Unnamed: 0,neighborhood,price,rooms,ne
0,Queen Anne,850000,4,1
1,Fremont,700000,3,0
2,Wallingford,650000,3,2
3,Fremont,600000,2,0


### Imputation of missing data
* Replacing missing values by appropiate information


In [1]:

import numpy as np
from numpy import nan
# Feature matrix (5 samples x 3 features) with two missing entries,
# marked as NaN, to be filled in by the imputer.
X = np.array([
    [nan, 0, 3],
    [3, 7, 9],
    [3, 5, 2],
    [4, nan, 6],
    [8, 8, 1],
])
# Target values, one per row of X.
y = np.array([14, 16, -1, 8, -5])

In [5]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; its replacement is sklearn.impute.SimpleImputer.
# Import the replacement under the old name so the rest of the notebook
# (e.g. help(Imputer)) keeps working, and fall back on older releases.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Replace each NaN with the mean of the non-missing values in its column.
imp = Imputer(strategy='mean')
# Fit the column means on X and return a copy with the gaps filled in.
X2 = imp.fit_transform(X)

In [6]:
# Imputed matrix: in the captured output the two NaNs are replaced by their
# column means (4.5 in the first column, 5.0 in the second).
X2

array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])

In [22]:
# Print the class docstring for reference (the captured output below is truncated).
help(Imputer)

Help on class Imputer in module sklearn.preprocessing.imputation:

class Imputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <imputation>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : integer or "NaN", optional (default="NaN")
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed. For missing values encoded as np.nan,
 |      use the string value "NaN".
 |  
 |  strategy : string, optional (default="mean")
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        the axis.
 |      - If "median", then replace missing values using the median along
 |        the axis.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along the axis.
 |  
 |  axis : integer, optional (default=0)
 |      The axis along which to i

### Feature Pipelines

In [23]:
# Print the class docstring for reference (the captured output below is truncated).
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to CountVectorizer followed by TfidfTransformer.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetch the bytes in memory.
 |  
 |      Otherwise the input is expected to be the sequence strings or
 |      bytes items are expected to be analyzed directly.
 |  
 |  encoding : string, 'utf-8' by default.
 |      If bytes or files are given to analyze, this encoding is used to
 |      decode.
 |

In [11]:
from sklearn.preprocessing import LabelEncoder 
# Rebinds `le` to a new, unfitted LabelEncoder.
# NOTE(review): this duplicates the earlier LabelEncoder cell.
le=LabelEncoder()

In [13]:
# Display the frame before the encoded column is added.
pdata

Unnamed: 0,neighborhood,price,rooms
0,Queen Anne,850000,4
1,Fremont,700000,3
2,Wallingford,650000,3
3,Fremont,600000,2


In [14]:
# Fit the encoder on the 'neighborhood' column and store the integer codes
# in a new 'neigh' column (mutates pdata in place).
pdata['neigh'] = le.fit_transform(pdata['neighborhood']) 

In [15]:
# Display the frame with the new 'neigh' column appended.
pdata

Unnamed: 0,neighborhood,price,rooms,neigh
0,Queen Anne,850000,4,1
1,Fremont,700000,3,0
2,Wallingford,650000,3,2
3,Fremont,600000,2,0


In [None]:
# transform() requires the labels to encode; calling it with no argument
# raises TypeError. Re-encode the column the encoder was fitted on, which
# yields the same integer codes as the fit_transform() call above.
le.transform(pdata['neighborhood'])