#### Features are represented in 
* categorical data, text, images.
* Derived features
* Imputation of missing data
* The mechanism of converting data into the right format is known as vectorization

##### Categorical Features

In [1]:
# Toy housing listings: each record mixes numeric fields (price, rooms)
# with a categorical one (neighborhood).
data = [
    dict(price=850000, rooms=4, neighborhood='Queen Anne'),
    dict(price=700000, rooms=3, neighborhood='Fremont'),
    dict(price=650000, rooms=3, neighborhood='Wallingford'),
    dict(price=600000, rooms=2, neighborhood='Fremont'),
]

In [2]:
import pandas as pd

In [4]:
# Load the list of dicts into a DataFrame: one row per listing, one column per key.
pdata = pd.DataFrame(data)

In [5]:
# Show the frame; 'neighborhood' is still a plain string column at this point.
pdata

Unnamed: 0,neighborhood,price,rooms
0,Queen Anne,850000,4
1,Fremont,700000,3
2,Wallingford,650000,3
3,Fremont,600000,2


In [6]:
from sklearn.feature_extraction import DictVectorizer

In [7]:
# sparse=False returns a dense NumPy array instead of a sparse matrix;
# dtype=int stores the encoded values as integers.
vec = DictVectorizer(sparse=False, dtype=int)

In [9]:
# Learn the feature mapping from the dicts and encode them in one step: the string-valued
# 'neighborhood' field becomes one 0/1 indicator column per distinct value, while the
# numeric 'price' and 'rooms' fields are passed through unchanged.
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]], dtype=int32)

In [12]:
# Print column names, in the same order as the columns of the encoded array.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on older releases.
list(vec.get_feature_names_out()
     if hasattr(vec, 'get_feature_names_out')
     else vec.get_feature_names())

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

### Text Features
* Convert text into numerical values
* A higher frequency of a word in a text usually conveys some information about that text

In [31]:
# Three tiny "documents" that serve as the toy corpus for the text vectorizers.
sample = [
    'problem of evil',
    'evil queen of evil evil',
    'horizon of problem evil',
]

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoding: fit_transform builds the vocabulary from `sample` and returns
# the per-document word counts (one row per document, one column per vocabulary word).
# NOTE: this rebinds `vec`, which previously held the DictVectorizer.
vec = CountVectorizer()
X = vec.fit_transform(sample)

In [20]:
# Convert the result to a dense array so the word counts can be inspected directly.
X.toarray()

array([[1, 0, 1, 1, 0],
       [3, 0, 1, 0, 1],
       [1, 1, 1, 1, 0]], dtype=int64)

In [17]:
# Vocabulary words, in the same order as the columns of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on older releases.
list(vec.get_feature_names_out()
     if hasattr(vec, 'get_feature_names_out')
     else vec.get_feature_names())

['evil', 'horizon', 'of', 'problem', 'queen']

### TF-IDF - Term frequency & Inverse document frequency
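* Corpus - the entire collection of documents
* TF-IDF weights each word by how often it appears in a document, discounted by how many documents in the corpus contain it, so words that are frequent in one document but rare across the corpus get the highest scores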

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: rebinds `vec` again, this time to a TF-IDF vectorizer created with no arguments.
vec = TfidfVectorizer()

In [32]:
# Fit the vectorizer on the toy corpus and compute its weight matrix in one step.
# NOTE: overwrites the `X` produced by the earlier count-vectorizer cell.
X = vec.fit_transform(sample)

In [33]:
# Dense view of the weights: one row per document, one column per vocabulary word.
X.toarray()

array([[ 0.52284231,  0.        ,  0.52284231,  0.67325467,  0.        ],
       [ 0.8363477 ,  0.        ,  0.27878257,  0.        ,  0.47201992],
       [ 0.39148397,  0.66283998,  0.39148397,  0.50410689,  0.        ]])

In [24]:
# Vocabulary words, in the same order as the columns of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on older releases.
list(vec.get_feature_names_out()
     if hasattr(vec, 'get_feature_names_out')
     else vec.get_feature_names())

['evil', 'horizon', 'of', 'problem', 'queen']

### Imputation of missing data
* Replacing missing values with appropriate substitutes (e.g. the column mean)


In [35]:

import numpy as np
from numpy import nan
# Small feature matrix with two missing entries (NaN), plus a matching target vector.
X = np.array([
    [nan, 0, 3],
    [3, 7, 9],
    [3, 5, 2],
    [4, nan, 6],
    [8, 8, 1],
])
y = np.array([14, 16, -1, 8, -5])

In [36]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer
# strategy='mean' fills each NaN with the mean of the observed values in its column.
imp = SimpleImputer(strategy='mean')
X2 = imp.fit_transform(X)

In [37]:
# Imputed matrix: the two NaN entries are replaced by the means of their columns.
X2

array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])