# Feature Engineering in NLP:
### 1. Bag of Words using `CountVectorizer` from sklearn

- What is BoW?
- Steps to create BoW representation of a corpus?
- Applications/uses of BoW?

In [1]:
corpus = ["Matt is a fan of football",
          "He also likes to cook occasionally",
          "He is a nice guy"]
corpus

['Matt is a fan of football',
 'He also likes to cook occasionally',
 'He is a nice guy']

In [2]:
# create an object of countvectorizer class
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
print(type(cv))

<class 'sklearn.feature_extraction.text.CountVectorizer'>


In [3]:
# methods and attributes in CountVectorizer object
print(dir(cv))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_char_ngrams', '_char_wb_ngrams', '_check_feature_names', '_check_n_features', '_check_stop_words_consistency', '_check_vocabulary', '_count_vocab', '_get_param_names', '_get_tags', '_limit_features', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_sort_features', '_validate_data', '_validate_params', '_validate_vocabulary', '_warn_for_unused_params', '_white_spaces', '_word_ngrams', 'analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'get_feature_names', 'get_feature_names_out', 'get_params',

In [4]:
# generate vocabulary dictionary and return a Document Turn Matrix
bow = cv.fit_transform(corpus)

In [5]:
# to print vocabulary
print(cv.vocabulary_)

{'matt': 8, 'is': 6, 'fan': 2, 'of': 11, 'football': 3, 'he': 5, 'also': 0, 'likes': 7, 'to': 12, 'cook': 1, 'occasionally': 10, 'nice': 9, 'guy': 4}


In [6]:
cv.get_feature_names_out()

array(['also', 'cook', 'fan', 'football', 'guy', 'he', 'is', 'likes',
       'matt', 'nice', 'occasionally', 'of', 'to'], dtype=object)

>- Note that single character words are not there in vocabulary

In [7]:
bow.shape

(3, 13)

In [8]:
bow

<3x13 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [9]:
# since bow is a sparse matrix, so to change it to dense matrix or array, we can use numpy toarray()
bow.toarray()

array([[0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0]])

In [10]:
bow.todense()

matrix([[0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0],
        [1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0]])

##### Let save the corpus as DTM of BoW representation

In [11]:
import pandas as pd
dtm = pd.DataFrame(data=bow.todense(), columns=cv.get_feature_names_out())
dtm

Unnamed: 0,also,cook,fan,football,guy,he,is,likes,matt,nice,occasionally,of,to
0,0,0,1,1,0,0,1,0,1,0,0,1,0
1,1,1,0,0,0,1,0,1,0,0,1,0,1
2,0,0,0,0,1,1,1,0,0,1,0,0,0


In [12]:
import pandas as pd
dtm = pd.DataFrame(data=bow.todense())
dtm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0,1,1,0,0,1,0,1,0,0,1,0
1,1,1,0,0,0,1,0,1,0,0,1,0,1
2,0,0,0,0,1,1,1,0,0,1,0,0,0


### 2. Term Frequency - Inverse Document Frequency(TF-IDF)

In [13]:
corpus = ["Matt is a fan of football",
          "He also likes to cook occasionally",
          "He is a nice guy"]
corpus

['Matt is a fan of football',
 'He also likes to cook occasionally',
 'He is a nice guy']

In [15]:
# create an object of TfidfVectorizer class
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
print(type(tfidf_vec))

<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [16]:
print(dir(tfidf_vec))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_char_ngrams', '_char_wb_ngrams', '_check_feature_names', '_check_n_features', '_check_params', '_check_stop_words_consistency', '_check_vocabulary', '_count_vocab', '_get_param_names', '_get_tags', '_limit_features', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_sort_features', '_validate_data', '_validate_params', '_validate_vocabulary', '_warn_for_unused_params', '_white_spaces', '_word_ngrams', 'analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'get_feature_names', 'get_feature_names_ou

In [17]:
# generate vocabulary dictionary and return a Document Turn Matrix having tfidt values
tfidf = tfidf_vec.fit_transform(corpus)


In [18]:
# to print vocabulary
print(tfidf_vec.vocabulary_)

{'matt': 8, 'is': 6, 'fan': 2, 'of': 11, 'football': 3, 'he': 5, 'also': 0, 'likes': 7, 'to': 12, 'cook': 1, 'occasionally': 10, 'nice': 9, 'guy': 4}


In [19]:
print(tfidf_vec.get_feature_names_out())

['also' 'cook' 'fan' 'football' 'guy' 'he' 'is' 'likes' 'matt' 'nice'
 'occasionally' 'of' 'to']


In [20]:
tfidf.shape

(3, 13)

In [21]:
tfidf

<3x13 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [22]:
# since bow is a sparse matrix, so to change it to dense matrix or array, we can use numpy toarray()
tfidf.toarray()

array([[0.        , 0.        , 0.46735098, 0.46735098, 0.        ,
        0.        , 0.35543247, 0.        , 0.46735098, 0.        ,
        0.        , 0.46735098, 0.        ],
       [0.42339448, 0.42339448, 0.        , 0.        , 0.        ,
        0.32200242, 0.        , 0.42339448, 0.        , 0.        ,
        0.42339448, 0.        , 0.42339448],
       [0.        , 0.        , 0.        , 0.        , 0.5628291 ,
        0.42804604, 0.42804604, 0.        , 0.        , 0.5628291 ,
        0.        , 0.        , 0.        ]])

In [23]:
tfidf.todense()

matrix([[0.        , 0.        , 0.46735098, 0.46735098, 0.        ,
         0.        , 0.35543247, 0.        , 0.46735098, 0.        ,
         0.        , 0.46735098, 0.        ],
        [0.42339448, 0.42339448, 0.        , 0.        , 0.        ,
         0.32200242, 0.        , 0.42339448, 0.        , 0.        ,
         0.42339448, 0.        , 0.42339448],
        [0.        , 0.        , 0.        , 0.        , 0.5628291 ,
         0.42804604, 0.42804604, 0.        , 0.        , 0.5628291 ,
         0.        , 0.        , 0.        ]])

##### Let save the corpus as DTM of TF-IDF representation

In [24]:
import pandas as pd
dtm = pd.DataFrame(data=tfidf.todense(), columns=tfidf_vec.get_feature_names_out())
dtm

Unnamed: 0,also,cook,fan,football,guy,he,is,likes,matt,nice,occasionally,of,to
0,0.0,0.0,0.467351,0.467351,0.0,0.0,0.355432,0.0,0.467351,0.0,0.0,0.467351,0.0
1,0.423394,0.423394,0.0,0.0,0.0,0.322002,0.0,0.423394,0.0,0.0,0.423394,0.0,0.423394
2,0.0,0.0,0.0,0.0,0.562829,0.428046,0.428046,0.0,0.0,0.562829,0.0,0.0,0.0


## Word embedding Techniques
#### Word2Vec using SpaCy `en_core_web_lg` model

In [None]:
# import and load the language library
import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
len(nlp.vocabcab.vectors)

In [None]:
nlp.vocab.vectors.shape

#### Vectors representation of words

In [None]:
nlp(u'king').vector

In [None]:
nlp(u'man').vector

In [None]:
nlp(u'Ehtisham').vector

## Feature Engineering in ML

#### Imputation using `SimpleImputer()`

In [26]:
from sklearn.impute import SimpleImputer
si = SimpleImputer()
print(dir(si))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_feature_names', '_check_n_features', '_concatenate_indicator', '_concatenate_indicator_feature_names_out', '_dense_fit', '_fit_indicator', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_sparse_fit', '_transform_indicator', '_validate_data', '_validate_input', 'add_indicator', 'copy', 'fill_value', 'fit', 'fit_transform', 'get_feature_names_out', 'get_params', 'inverse_transform', 'missing_values', 'set_params', 'strategy', 'transform', 'verbose']


In [None]:
# si.strategy()

#### One Hot/Categorical Encoding using `LabelEncoder`

In [27]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()

In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#### Logarithmic Transformation using `FunctionTransformer`

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
 
# Load dataset
data = pd.read_csv('datasets/train.csv')
 
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0
    )
 
# set up the variable transformer
tf = FunctionTransformer(np.log)
 
# fit the transformer
tf.fit(X_train[['LotArea', 'GrLivArea']])
 
# transform the data
train_t = tf.transform(X_train[['LotArea', 'GrLivArea']])
test_t = tf.transform(X_test[['LotArea', 'GrLivArea']])

In [46]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape


((1022, 79), (1022,), (438, 79), (438,))

In [41]:
data.shape

(1460, 81)

In [30]:
data[['LotArea', 'GrLivArea']].head()

Unnamed: 0,LotArea,GrLivArea
0,8450,1710
1,9600,1262
2,11250,1786
3,9550,1717
4,14260,2198


In [32]:
train_t.head()

Unnamed: 0,LotArea,GrLivArea
64,9.145802,7.61776
682,7.967973,7.163172
960,8.882808,6.754604
1384,9.111624,7.137278
1100,9.035987,6.082219


In [33]:
test_t.head()

Unnamed: 0,LotArea,GrLivArea
529,10.394151,7.830028
491,9.157994,7.363914
459,8.855806,7.092574
279,9.21084,7.611842
655,7.426549,6.995766


#### Discretization using `KBinsDiscretizer`

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
 
# Load dataset
data = data = pd.read_csv('datasets/train.csv')
 
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0
    )
 
# set up the discretisation transformer
disc = KBinsDiscretizer(n_bins=10, strategy='quantile')
 
# fit the transformer
disc.fit(X_train[['LotArea', 'GrLivArea']])
 
# transform the data
train_t = disc.transform(X_train[['LotArea', 'GrLivArea']])
test_t = disc.transform(X_test[['LotArea', 'GrLivArea']])

In [35]:
data[['LotArea', 'GrLivArea']].head()

Unnamed: 0,LotArea,GrLivArea
0,8450,1710
1,9600,1262
2,11250,1786
3,9550,1717
4,14260,2198


In [40]:
train_t

<1022x20 sparse matrix of type '<class 'numpy.float64'>'
	with 2044 stored elements in Compressed Sparse Row format>

In [38]:
train_t.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [39]:
test_t.toarray()

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
from sklearn.p