## Example of extracting text features from data frame columns containing text

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer  # Equivalent to CountVectorizer followed by TfidfTransformer

In [53]:
import pandas as pd
import numpy as np
import itertools

In [54]:
# Toy corpus: two text columns with three short documents each.
# Each column is written as one comma-separated string; splitting on ","
# alone means the 2nd and 3rd documents keep a leading space.
d = pd.DataFrame(
    {
        column: text.split(",")
        for column, text in [
            ("col1", "interesting read and stuff, this is quite read indeed, superflow in the the flowin"),
            ("col2", "interesting weeqa is stuff, this is quite read you, superflow in the you flowin"),
        ]
    }
)

In [55]:
# Display the toy frame: one row per document pair, one column per text field.
d

Unnamed: 0,col1,col2
0,interesting read and stuff,interesting weeqa is stuff
1,this is quite read indeed,this is quite read you
2,superflow in the the flowin,superflow in the you flowin


In [56]:
# TF-IDF over unigrams and bigrams.
# - token_pattern r'\b\w+\b' also accepts single-character tokens.
# - English stop words are dropped before n-grams are formed, so
#   "read and stuff" contributes the bigram "read stuff".
# - min_df=1 keeps every term that occurs in at least one document.
vectorizer = TfidfVectorizer(
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
    stop_words="english",
    strip_accents="ascii",
    min_df=1,
)

In [57]:
# Fit one shared vocabulary on the documents of both text columns and
# transform them in the same pass. Row order of X: all of col1's documents
# first, followed by all of col2's documents.
X = vectorizer.fit_transform(
    itertools.chain.from_iterable(d[column] for column in ("col1", "col2"))
)

In [58]:
# Convert X to a dense array for inspection (fine here: the matrix is tiny).
# Rows are documents in fit order; columns are the vocabulary terms.
X.toarray()

array([[ 0.        ,  0.41932846,  0.51136725,  0.        ,  0.        ,
         0.        ,  0.3540259 ,  0.51136725,  0.41932846,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.60714432,
         0.60714432,  0.51259296,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.57735027,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.57735027,
         0.57735027,  0.        ,  0.        ],
       [ 0.        ,  0.39339985,  0.        ,  0.47974754,  0.        ,
         0.        ,  0.        ,  0.        ,  0.39339985,  0.        ,
         0.        ,  0.47974754,  0.47974754],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.60714432,
         0.60714432,  0.51259296,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.57735027,  0.      

In [59]:
# List the learned terms in column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# fall back to the legacy method otherwise. Either way a plain list of
# Python strings is displayed.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['flowin',
 'interesting',
 'interesting read',
 'interesting weeqa',
 'quite',
 'quite read',
 'read',
 'read stuff',
 'stuff',
 'superflow',
 'superflow flowin',
 'weeqa',
 'weeqa stuff']

In [61]:
# Show the fitted mapping from each term (unigram or bigram) to its column index in X.
vectorizer.vocabulary_

{'flowin': 0,
 'interesting': 1,
 'interesting read': 2,
 'interesting weeqa': 3,
 'quite': 4,
 'quite read': 5,
 'read': 6,
 'read stuff': 7,
 'stuff': 8,
 'superflow': 9,
 'superflow flowin': 10,
 'weeqa': 11,
 'weeqa stuff': 12}