Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
54 lines (35 sloc) 2.12 KB
"""Extract common meta features from text"""
from h2oaicore.transformer_utils import CustomTransformer
import datatable as dt
import numpy as np
import string
class WordBaseTransformer:
    """Shared mixin for the text meta-feature transformers in this module.

    Supplies the default column properties and a ``fit_transform`` that
    delegates to ``self.transform``. This class does not define
    ``transform`` itself; it must come from the class it is mixed into.
    """

    @staticmethod
    def get_default_properties():
        """Return the default transformer properties.

        Returns:
            dict with ``col_type="text"``, ``min_cols=1``, ``max_cols=1``
            and ``relative_importance=1``.
        """
        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)

    def fit_transform(self, X: dt.Frame, y: np.ndarray = None):
        """Transform ``X`` directly; no state is fitted.

        Args:
            X: input frame, handed unchanged to ``self.transform``.
            y: accepted but not used by this method.

        Returns:
            Whatever ``self.transform(X)`` returns.
        """
        # Annotated with np.ndarray (the array type); np.array is only the
        # factory function and is not a valid type annotation.
        return self.transform(X)
class CountWordsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the whitespace-separated words in each row of the first column."""

    def transform(self, X: dt.Frame):
        """Return the per-row word count of the first column of ``X``.

        Values are cast with ``astype(str)`` first, so every cell is
        measured through its string representation.
        """

        def word_count(text):
            # split() with no arguments splits on runs of whitespace.
            tokens = text.split()
            return len(tokens)

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(word_count)
class CountUniqueWordsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the distinct whitespace-separated words in each row."""

    def transform(self, X: dt.Frame):
        """Return the per-row number of distinct words in the first column.

        Values are cast with ``astype(str)`` first; words are compared
        exactly (case-sensitively), as produced by ``str.split()``.
        """

        def distinct_word_count(text):
            distinct = set(text.split())
            return len(distinct)

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(distinct_word_count)
class CountUpperWordsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the words in each row for which ``str.isupper()`` is true."""

    def transform(self, X: dt.Frame):
        """Return the per-row count of upper-case words in the first column.

        Values are cast with ``astype(str)`` and split on whitespace; a word
        is counted when ``word.isupper()`` holds.
        """

        def upper_word_count(text):
            return sum(1 for token in text.split() if token.isupper())

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(upper_word_count)
class CountNumericWordsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the words in each row for which ``str.isnumeric()`` is true."""

    def transform(self, X: dt.Frame):
        """Return the per-row count of numeric words in the first column.

        Values are cast with ``astype(str)`` and split on whitespace; a word
        is counted when ``word.isnumeric()`` holds.
        """

        def numeric_word_count(text):
            # Summing the booleans yields an int count of matching words.
            return sum(map(str.isnumeric, text.split()))

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(numeric_word_count)
class CountUpperCharsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the characters in each row for which ``str.isupper()`` is true."""

    def transform(self, X: dt.Frame):
        """Return the per-row count of upper-case characters.

        Operates on the first column of ``X`` after casting it with
        ``astype(str)``; every character of the string is inspected.
        """

        def upper_char_count(text):
            # Summing the booleans yields an int count of matching characters.
            return sum(map(str.isupper, text))

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(upper_char_count)
class CountNumericCharsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the characters in each row for which ``str.isnumeric()`` is true."""

    def transform(self, X: dt.Frame):
        """Return the per-row count of numeric characters.

        Operates on the first column of ``X`` after casting it with
        ``astype(str)``; every character of the string is inspected.
        """

        def numeric_char_count(text):
            return sum(1 for ch in text if ch.isnumeric())

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(numeric_char_count)
class CountPunctCharsTransformer(WordBaseTransformer, CustomTransformer):
    """Count the characters in each row that occur in ``string.punctuation``."""

    def transform(self, X: dt.Frame):
        """Return the per-row count of punctuation characters.

        Operates on the first column of ``X`` after casting it with
        ``astype(str)``. Only the characters listed in
        ``string.punctuation`` are counted.
        """
        punctuation_chars = frozenset(string.punctuation)

        def punct_char_count(text):
            return sum(1 for ch in text if ch in punctuation_chars)

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(punct_char_count)
class MeanWordLengthTransformer(WordBaseTransformer, CustomTransformer):
    """Compute the mean length of the whitespace-separated words in each row."""

    def transform(self, X: dt.Frame):
        """Return the per-row mean word length of the first column of ``X``.

        Values are cast with ``astype(str)`` and split on whitespace. Rows
        that contain no words (empty or whitespace-only text) yield NaN.

        Returns:
            The result of applying the per-row computation to the first
            column (one float per row).
        """

        def mean_word_length(text):
            lengths = [len(word) for word in str(text).split()]
            if not lengths:
                # np.mean([]) also gives NaN but emits a RuntimeWarning for
                # every such row; return NaN explicitly instead.
                return np.nan
            return np.mean(lengths)

        texts = X.to_pandas().astype(str).iloc[:, 0]
        return texts.apply(mean_word_length)
You can’t perform that action at this time.