In [1]:
import json

def pp(a_dict):
    """Pretty-print ``a_dict`` to stdout as JSON indented by four spaces.

    Args:
        a_dict: A JSON-serializable object (a dict in this notebook).
    """
    rendered = json.dumps(a_dict, indent=4)
    print(rendered)

In [2]:
from mlblocks.discovery import find_pipelines, load_pipeline, load_primitive

In [3]:
# Search by metadata field rather than by name.
# NOTE(review): the dotted key presumably addresses the nested
# "metadata" -> "data_type" entry of each JSON spec — confirm against
# the mlblocks.discovery documentation.
filters = {'metadata.data_type': 'text'}
find_pipelines(filters=filters)

['keras.Sequential.LSTMBinaryTextClassifier',
 'keras.Sequential.LSTMTextClassifier',
 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
 'mlprimitives.custom.feature_extraction.DatetimeFeaturizer',
 'mlprimitives.custom.feature_extraction.StringVectorizer',
 'mlprimitives.custom.text.TextCleaner',
 'text.classification.lstm']

In [4]:
# Positional search: every name returned below contains 'text'.
# NOTE(review): presumably the argument is matched against the name — confirm
# whether it is treated as a substring or a regular expression.
find_pipelines('text')

['mlprimitives.custom.text.TextCleaner',
 'single_table.classification.text',
 'single_table.regression.text',
 'text.classification.lstm']

In [5]:
# Load the 'single_table.classification.text' spec by name and print it as indented JSON.
pp(load_pipeline('single_table.classification.text'))

{
    "metadata": {
        "name": "single_table/classification/text",
        "data_type": "single_table",
        "task_type": "classification"
    },
    "validation": {
        "dataset": "personae"
    },
    "primitives": [
        "mlprimitives.custom.preprocessing.ClassEncoder",
        "mlprimitives.custom.text.TextCleaner",
        "mlprimitives.custom.feature_extraction.StringVectorizer",
        "sklearn.impute.SimpleImputer",
        "sklearn.ensemble.RandomForestClassifier",
        "mlprimitives.custom.preprocessing.ClassDecoder"
    ],
    "init_params": {
        "mlprimitives.custom.text.TextCleaner": {
            "column": "text"
        }
    }
}


In [6]:
# Load the 'single_table.regression.text' spec and keep it in pipeline_dict.
pipeline_dict = load_pipeline('single_table.regression.text')

In [7]:
# Print the loaded spec as indented JSON.
pp(pipeline_dict)

{
    "metadata": {
        "name": "single_table/regression/text",
        "data_type": "single_table",
        "task_type": "regression"
    },
    "validation": {
        "dataset": "reviews"
    },
    "primitives": [
        "mlprimitives.custom.feature_extraction.CategoricalEncoder",
        "mlprimitives.custom.feature_extraction.StringVectorizer",
        "sklearn.impute.SimpleImputer",
        "xgboost.XGBRegressor"
    ]
}


In [13]:
from mlprimitives.datasets import _load, _load_csv, Dataset
from sklearn.metrics import r2_score

# NOTE(review): _load and _load_csv are underscore-prefixed, i.e. private by
# Python convention, so they may change without notice — confirm whether a
# public loader exists for this dataset.
dataset_path = _load('reviews')

# Read the 'data' table, then split off the 'evaluation' column as the target.
# pop() removes the column from X, so X keeps only the remaining columns.
X = _load_csv(dataset_path, 'data')
y = X.pop('evaluation').values

# Wrap the features and target in a Dataset that is scored with r2_score.
# NOTE(review): the positional arguments are presumably (description, data,
# target, scorer, modality, task type, task subtype) — confirm against the
# Dataset signature; the '\n' first argument looks like a placeholder.
reviews = Dataset('\n', X, y, r2_score, 'text', 'regression', 'univariate')

In [14]:
# Preview the first rows of the feature table held by the dataset.
reviews.data.head()

Unnamed: 0,confidence,id,lan,orientation,remarks,text,timespan
0,4.0,1,es,0,,- El artículo aborda un problema contingente y...,2010-07-05
1,4.0,2,es,1,,El artículo presenta recomendaciones prácticas...,2010-07-05
2,5.0,3,es,1,,- El tema es muy interesante y puede ser de mu...,2010-07-05
3,4.0,1,es,1,,Se explica en forma ordenada y didáctica una e...,2010-07-05
4,4.0,2,es,0,,,2010-07-05


In [15]:
# First five target values.
reviews.target[0:5]

array([1, 1, 1, 2, 2])

In [16]:
import pandas as pd

# Distinct target values, in order of first appearance.
pd.unique(reviews.target)

array([ 1,  2,  0, -2, -1])

In [27]:
# Hand-written variant of the 'single_table/regression/text' spec: it adds two
# TextCleaner steps and a feature selector to the primitives of the loaded spec.
pipeline_dict = {
    "metadata": {
        "name": "single_table/regression/text",
        "data_type": "single_table",
        "task_type": "regression",
    },
    "validation": {"dataset": "reviews"},
    "primitives": [
        # TextCleaner appears twice; each occurrence is configured separately
        # under "init_params" below.
        "mlprimitives.custom.text.TextCleaner",
        "mlprimitives.custom.text.TextCleaner",
        "mlprimitives.custom.feature_extraction.StringVectorizer",
        "mlprimitives.custom.feature_extraction.CategoricalEncoder",
        "sklearn.impute.SimpleImputer",
        "mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector",
        "xgboost.XGBRegressor",
    ],
    # NOTE(review): the "#1" / "#2" suffixes presumably select the first and
    # second occurrence of the repeated primitive — confirm against the
    # MLPipeline documentation.
    "init_params": {
        "mlprimitives.custom.text.TextCleaner#1": {"column": "text", "stopwords": False},
        "mlprimitives.custom.text.TextCleaner#2": {"column": "remarks", "stopwords": False},
    },
}

In [28]:
from mlblocks import MLPipeline

# Build the pipeline from the in-memory dict spec rather than from a registered name.
pipeline = MLPipeline(pipeline_dict)

In [29]:
# List the primitive names held by the pipeline, to check them against the spec.
pipeline.primitives

['mlprimitives.custom.text.TextCleaner',
 'mlprimitives.custom.text.TextCleaner',
 'mlprimitives.custom.feature_extraction.StringVectorizer',
 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
 'sklearn.impute.SimpleImputer',
 'mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector',
 'xgboost.XGBRegressor']

In [30]:
# Ask the dataset for train/test partitions of its data and target.
# NOTE(review): the argument is presumably the number of splits — confirm.
# NOTE(review): no random seed is visible here, so the split (and the score
# further down) may differ between runs — confirm and pin one if available.
X_train, X_test, y_train, y_test = reviews.get_splits(1)

In [31]:
# Fit the whole pipeline on the training partition.
pipeline.fit(X_train, y_train)





In [32]:
# Predict targets for the held-out partition with the fitted pipeline.
pred = pipeline.predict(X_test)

In [33]:
# First five true targets, for comparison with the predictions shown next.
y_test[0:5]

array([-1,  2,  1,  2, -2])

In [34]:
# First five predictions; these are floats, whereas the true targets are integers.
pred[0:5]

array([-0.80070484,  1.2240283 ,  1.0889438 , -1.1889812 , -1.9127965 ],
      dtype=float32)

In [35]:
# Score the predictions against the held-out targets.
# NOTE(review): reviews was constructed with r2_score, so this is presumably
# the R² of the predictions — confirm how Dataset.score applies its scorer.
reviews.score(y_test, pred)

0.5639611484207994