# Ejemplo de pandas-sklearn


Import what you need from the sklearn_pandas package. The choices are:

- `DataFrameMapper`, a class for mapping pandas data frame columns to different sklearn transformations
- `cross_val_score`, similar to sklearn.cross_validation.cross_val_score but working on pandas DataFrames



In [33]:
import sys

sys.path.append("../..")

In [42]:
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from transformers import LenTransformer, SpaceTransformer, options, BaseTransformer
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.debugger import Tracer

class LenTransformer(BaseTransformer):
    """Clase que agrega len al coso este."""

    def transform(self, data):
        u"""Aplica la transformación."""
        return np.array([len(t) for t in data]).reshape(-1, 1)

class SpaceTransformer(BaseTransformer):
    """Clase que agrega len al coso este."""

    def transform(self, data):
        u"""Aplica la transformación."""
        return np.array([t.count(" ") for t in data]).reshape(-1, 1)


class AddWordsTransformer(BaseTransformer):
    """Agrega counts de varias palabras."""

    def transform(self, data):
        u"""Aplica la transformación."""
        words = [
            "dear", "friend", "hello""$", "earn", "investment", "profit",
            "profits", "credit", "opportunity", "income", "cost" "promotion",
            "why pay more?", "click", "add",
            "meet singles", "viagra", "sex", "penis", "vagina", "pussy",
            "fuck", "girl", "erect", "enlargement"
            "free", "cc:", "gif", "help", "photo", "video", "http", "dollar",
            "million", "|", "nigeria", "million", "password", "of", "bill",
            "it's time", "sale", "hi", "-->", "weight", "lose",
            "administrator", "order", "clearance", "meet singles"
        ]
        
        new_columns = []

        for word in words:
            new_columns.append([t.lower().count(word) for t in data])

        ret = np.array(new_columns)
        
        return ret.transpose()

payload_transformer = make_union(
    SpaceTransformer(),
    LenTransformer(),
    AddWordsTransformer(),
    TfidfVectorizer(**options),
)


extractor = DataFrameMapper([
    ('payload', payload_transformer)
])

In [43]:
from data_builder import load_test_data, load_dev_data, load_small_dev_data


df, target = load_small_dev_data()

print "%s registros" % df.shape[0]

8099 registros


In [44]:
extractor.fit(df, target)

DataFrameMapper(default=False,
        features=[('payload', FeatureUnion(n_jobs=1,
       transformer_list=[('spacetransformer', SpaceTransformer()), ('lentransformer', LenTransformer()), ('addwordstransformer', AddWordsTransformer()), ('tfidfvectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u...        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))],
       transformer_weights=None))],
        sparse=False)

In [45]:
X = extractor.transform(df)


X.shape

(8099, 150)