# Feature engineering

In [1]:
import pandas as pd

## Introducción

## Transformación de los datos

### Transformación con map



In [2]:
# Toy dataset: one row per vegetable together with its price per kilogram.
data = pd.DataFrame(
    {
        'vegetales': [
            'lechuga', 'tomate', 'morrón rojo',
            'morrón verde', 'brócoli', 'acelga',
            'zapallo', 'papa', 'espinaca',
        ],
        # Spelled as floats: the 7.5 entry makes the whole column float64 anyway.
        'precio_kg': [4.0, 3.0, 12.0, 6.0, 7.5, 8.0, 3.0, 5.0, 6.0],
    }
)
data

Unnamed: 0,vegetales,precio_kg
0,lechuga,4.0
1,tomate,3.0
2,morrón rojo,12.0
3,morrón verde,6.0
4,brócoli,7.5
5,acelga,8.0
6,zapallo,3.0
7,papa,5.0
8,espinaca,6.0


In [3]:
# Lookup table from vegetable name to its color.
# Not every vegetable of the dataset is listed here (e.g. 'acelga' has no entry).
vegetal_a_color = dict([
    ('lechuga', 'verde'),
    ('tomate', 'rojo'),
    ('morrón rojo', 'rojo'),
    ('morrón verde', 'verde'),
    ('brócoli', 'verde'),
    ('zapallo', 'naranja'),
])

In [4]:
# Series.map looks every vegetable up in the dict; names that are not keys
# of vegetal_a_color end up as missing values in the new column.
data['color'] = data.vegetales.map(vegetal_a_color)
data

Unnamed: 0,vegetales,precio_kg,color
0,lechuga,4.0,verde
1,tomate,3.0,rojo
2,morrón rojo,12.0,rojo
3,morrón verde,6.0,verde
4,brócoli,7.5,verde
5,acelga,8.0,
6,zapallo,3.0,naranja
7,papa,5.0,
8,espinaca,6.0,


### Discretización y binning


In [5]:
# Sample ages that get bucketed into age ranges further down.
edades = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]



In [6]:
# Bin edges; pd.cut builds right-closed intervals by default: (18, 25], (25, 35], ...
bins = [18, 25, 35, 60, 100]
cats = pd.cut(x=edades, bins=bins)

In [7]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [8]:
edades

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [9]:
# Integer code of the interval each age fell into (0 is the first bin).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

### Variables dummies


In [10]:
# Small frame mixing a text (categorical) column with a numeric one.
df = pd.DataFrame(
    {
        'animales': ['jirafa', 'jirafa', 'koala', 'oso', 'koala', 'jirafa'],
        'unDato': range(6),
    }
)

In [11]:
df

Unnamed: 0,animales,unDato
0,jirafa,0
1,jirafa,1
2,koala,2
3,oso,3
4,koala,4
5,jirafa,5


In [12]:
# One-hot encode the text column; the numeric column is passed through unchanged.
# dtype=int pins the 0/1 indicator representation: without it, pandas >= 2.0
# returns boolean (True/False) dummy columns instead.
pd.get_dummies(df, dtype=int)

Unnamed: 0,unDato,animales_jirafa,animales_koala,animales_oso
0,0,1,0,0
1,1,1,0,0
2,2,0,1,0
3,3,0,0,1
4,4,0,1,0
5,5,1,0,0


### Label Encoder (scikit learn)


In [2]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary; fit() returns the encoder itself, so the call chains.
le = LabelEncoder().fit(["paris", "paris", "tokyo", "amsterdam"])

# classes_ holds the distinct labels in sorted order; a label's position
# in this array is the integer code it is encoded to.
list(le.classes_)



['amsterdam', 'paris', 'tokyo']

In [3]:
# Encode labels with the vocabulary learned by the fitted encoder.
le.transform(["tokyo", "tokyo", "amsterdam"]) 



array([2, 2, 0], dtype=int64)

In [4]:
# Same fitted encoder, different labels: each one maps to its position in classes_.
le.transform(["tokyo", "tokyo", "paris"])

array([2, 2, 1], dtype=int64)

In [5]:
# fit_transform learns the vocabulary from these labels and returns their integer codes in one call.
integer_encoded = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])

In [6]:
integer_encoded

array([1, 1, 2, 0], dtype=int64)

### OneHotEncoder


In [7]:
from sklearn.preprocessing import OneHotEncoder
# Ask for a dense ndarray instead of a SciPy sparse matrix. The keyword was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, so try the new name first and fall back to the
# old one on releases that do not know it yet.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects 2-D input (n_samples, n_features): reshape the 1-D
# integer codes into a single column.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
print(integer_encoded)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


[[1]
 [1]
 [2]
 [0]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
onehot_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

### Imputer

In [18]:
from sklearn.impute import SimpleImputer

In [10]:
from numpy import nan
import numpy as np
# Small numeric matrix with one missing value in each of the first two columns.
X = np.array(
    [
        [nan, 0, 98],
        [3, 7, 99],
        [3, 5, 210],
        [4, nan, 202],
        [8, 8, 101],
    ]
)
X

array([[ nan,   0.,  98.],
       [  3.,   7.,  99.],
       [  3.,   5., 210.],
       [  4.,  nan, 202.],
       [  8.,   8., 101.]])

In [19]:
# Replace every NaN with the mean of the observed values in its column.
imp = SimpleImputer(strategy='mean')
X2 = imp.fit(X).transform(X)

In [20]:
X2

array([[  4.5,   0. ,  98. ],
       [  3. ,   7. ,  99. ],
       [  3. ,   5. , 210. ],
       [  4. ,   5. , 202. ],
       [  8. ,   8. , 101. ]])

### Pipelines

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Two chained steps: fill missing values with the column mean, then
# standardize every column.
pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

In [23]:
X

array([[ nan,   0.,  98.],
       [  3.,   7.,  99.],
       [  3.,   5., 210.],
       [  4.,  nan, 202.],
       [  8.,   8., 101.]])

In [24]:
# Fit both steps of the pipeline on X: first the imputer, then the scaler.
pipe.fit(X)

Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [25]:
# Fit the pipeline on X again and return the imputed, standardized data in a single call.
pipe.fit_transform(X)

array([[ 0.        , -1.81369063, -0.84088374],
       [-0.81348922,  0.72547625, -0.82177275],
       [-0.81348922,  0.        ,  1.2995476 ],
       [-0.27116307,  0.        ,  1.14665965],
       [ 1.89814151,  1.08821438, -0.78355076]])

## Detección de outliers

Existen dos tipos de outliers:

- **Univariantes:** se refieren a una única columna del dataframe.

- **Multivariantes:** dependen de la combinación de múltiples columnas, por lo que quizás no se vean a simple vista.

### Filtrando y transformando outliers

In [None]:
import seaborn as sns

In [None]:
# White-grid look for the figures that follow.
sns.set(style="whitegrid")

# Seaborn's "tips" example dataset.
tips = sns.load_dataset("tips")

# Box plot of the bill amounts, selecting the column by name from the frame.
ax = sns.boxplot(data=tips, x="total_bill")

In [None]:
q75, q25 = np.percentile(tips.total_bill, [75 ,25])
iqr = q75 - q25
 
min = q25 - (iqr*1.5)
max = q75 + (iqr*1.5)

In [None]:
tips.total_bill[tips.total_bill > max]

In [None]:
tips.total_bill[tips.total_bill < max]