# Кодирование качественных признаков

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [2]:
df = pd.read_csv("datasets/gender_classification.csv")
cols = ['forehead_height_cm', 'forehead_width_cm', 'lips_thin', 'nose_wide', 'nose_long', 'distance_nose_to_lip_long']
df = df[cols]
df

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,6.1,11.8,1,1,0,1
1,5.4,14.0,1,0,0,0
2,6.3,11.8,1,1,1,1
3,6.1,14.4,1,0,1,1
4,5.9,13.5,0,0,0,0
...,...,...,...,...,...,...
4996,5.1,13.6,0,0,0,0
4997,5.4,11.9,0,0,0,0
4998,5.7,12.9,0,0,0,0
4999,6.2,13.2,0,0,0,0


### Выбираем признаки для масштабирования

In [3]:
cols_for_scale = ['forehead_height_cm', 'lips_thin', 'nose_wide', 'nose_long', 'distance_nose_to_lip_long']
X_train = df[cols_for_scale]
X_train

Unnamed: 0,forehead_height_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,6.1,1,1,0,1
1,5.4,1,0,0,0
2,6.3,1,1,1,1
3,6.1,1,0,1,1
4,5.9,0,0,0,0
...,...,...,...,...,...
4996,5.1,0,0,0,0
4997,5.4,0,0,0,0
4998,5.7,0,0,0,0
4999,6.2,0,0,0,0


### MinMaxScaler используется для масштабирования данных. 

In [4]:
# scaler = StandardScaler() #создает объект StandardScaler
scaler = MinMaxScaler()
scaler.fit(X_train) #подгоняет scaler к данным в X_train с помощью метода fit
scaler

### Преобразовываем набор данных с использованием scaler. 

In [5]:
arr = scaler.transform(X_train)
DfTr = pd.DataFrame(arr, columns=cols_for_scale)
DfTr

Unnamed: 0,forehead_height_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,0.50,1.0,1.0,0.0,1.0
1,0.15,1.0,0.0,0.0,0.0
2,0.60,1.0,1.0,1.0,1.0
3,0.50,1.0,0.0,1.0,1.0
4,0.40,0.0,0.0,0.0,0.0
...,...,...,...,...,...
4996,0.00,0.0,0.0,0.0,0.0
4997,0.15,0.0,0.0,0.0,0.0
4998,0.30,0.0,0.0,0.0,0.0
4999,0.55,0.0,0.0,0.0,0.0


In [6]:
DfTr.describe()

Unnamed: 0,forehead_height_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
count,5001.0,5001.0,5001.0,5001.0,5001.0
mean,0.423155,0.493101,0.493901,0.507898,0.4989
std,0.270634,0.500002,0.500013,0.499988,0.500049
min,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.0,0.0,0.0,0.0
50%,0.4,0.0,0.0,1.0,0.0
75%,0.65,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0


### Создаем новый столбец forehead_width и разбиваем значение столбца forehead_width_cm на три категории.

In [7]:
df['forehead_width'] = pd.cut(df['forehead_width_cm'], bins=[0, 12, 14, 20], labels=["Маленький", "Средний", "Большой"])
df = df.drop('forehead_width_cm', axis=1)
df

Unnamed: 0,forehead_height_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long,forehead_width
0,6.1,1,1,0,1,Маленький
1,5.4,1,0,0,0,Средний
2,6.3,1,1,1,1,Маленький
3,6.1,1,0,1,1,Большой
4,5.9,0,0,0,0,Средний
...,...,...,...,...,...,...
4996,5.1,0,0,0,0,Средний
4997,5.4,0,0,0,0,Маленький
4998,5.7,0,0,0,0,Средний
4999,6.2,0,0,0,0,Средний


In [8]:
df.dtypes

forehead_height_cm            float64
lips_thin                       int64
nose_wide                       int64
nose_long                       int64
distance_nose_to_lip_long       int64
forehead_width               category
dtype: object

In [9]:
df["forehead_width"].value_counts().sort_index()

Маленький    1002
Средний      2780
Большой      1219
Name: forehead_width, dtype: int64

### Выбирает столбец с типом данных 'category':

In [10]:
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include='category')
categorical_columns = categorical_columns_selector(df)
categorical_columns

['forehead_width']

In [11]:
df_cat = df[categorical_columns] #отберем только те столбцы в которых присутствует категориальная перемнная
df_cat.head()

Unnamed: 0,forehead_width
0,Маленький
1,Средний
2,Маленький
3,Большой
4,Средний


### Преобразовываем категориальные значения в числовые:

In [12]:
from sklearn.preprocessing import OrdinalEncoder

cat_column = df_cat[["forehead_width"]]

print(df_cat['forehead_width'].unique()) #посмотрим какие виды образования присутствуют в выборке

encoder = OrdinalEncoder()   # вызываем кодировщик
encoder  = encoder.fit(df_cat[['forehead_width']])
df_cat['forehead_width'] = encoder.transform(df_cat[['forehead_width']])
df_cat

['Маленький', 'Средний', 'Большой']
Categories (3, object): ['Маленький' < 'Средний' < 'Большой']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat['forehead_width'] = encoder.transform(df_cat[['forehead_width']])


Unnamed: 0,forehead_width
0,1.0
1,2.0
2,1.0
3,0.0
4,2.0
...,...
4996,2.0
4997,1.0
4998,2.0
4999,2.0


In [13]:
print(encoder.categories_)
print(len(encoder.categories_[0]))
print(df_cat['forehead_width'].unique())
df_cat['forehead_width'].unique().size #количество уникальных значений в столбце

[array(['Большой', 'Маленький', 'Средний'], dtype=object)]
3
[1. 2. 0.]


3

### Итоговая выборка:


In [14]:
df['forehead_width'] = df_cat['forehead_width']
df[cols_for_scale] = arr
df

Unnamed: 0,forehead_height_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long,forehead_width
0,0.50,1.0,1.0,0.0,1.0,1.0
1,0.15,1.0,0.0,0.0,0.0,2.0
2,0.60,1.0,1.0,1.0,1.0,1.0
3,0.50,1.0,0.0,1.0,1.0,0.0
4,0.40,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...
4996,0.00,0.0,0.0,0.0,0.0,2.0
4997,0.15,0.0,0.0,0.0,0.0,1.0
4998,0.30,0.0,0.0,0.0,0.0,2.0
4999,0.55,0.0,0.0,0.0,0.0,2.0
