# Кодирование качественных признаков

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
df = pd.read_csv("datasets/gender_classification.csv")
cols = ['forehead_height_cm', 'forehead_width_cm', 'lips_thin', 'nose_wide', 'nose_long', 'distance_nose_to_lip_long']
df = df[cols]
df

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,6.1,11.8,1,1,0,1
1,5.4,14.0,1,0,0,0
2,6.3,11.8,1,1,1,1
3,6.1,14.4,1,0,1,1
4,5.9,13.5,0,0,0,0
...,...,...,...,...,...,...
4996,5.1,13.6,0,0,0,0
4997,5.4,11.9,0,0,0,0
4998,5.7,12.9,0,0,0,0
4999,6.2,13.2,0,0,0,0


### Выбираем столбцы с индаксами с 8 до 12

In [3]:
X_train = df.iloc[:, :]
X_train

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,6.1,11.8,1,1,0,1
1,5.4,14.0,1,0,0,0
2,6.3,11.8,1,1,1,1
3,6.1,14.4,1,0,1,1
4,5.9,13.5,0,0,0,0
...,...,...,...,...,...,...
4996,5.1,13.6,0,0,0,0
4997,5.4,11.9,0,0,0,0
4998,5.7,12.9,0,0,0,0
4999,6.2,13.2,0,0,0,0


### StandardScaler используется для масштабирования данных. 

In [4]:
scaler = StandardScaler() #создает объект StandardScaler
scaler.fit(X_train) #подгоняет scaler к данным в X_train с помощью метода fit
scaler

### Выводим математическое ожидание и дисперсию для каждого признака и преобразовываем набор данных с использованием scaler. 

In [5]:
print('Мат ожидание', scaler.mean_)
print('Дисперсия', scaler.var_)
print('Преобразованный набор')
arr = scaler.transform(X_train)
DfTr = pd.DataFrame(arr, columns=cols)
DfTr

Мат ожидание [ 5.94631074 13.1814837   0.49310138  0.49390122  0.50789842  0.49890022]
Дисперсия [0.2929123  1.22548798 0.24995241 0.2499628  0.24993761 0.24999879]
Преобразованный набор


Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
0,0.283971,-1.247933,1.013894,1.012273,-1.015924,1.002202
1,-1.009418,0.739389,1.013894,-0.987876,-1.015924,-0.997803
2,0.653511,-1.247933,1.013894,1.012273,0.984326,1.002202
3,0.283971,1.100720,1.013894,-0.987876,0.984326,1.002202
4,-0.085568,0.287725,-0.986297,-0.987876,-1.015924,-0.997803
...,...,...,...,...,...,...
4996,-1.563727,0.378057,-0.986297,-0.987876,-1.015924,-0.997803
4997,-1.009418,-1.157600,-0.986297,-0.987876,-1.015924,-0.997803
4998,-0.455108,-0.254272,-0.986297,-0.987876,-1.015924,-0.997803
4999,0.468741,0.016726,-0.986297,-0.987876,-1.015924,-0.997803


In [6]:
df.describe()

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long
count,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0
mean,5.946311,13.181484,0.493101,0.493901,0.507898,0.4989
std,0.541268,1.107128,0.500002,0.500013,0.499988,0.500049
min,5.1,11.4,0.0,0.0,0.0,0.0
25%,5.5,12.2,0.0,0.0,0.0,0.0
50%,5.9,13.1,0.0,0.0,1.0,0.0
75%,6.4,14.0,1.0,1.0,1.0,1.0
max,7.1,15.5,1.0,1.0,1.0,1.0


### Создаем новый столбец forehead_width и разбиваем значение столбца forehead_width_cm на три категории.

In [7]:
df['forehead_width'] = pd.cut(df['forehead_width_cm'], bins=[0, 12, 14, 20], labels=["Маленький", "Средний", "Большой"])
df

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long,forehead_width
0,6.1,11.8,1,1,0,1,Маленький
1,5.4,14.0,1,0,0,0,Средний
2,6.3,11.8,1,1,1,1,Маленький
3,6.1,14.4,1,0,1,1,Большой
4,5.9,13.5,0,0,0,0,Средний
...,...,...,...,...,...,...,...
4996,5.1,13.6,0,0,0,0,Средний
4997,5.4,11.9,0,0,0,0,Маленький
4998,5.7,12.9,0,0,0,0,Средний
4999,6.2,13.2,0,0,0,0,Средний


In [8]:
df.dtypes

forehead_height_cm            float64
forehead_width_cm             float64
lips_thin                       int64
nose_wide                       int64
nose_long                       int64
distance_nose_to_lip_long       int64
forehead_width               category
dtype: object

In [9]:
df["forehead_width"].value_counts().sort_index()

Маленький    1002
Средний      2780
Большой      1219
Name: forehead_width, dtype: int64

### Выбирает столбец с типом данных 'category':

In [10]:
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include='category')
categorical_columns = categorical_columns_selector(df)
categorical_columns

['forehead_width']

In [11]:
df_cat = df[categorical_columns] #отберем только те столбцы в которых присутствует категориальная перемнная
df_cat.head()

Unnamed: 0,forehead_width
0,Маленький
1,Средний
2,Маленький
3,Большой
4,Средний


### Преобразовываем категориальные значения в числовые:

In [12]:
from sklearn.preprocessing import OrdinalEncoder

cat_column = df_cat[["forehead_width"]]

print(df_cat['forehead_width'].unique()) #посмотрим какие виды образования присутствуют в выборке

encoder = OrdinalEncoder()   # вызываем кодировщик
encoder  = encoder.fit(df_cat[['forehead_width']])
df_cat['forehead_width'] = encoder.transform(df_cat[['forehead_width']])
df_cat

['Маленький', 'Средний', 'Большой']
Categories (3, object): ['Маленький' < 'Средний' < 'Большой']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat['forehead_width'] = encoder.transform(df_cat[['forehead_width']])


Unnamed: 0,forehead_width
0,1.0
1,2.0
2,1.0
3,0.0
4,2.0
...,...
4996,2.0
4997,1.0
4998,2.0
4999,2.0


In [13]:
print(encoder.categories_)
print(len(encoder.categories_[0]))
print(df_cat['forehead_width'].unique())
df_cat['forehead_width'].unique().size #количество уникальных значений в столбце

[array(['Большой', 'Маленький', 'Средний'], dtype=object)]
3
[1. 2. 0.]


3

### Итоговая выборка:


In [14]:
df['forehead_width_num'] = df_cat['forehead_width']
df

Unnamed: 0,forehead_height_cm,forehead_width_cm,lips_thin,nose_wide,nose_long,distance_nose_to_lip_long,forehead_width,forehead_width_num
0,6.1,11.8,1,1,0,1,Маленький,1.0
1,5.4,14.0,1,0,0,0,Средний,2.0
2,6.3,11.8,1,1,1,1,Маленький,1.0
3,6.1,14.4,1,0,1,1,Большой,0.0
4,5.9,13.5,0,0,0,0,Средний,2.0
...,...,...,...,...,...,...,...,...
4996,5.1,13.6,0,0,0,0,Средний,2.0
4997,5.4,11.9,0,0,0,0,Маленький,1.0
4998,5.7,12.9,0,0,0,0,Средний,2.0
4999,6.2,13.2,0,0,0,0,Средний,2.0
