# Кодирование качественных признаков

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
df = pd.read_csv("datasets/fifa_players.csv")
df.head(10)

Unnamed: 0,name,full_name,birth_date,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,...,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle
0,L. Messi,Lionel Andrés Messi Cuccittini,6/24/1987,31,170.18,72.1,"CF,RW,ST",Argentina,94,94,...,94,48,22,94,94,75,96,33,28,26
1,C. Eriksen,Christian Dannemann Eriksen,2/14/1992,27,154.94,76.2,"CAM,RM,CM",Denmark,88,89,...,89,46,56,84,91,67,88,59,57,22
2,P. Pogba,Paul Pogba,3/15/1993,25,190.5,83.9,"CM,CAM",France,88,91,...,82,78,64,82,88,82,87,63,67,67
3,L. Insigne,Lorenzo Insigne,6/4/1991,27,162.56,59.0,"LW,ST",Italy,88,88,...,84,34,26,83,87,61,83,51,24,22
4,K. Koulibaly,Kalidou Koulibaly,6/20/1991,27,187.96,88.9,CB,Senegal,88,91,...,15,87,88,24,49,33,80,91,88,87
5,V. van Dijk,Virgil van Dijk,7/8/1991,27,193.04,92.1,CB,Netherlands,88,90,...,64,82,88,41,60,62,87,90,89,84
6,K. Mbappé,Kylian Mbappé,12/20/1998,20,152.4,73.0,"RW,ST,RM",France,88,95,...,78,62,38,88,82,70,86,34,34,32
7,S. Agüero,Sergio Leonel Agüero del Castillo,6/2/1988,30,172.72,69.9,ST,Argentina,89,89,...,83,65,24,92,83,83,90,30,20,12
8,M. Neuer,Manuel Neuer,3/27/1986,32,193.04,92.1,GK,Germany,89,89,...,16,29,30,12,70,47,70,17,10,11
9,E. Cavani,Edinson Roberto Cavani Gómez,2/14/1987,32,185.42,77.1,ST,Uruguay,89,89,...,79,84,48,93,77,85,82,52,45,39


### Выбираем столбцы с индаксами с 8 до 12

In [27]:
X_train = df.iloc[:,30:35]
X_train

Unnamed: 0,long_passing,ball_control,acceleration,sprint_speed,agility
0,89,96,91,86,93
1,89,91,76,73,80
2,90,90,71,79,76
3,78,93,94,86,94
4,60,63,70,75,50
...,...,...,...,...,...
17949,61,66,83,86,93
17950,43,53,70,64,54
17951,24,18,27,22,29
17952,37,56,84,86,65


### StandardScaler используется для масштабирования данных. 

In [28]:
scaler = StandardScaler() #создает объект StandardScaler
scaler.fit(X_train) #подгоняет scaler к данным в X_train с помощью метода fit

### Выводим математическое ожидание и дисперсию для каждого признака и преобразовываем набор данных с использованием scaler. 

In [29]:
print('Мат ожидание', scaler.mean_)
print('Дисперсия', scaler.var_)
print('Преобразованный набор')
arr = scaler.transform(X_train)
DfTr = pd.DataFrame(arr, columns=['long_passing', 'ball_control',	'acceleration',	'sprint_speed',	'agility'])
DfTr

Мат ожидание [52.66742787 58.22390554 64.69622368 64.80349783 63.37746463]
Дисперсия [237.81080605 284.77397236 225.02780461 216.74728419 219.23342554]
Преобразованный набор


Unnamed: 0,long_passing,ball_control,acceleration,sprint_speed,agility
0,2.356027,2.238551,1.753477,1.439751,2.000640
1,2.356027,1.942259,0.753539,0.556739,1.122649
2,2.420874,1.883000,0.420226,0.964283,0.852498
3,1.642720,2.060775,1.953464,1.439751,2.068177
4,0.475489,0.283024,0.353563,0.692587,-0.903484
...,...,...,...,...,...
17949,0.540335,0.460799,1.220176,1.439751,2.000640
17950,-0.626895,-0.309560,0.353563,-0.054577,-0.633333
17951,-1.858972,-2.383604,-2.512926,-2.907384,-2.321777
17952,-1.015972,-0.131785,1.286839,1.439751,0.109582


### Создаем новый столбец cost_estimate и разбиваем значение столбца price_range на три категории.

In [34]:
df['sliding_quality'] = pd.cut(df['sliding_tackle'], bins=[0, 40, 60, 100], labels=["Плохой", "Средний", "Хороший"])
df

Unnamed: 0,name,full_name,birth_date,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,...,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,sliding_quality
0,L. Messi,Lionel Andrés Messi Cuccittini,6/24/1987,31,170.18,72.1,"CF,RW,ST",Argentina,94,94,...,48,22,94,94,75,96,33,28,26,Плохой
1,C. Eriksen,Christian Dannemann Eriksen,2/14/1992,27,154.94,76.2,"CAM,RM,CM",Denmark,88,89,...,46,56,84,91,67,88,59,57,22,Плохой
2,P. Pogba,Paul Pogba,3/15/1993,25,190.50,83.9,"CM,CAM",France,88,91,...,78,64,82,88,82,87,63,67,67,Хороший
3,L. Insigne,Lorenzo Insigne,6/4/1991,27,162.56,59.0,"LW,ST",Italy,88,88,...,34,26,83,87,61,83,51,24,22,Плохой
4,K. Koulibaly,Kalidou Koulibaly,6/20/1991,27,187.96,88.9,CB,Senegal,88,91,...,87,88,24,49,33,80,91,88,87,Хороший
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17949,R. McKenzie,Rory McKenzie,10/7/1993,25,175.26,74.8,"RM,CAM,CM",Scotland,67,70,...,69,41,60,64,63,56,40,20,18,Плохой
17950,M. Sipľak,Michal Sipľak,2/2/1996,23,182.88,79.8,LB,Slovakia,59,67,...,62,55,42,39,32,52,53,64,60,Средний
17951,J. Bekkema,Jan Bekkema,4/9/1996,22,185.42,89.8,GK,Netherlands,59,67,...,27,10,5,25,16,47,9,12,13,Плохой
17952,A. Al Yami,Abdulrahman Al Yami,6/19/1997,21,175.26,64.9,"ST,LM",Saudi Arabia,59,71,...,38,15,54,52,50,53,16,18,17,Плохой


In [35]:
df.dtypes

name                               object
full_name                          object
birth_date                         object
age                                 int64
height_cm                         float64
weight_kgs                        float64
positions                          object
nationality                        object
overall_rating                      int64
potential                           int64
value_euro                        float64
wage_euro                         float64
preferred_foot                     object
international_reputation(1-5)       int64
weak_foot(1-5)                      int64
skill_moves(1-5)                    int64
body_type                          object
release_clause_euro               float64
national_team                      object
national_rating                   float64
national_team_position             object
national_jersey_number            float64
crossing                            int64
finishing                         

In [36]:
df["sliding_quality"].value_counts().sort_index()

Плохой     7156
Средний    4553
Хороший    6245
Name: sliding_quality, dtype: int64

### Выбирает столбец с типом данных 'category':

In [37]:
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include='category')
categorical_columns = categorical_columns_selector(df)
categorical_columns

['sliding_quality']

In [38]:
df_cat = df[categorical_columns] #отберем только те столбцы в которых присутствует категориальная перемнная
df_cat.head()

Unnamed: 0,sliding_quality
0,Плохой
1,Плохой
2,Хороший
3,Плохой
4,Хороший


### Преобразовываем категориальные значения в числовые:

In [42]:
from sklearn.preprocessing import OrdinalEncoder

cat_column = df_cat[["sliding_quality"]]

print(df_cat['sliding_quality'].unique()) #посмотрим какие виды образования присутствуют в выборке

encoder = OrdinalEncoder()   # вызываем кодировщик
encoder  = encoder.fit(df_cat[['sliding_quality']])
df_cat['sliding_quality'] = encoder.transform(df_cat[['sliding_quality']])
df_cat

[0. 2. 1.]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat['sliding_quality'] = encoder.transform(df_cat[['sliding_quality']])


Unnamed: 0,sliding_quality
0,0.0
1,0.0
2,2.0
3,0.0
4,2.0
...,...
17949,0.0
17950,1.0
17951,0.0
17952,0.0


In [43]:
print(encoder.categories_)
print(len(encoder.categories_[0]))
print(df_cat['sliding_quality'].unique())
df_cat['sliding_quality'].unique().size #количество уникальных значений в столбце

[array([0., 1., 2.])]
3
[0. 2. 1.]


3