In [3]:
import pandas as pd

horse = pd.read_csv('horse_data.csv', na_values = '?')
new_horse = horse.iloc[:, [0, 1, 3, 4, 5, 6, 10, 22]]
new_horse.columns = ['surgery', 'Age', 'rectal_temperature', 'pulse', 'respiratory_rate', 'temperature_of_extremities', 'pain', 'outcome']
new_horse.head()

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
0,1.0,1,39.2,88.0,20.0,,3.0,3.0
1,2.0,1,38.3,40.0,24.0,1.0,3.0,1.0
2,1.0,9,39.1,164.0,84.0,4.0,2.0,2.0
3,2.0,1,37.3,104.0,35.0,,,2.0
4,2.0,1,,,,2.0,2.0,1.0


#### Общая статистика датафрейма

In [4]:
new_horse.describe()

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
count,298.0,299.0,239.0,275.0,241.0,243.0,244.0,298.0
mean,1.395973,1.64214,38.166527,71.934545,30.427386,2.345679,2.942623,1.550336
std,0.489881,2.1773,0.733508,28.680522,17.678256,1.046369,1.303993,0.737967
min,1.0,1.0,35.4,30.0,8.0,1.0,1.0,1.0
25%,1.0,1.0,37.8,48.0,18.0,1.0,2.0,1.0
50%,1.0,1.0,38.2,64.0,24.0,3.0,3.0,1.0
75%,2.0,1.0,38.5,88.0,36.0,3.0,4.0,2.0
max,2.0,9.0,40.8,184.0,96.0,4.0,5.0,3.0


#### Доля пропусков в столбцах

In [5]:
(new_horse.isna().mean() * 100).round(2)

surgery                        0.33
Age                            0.00
rectal_temperature            20.07
pulse                          8.03
respiratory_rate              19.40
temperature_of_extremities    18.73
pain                          18.39
outcome                        0.33
dtype: float64

#### Фильтр на пустое значение в столбце Операция

In [6]:
new_horse.loc[pd.isnull(new_horse.surgery), :]

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
131,,1,38.0,48.0,20.0,3.0,4.0,


#### Замена пустого значения в операции на "Нет инфо"

In [7]:
new_horse['surgery'].fillna('no info', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


#### Ректальная температура

In [8]:
# найдем выбросы по графе ректальная температура
q1 = new_horse['rectal_temperature'].quantile(0.25)
q3 = new_horse['rectal_temperature'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr) 
upper_bound = q3 + (1.5 * iqr)
remove_outliers = new_horse[new_horse['rectal_temperature'].between(lower_bound, upper_bound, inclusive=True)].sort_values('rectal_temperature')
remove_outliers

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
112,1,1,36.8,60.0,28.0,,,2.0
276,2,1,36.9,50.0,40.0,2.0,1.0,1.0
291,1,1,37.0,66.0,20.0,1.0,4.0,2.0
69,2,1,37.0,60.0,20.0,3.0,3.0,3.0
262,1,1,37.1,75.0,36.0,,4.0,2.0
...,...,...,...,...,...,...,...,...
119,1,1,39.4,54.0,66.0,1.0,2.0,1.0
286,2,9,39.5,84.0,30.0,,,1.0
161,2,1,39.5,60.0,10.0,3.0,3.0,1.0
230,2,1,39.5,92.0,28.0,3.0,5.0,2.0


In [9]:
# информация о выбросах по графе ректальная температура
pd.concat([new_horse, remove_outliers]).drop_duplicates(keep=False).describe()

Unnamed: 0,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
count,61.0,14.0,50.0,39.0,51.0,50.0,61.0
mean,1.655738,38.15,82.54,32.435897,2.745098,3.46,1.819672
std,2.21273,2.048545,26.785017,13.764552,0.890913,1.328034,0.785455
min,1.0,35.4,36.0,12.0,1.0,1.0,1.0
25%,1.0,36.425,60.0,24.0,3.0,2.0,1.0
50%,1.0,38.1,80.0,32.0,3.0,4.0,2.0
75%,1.0,39.975,103.0,36.0,3.0,5.0,2.0
max,9.0,40.8,140.0,70.0,4.0,5.0,3.0


In [10]:
# заполняем пустые значение в столбце ректальная температура медианой в зависимости от наличия оперативного вмешательства, возраста лошади и исхода
print(new_horse.groupby(['surgery', 'Age', 'outcome'])['rectal_temperature'].median())
new_horse['rectal_temperature'].fillna(new_horse.groupby(['surgery', 'Age', 'outcome'])['rectal_temperature'].transform('median'), inplace = True)

surgery  Age  outcome
1.0      1    1.0        38.20
              2.0        38.00
              3.0        38.00
         9    1.0        38.25
              2.0        38.80
              3.0        39.70
2.0      1    1.0        38.20
              2.0        37.85
              3.0        38.20
         9    1.0        38.80
              2.0        38.00
Name: rectal_temperature, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


#### Пульс

In [11]:
# фильтр на пустое значение в столбце Пульс
new_horse.loc[pd.isnull(new_horse.pulse), :].head()

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
4,2,1,38.2,,,2.0,2.0,1.0
27,1,1,38.0,,,,,2.0
51,2,1,38.2,,,1.0,1.0,1.0
55,1,1,38.2,,,,,1.0
57,1,1,38.0,,20.0,4.0,5.0,2.0


In [12]:
# заполним пустые значение в графе пульс медианой в зависимости от наличия оперативного вмешательства, возраста лошади и исхода
print(new_horse.groupby(['surgery', 'Age', 'outcome'])['pulse'].median())
new_horse['pulse'].fillna(new_horse.groupby(['surgery', 'Age', 'outcome'])['pulse'].transform('median'), inplace = True)

surgery  Age  outcome
1.0      1    1.0         60.0
              2.0         73.5
              3.0         79.0
         9    1.0        122.0
              2.0        146.0
              3.0        100.0
2.0      1    1.0         50.0
              2.0         92.0
              3.0         85.0
         9    1.0         90.0
              2.0         92.0
Name: pulse, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [13]:
new_horse.describe()

Unnamed: 0,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
count,299.0,299.0,299.0,241.0,243.0,244.0,298.0
mean,1.64214,38.158194,71.963211,30.427386,2.345679,2.942623,1.550336
std,2.1773,0.661533,28.238711,17.678256,1.046369,1.303993,0.737967
min,1.0,35.4,30.0,8.0,1.0,1.0,1.0
25%,1.0,37.85,48.0,18.0,1.0,2.0,1.0
50%,1.0,38.2,64.0,24.0,3.0,3.0,1.0
75%,1.0,38.5,88.0,36.0,3.0,4.0,2.0
max,9.0,40.8,184.0,96.0,4.0,5.0,3.0


#### Частота дыхания

In [14]:
# блоки датафрейма, в котором отсутствуют данные по частоте дыхания
new_horse.loc[pd.isnull(new_horse.respiratory_rate), :].describe()

Unnamed: 0,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
count,58.0,58.0,58.0,0.0,43.0,44.0,58.0
mean,1.551724,38.184483,76.301724,,2.348837,3.295455,1.603448
std,2.044869,0.577366,24.871589,,0.973059,1.456015,0.724015
min,1.0,36.1,40.0,,1.0,1.0,1.0
25%,1.0,38.0,57.0,,1.0,2.0,1.0
50%,1.0,38.2,73.5,,3.0,3.0,1.0
75%,1.0,38.475,92.0,,3.0,5.0,2.0
max,9.0,40.0,146.0,,4.0,5.0,3.0


In [15]:
# заполняем пустые значение в графе частота дыхания медианой в зависимости от наличия оперативного вмешательства, возраста лошади
print(new_horse.groupby(['surgery', 'Age', 'outcome'])['respiratory_rate'].median())
new_horse['respiratory_rate'].fillna(new_horse.groupby(['surgery', 'Age', 'outcome'])['respiratory_rate'].transform('median'), inplace = True)

surgery  Age  outcome
1.0      1    1.0        24.0
              2.0        26.0
              3.0        32.0
         9    1.0        69.0
              2.0        48.0
              3.0         NaN
2.0      1    1.0        24.0
              2.0        30.0
              3.0        20.0
         9    1.0        37.0
              2.0        28.0
Name: respiratory_rate, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [16]:
new_horse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   surgery                     299 non-null    object 
 1   Age                         299 non-null    int64  
 2   rectal_temperature          299 non-null    float64
 3   pulse                       299 non-null    float64
 4   respiratory_rate            298 non-null    float64
 5   temperature_of_extremities  243 non-null    float64
 6   pain                        244 non-null    float64
 7   outcome                     298 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 18.8+ KB


In [17]:
# замена пустого значения в частоте дыхания на "Нет инфо"
# в дальнейшем эту графу можно вовсе игнорировать, т.к. ее данные не информативны в силу широкого разброса
new_horse['respiratory_rate'].fillna('no info', inplace = True)

#### Температура конечностей, сила болевых ощущений и исход
по оставшимся 3-ем параметрам заполняем пропуски "Нет инфы", т.к. это категориальыне величины, 
и присвоение медианных значений нулевым показателям может существенно исказить результаты

In [18]:
new_horse['temperature_of_extremities'].fillna('no info', inplace = True)

In [19]:
new_horse['pain'].fillna('no info', inplace = True)

In [20]:
new_horse['outcome'].fillna('no info', inplace = True)

In [21]:
new_horse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   surgery                     299 non-null    object 
 1   Age                         299 non-null    int64  
 2   rectal_temperature          299 non-null    float64
 3   pulse                       299 non-null    float64
 4   respiratory_rate            299 non-null    object 
 5   temperature_of_extremities  299 non-null    object 
 6   pain                        299 non-null    object 
 7   outcome                     299 non-null    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 18.8+ KB


In [22]:
new_horse.head(20)

Unnamed: 0,surgery,Age,rectal_temperature,pulse,respiratory_rate,temperature_of_extremities,pain,outcome
0,1,1,39.2,88.0,20,no info,3,3
1,2,1,38.3,40.0,24,1,3,1
2,1,9,39.1,164.0,84,4,2,2
3,2,1,37.3,104.0,35,no info,no info,2
4,2,1,38.2,50.0,24,2,2,1
5,1,1,37.9,48.0,16,1,3,1
6,1,1,38.0,60.0,26,3,no info,2
7,2,1,38.2,80.0,36,3,4,3
8,2,9,38.3,90.0,37,1,5,1
9,1,1,38.1,66.0,12,3,3,1
