In [81]:
import numpy as np
import pandas as pd
from copy import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

import os
for dirname, _, filenames in os.walk('kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore')

In [82]:
data = pd.read_csv('DATA_STUDENTS.csv', header=None)

new_header = data.iloc[458]

data = data.rename(columns=new_header)

data = data.drop(459).reset_index(drop=True)

print(f"\nТипы данных в столбцах: \n{data.dtypes}")

#Данные, исходя из вывода имеют тип object. Меняем его на нормальный.
numeric_cols = ['Age_Y', 'Weight_Kg', 'Height_cm', 'BMI', 'Speed_m/s', 'Jump _distance_cm']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

print(f"\nТипы данных в столбцах: \n{data.dtypes}")



Типы данных в столбцах: 
ID                   object
Age_Y                object
Gender               object
Weight_Kg            object
Height_cm            object
BMI                  object
Speed_m/s            object
Jump _distance_cm    object
Region               object
dtype: object

Типы данных в столбцах: 
ID                    object
Age_Y                float64
Gender                object
Weight_Kg            float64
Height_cm            float64
BMI                  float64
Speed_m/s            float64
Jump _distance_cm    float64
Region                object
dtype: object


In [83]:
data


Unnamed: 0,ID,Age_Y,Gender,Weight_Kg,Height_cm,BMI,Speed_m/s,Jump _distance_cm,Region
0,371,12.0,male,30.0,138.0,16.0,6.17,321.0,Bogota
1,636,13.0,male,31.0,141.0,16.0,4.81,280.0,Bogota
2,666,13.0,male,31.0,142.0,15.0,5.43,310.0,Bogota
3,1548,13.0,male,31.0,142.0,15.0,4.70,320.0,Bogota
4,358,13.0,male,31.0,145.0,15.0,5.62,245.0,Bogota
...,...,...,...,...,...,...,...,...,...
1852,106,12.0,male,42.5,160.0,17.0,5.41,360.0,Bogota
1853,1693,11.0,male,42.5,160.0,17.0,5.40,360.0,Cairo
1854,1611,15.0,female,50.3,162.0,19.0,5.40,243.0,Riga
1855,310,14.0,female,58.5,155.0,24.0,5.33,270.0,Riga


In [84]:
# Размерность данных
print(f"Размер данных: {data.shape}")

# Пропуски
print("\nКоличество пропущенных значений:")
print(data.isnull().sum())

Размер данных: (1857, 9)

Количество пропущенных значений:
ID                   0
Age_Y                1
Gender               0
Weight_Kg            1
Height_cm            1
BMI                  1
Speed_m/s            8
Jump _distance_cm    1
Region               0
dtype: int64


Удаляем пропуски


In [85]:
# Удаляем строки с пропущенными значениями
data = data.dropna()

# Сбрасываем индексы после удаления строк
data = data.reset_index(drop=True)

# Проверяем количество пропусков после очистки
print("\nКоличество пропущенных значений после очистки:")
print(data.isnull().sum())


Количество пропущенных значений после очистки:
ID                   0
Age_Y                0
Gender               0
Weight_Kg            0
Height_cm            0
BMI                  0
Speed_m/s            0
Jump _distance_cm    0
Region               0
dtype: int64


Межквартильный размах определяется через квантили - разница между 75-м и 25-м процентилям.

In [86]:
#Да, функция для одного применения, но я предпочту сделать так, потому что можно будет сунуть в библиотеку.
def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df.reset_index(drop=True)

# фильтрация
data_second_press = remove_outliers_iqr(data, numeric_cols)

# Проверка размерности
print("До удаления выбросов:", data.shape)
print("После удаления выбросов:", data_second_press.shape)

До удаления выбросов: (1849, 9)
После удаления выбросов: (1689, 9)


Борьба с шумами

In [87]:
def smooth_columns(df, columns, window=3):
    df_copy = df.copy()
    for col in columns:
        df_copy[col] = df_copy[col].rolling(window=window, center=True, min_periods=1).mean()
    return df_copy

data_smoothed = smooth_columns(data_second_press, numeric_cols)

Удаляем дубликаты (их не оказалось)

In [88]:
data_clean = data_second_press.drop_duplicates().reset_index(drop=True)

print(data_smoothed.shape)

(1689, 9)


Этот код, чтобы отмести котегориальные поля я просто взял со своего старого проекта:

https://github.com/delilit/Spotify

In [89]:
object_columns = data_clean.select_dtypes(include=['object']).columns
labels = {}

for col in object_columns:
    le = LabelEncoder()
    data_clean[col] = le.fit_transform(data_clean[col])
    labels[col] = le

data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1688 entries, 0 to 1687
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 1688 non-null   int64  
 1   Age_Y              1688 non-null   float64
 2   Gender             1688 non-null   int64  
 3   Weight_Kg          1688 non-null   float64
 4   Height_cm          1688 non-null   float64
 5   BMI                1688 non-null   float64
 6   Speed_m/s          1688 non-null   float64
 7   Jump _distance_cm  1688 non-null   float64
 8   Region             1688 non-null   int64  
dtypes: float64(6), int64(3)
memory usage: 118.8 KB


Создание признаков.
Зависимость скорость от всех трёх полей - Вес, рост и возраст является прекрасным призанаком для анализа. Можно поговорить о здоровье или статистических способностях к бегу.

In [90]:
data_clean['weight/speed'] = data_clean['Weight_Kg'] * data_clean['Speed_m/s']
data_clean['height/speed'] = data_clean['Height_cm'] * data_clean['Speed_m/s']
data_clean['age/speed'] = data_clean['Age_Y'] * data_clean['Speed_m/s']

data_clean

Unnamed: 0,ID,Age_Y,Gender,Weight_Kg,Height_cm,BMI,Speed_m/s,Jump _distance_cm,Region,weight/speed,height/speed,age/speed
0,1323,13.0,1,31.0,141.0,16.0,4.81,280.0,0,149.110,678.21,62.53
1,1353,13.0,1,31.0,142.0,15.0,5.43,310.0,0,168.330,771.06,70.59
2,562,13.0,1,31.0,142.0,15.0,4.70,320.0,0,145.700,667.40,61.10
3,1042,13.0,1,31.0,145.0,15.0,5.62,245.0,0,174.220,814.90,73.06
4,366,13.0,1,31.0,145.0,15.0,5.70,351.0,0,176.700,826.50,74.10
...,...,...,...,...,...,...,...,...,...,...,...,...
1683,63,12.0,1,42.5,160.0,17.0,5.41,360.0,0,229.925,865.60,64.92
1684,711,11.0,1,42.5,160.0,17.0,5.40,360.0,1,229.500,864.00,59.40
1685,630,15.0,0,50.3,162.0,19.0,5.40,243.0,3,271.620,874.80,81.00
1686,995,14.0,0,58.5,155.0,24.0,5.33,270.0,3,311.805,826.15,74.62


In [91]:
# Выбираем все числовые столбцы
numeric_cols = data_clean.select_dtypes(include=['float64', 'int64']).columns

# Создаем scaler
scaler = MinMaxScaler()


data_normalized = data_clean.copy()
data_normalized[numeric_cols] = scaler.fit_transform(data_clean[numeric_cols])

data_normalized

Unnamed: 0,ID,Age_Y,Gender,Weight_Kg,Height_cm,BMI,Speed_m/s,Jump _distance_cm,Region,weight/speed,height/speed,age/speed
0,0.786564,0.285714,1.0,0.083333,0.046512,0.272727,0.341808,0.342857,0.000000,0.048558,0.162030,0.213551
1,0.804400,0.285714,1.0,0.083333,0.069767,0.181818,0.516949,0.428571,0.000000,0.098708,0.291046,0.312301
2,0.334126,0.285714,1.0,0.083333,0.069767,0.181818,0.310734,0.457143,0.000000,0.039661,0.147010,0.196030
3,0.619501,0.285714,1.0,0.083333,0.139535,0.181818,0.570621,0.242857,0.000000,0.114077,0.351962,0.342563
4,0.217598,0.285714,1.0,0.083333,0.139535,0.181818,0.593220,0.545714,0.000000,0.120548,0.368080,0.355305
...,...,...,...,...,...,...,...,...,...,...,...,...
1683,0.037455,0.142857,1.0,0.322917,0.488372,0.363636,0.511299,0.571429,0.000000,0.259426,0.422410,0.242833
1684,0.422711,0.000000,1.0,0.322917,0.488372,0.363636,0.508475,0.571429,0.333333,0.258317,0.420187,0.175202
1685,0.374554,0.571429,0.0,0.485417,0.534884,0.545455,0.508475,0.237143,1.000000,0.368219,0.435193,0.439843
1686,0.591558,0.428571,0.0,0.656250,0.372093,1.000000,0.488701,0.314286,1.000000,0.473072,0.367594,0.361676
