In [2]:
import pandas as pd
import numpy as np

# Датасеты для примера:

[Vehicle dataset - регрессия](https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho)

[Heart Failure Prediction Dataset - классификация, ансамбли решающих деревьев](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

# Датасеты для выполнения заданий:

[House Prices - Advanced Regression Techniques - регрессия](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques)

[Titanic - Machine Learning from Disaster - классификация](https://www.kaggle.com/competitions/titanic)

[Digit Recognizer - ансамбли решающих деревьев](https://www.kaggle.com/competitions/digit-recognizer/data?select=train.csv)

In [39]:
# Task-type flag; a later cell checks it for 'class' before printing
# per-split class counts.
dataset_type = 'class'

# NOTE(review): absolute '/content/...' path - presumably a Colab upload
# location; confirm it exists before running in another environment.
df = pd.read_csv(filepath_or_buffer='/content/heart.csv')
display(df)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [40]:
# Data check: number of missing values in each column of the raw frame.
missing_per_column = df.isna().sum()
print(missing_per_column)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [42]:
# Label-encode every non-numeric column of `df` in place.
# Each distinct value is replaced by its position among the column's sorted
# unique values, which is the same code the previous one-hot + argmax
# approach produced for columns without missing values.
for col in df.columns:
    is_plain_number = (
        pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    )
    if is_plain_number:
        # Already numeric (including nullable Int64 / Float64): leave as is.
        continue
    # Missing values get code -1 instead of silently colliding with the
    # first category (an all-zero dummy row made argmax return 0).
    df[col] = df[col].astype('category').cat.codes.astype('int64')
display(df)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


In [44]:
def normalization(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns in which every column
        is replaced by ``(column - mean) / std``; ``std`` is the pandas
        sample standard deviation. The frame passed in is not modified.

    Notes
    -----
    A column whose standard deviation is zero or undefined (for example a
    constant column) is centred only, so it becomes all zeros rather than
    NaN / inf.
    """
    # Work on a copy so the caller's frame keeps its original values.
    scaled = X.copy()
    for name in scaled.columns:
        column = X[name]
        spread = column.std()
        # `not spread > 0` is also true for NaN, e.g. a single-row column.
        if not spread > 0:
            spread = 1.0
        scaled[name] = (column - column.mean()) / spread
    return scaled

# Features: every column except the target, standardised column by column.
# NOTE(review): the scaling statistics are computed on the full dataset,
# before the train/val/test split in the next cell, so held-out rows
# influence the scaling - consider computing them on the training split only.
features = df.drop(columns='HeartDisease')
X_norm = normalization(features)

# Target: the HeartDisease column.
Y = df['HeartDisease']

display(X_norm, Y)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,-1.432359,0.515671,0.228907,0.410685,0.824621,-0.551041,0.017245,1.382175,-0.823108,-0.831979,1.051541
1,-0.478223,-1.937107,1.274364,1.490940,-0.171867,-0.551041,0.017245,0.753746,-0.823108,0.105606,-0.595753
2,-1.750404,0.515671,0.228907,-0.129442,0.769768,-0.551041,1.600347,-1.524307,-0.823108,-0.831979,1.051541
3,-0.584238,-1.937107,-0.816550,0.302660,0.138964,-0.551041,0.017245,-1.131539,1.213585,0.574398,-0.595753
4,0.051853,0.515671,1.274364,0.950812,-0.034736,-0.551041,0.017245,-0.581664,-0.823108,-0.831979,1.051541
...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902283,0.515671,2.319822,-1.209697,0.596068,-0.551041,0.017245,-0.188897,-0.823108,0.293123,-0.595753
914,1.536064,0.515671,-0.816550,0.626736,-0.053020,1.812770,0.017245,0.164595,-0.823108,2.355810,-0.595753
915,0.369898,0.515671,-0.816550,-0.129442,-0.619830,-0.551041,0.017245,-0.856602,1.213585,0.293123,-0.595753
916,0.369898,-1.937107,0.228907,-0.129442,0.340090,-0.551041,-1.565856,1.460728,-0.823108,-0.831979,-0.595753


0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64

In [45]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining
# 80% as validation. The same seed is used for both calls.
X_rest, X_test, Y_rest, Y_test = train_test_split(
    X_norm, Y, test_size=0.2, random_state=13
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_rest, Y_rest, test_size=0.25, random_state=13
)


# For classification tasks, print how many rows of each class ended up in
# the train, validation and test targets (one blank line after each split).
if dataset_type == 'class':
    for labels in (Y_train, Y_val, Y_test):
        for label, n_rows in labels.value_counts().items():
            print(f"Class {label}: {n_rows}")
        print()

Class 1: 294
Class 0: 256

Class 1: 113
Class 0: 71

Class 1: 101
Class 0: 83



In [49]:
# Data check: missing values per column in the train, validation and test
# feature frames, printed in that order.
for split_features in (X_train, X_val, X_test):
    print(split_features.isna().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64


In [52]:
from sklearn.linear_model import SGDRegressor # Регрессия
from sklearn.linear_model import LogisticRegression # Классификация
from sklearn.tree import DecisionTreeClassifier # Классификация при помощи решающего дерева
from sklearn.metrics import mean_absolute_error # Метрика для регрессии
from sklearn.metrics import accuracy_score #
from sklearn.metrics import f1_score

# Logistic-regression classifier created with no arguments (library defaults).
model = LogisticRegression()

# Fit on the training split; `fit` is kept as the last expression so the
# cell still shows the fitted estimator's repr.
model.fit(X=X_train, y=Y_train)

LogisticRegression()

In [53]:
# Accuracy of the fitted model on the train, validation and test splits,
# printed one value per line in that order.
evaluation_splits = (
    (X_train, Y_train),
    (X_val, Y_val),
    (X_test, Y_test),
)
for split_features, split_target in evaluation_splits:
    predicted = model.predict(split_features)
    print(accuracy_score(split_target, predicted))

0.8527272727272728
0.8695652173913043
0.8532608695652174
