In [39]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from pycaret.classification import *

# Загрузка датасета Iris
iris = load_iris()
# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.75, test_size=0.25)

print(X_train[:10])

[[6.1 2.9 4.7 1.4]
 [6.3 3.3 4.7 1.6]
 [6.4 3.1 5.5 1.8]
 [6.2 2.8 4.8 1.8]
 [5.4 3.4 1.7 0.2]
 [6.4 2.9 4.3 1.3]
 [6.5 3.  5.8 2.2]
 [6.4 3.2 5.3 2.3]
 [5.6 2.8 4.9 2. ]
 [6.8 3.2 5.9 2.3]]


In [40]:
# Применение PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
print(X_pca[:10])

[[ 0.7345861  -0.11991047]
 [ 0.84664868  0.28430494]
 [ 1.65515425  0.0464808 ]
 [ 1.00725954 -0.18178441]
 [-2.55644205  0.41838941]
 [ 0.46718697  0.15541629]
 [ 2.09891067 -0.05012851]
 [ 1.65368807  0.10776434]
 [ 0.94326739 -0.61086413]
 [ 2.31402988  0.26566481]]


In [41]:
# Создание нового датасета с использованием новых признаков
new_data = np.concatenate((X_pca, y_train.reshape(-1, 1)), axis=1)
new_data = pd.DataFrame(new_data, columns=['feature1', 'feature2', 'target'])

In [42]:
# Создание и обучение модели с помощью PyCaret
clf = setup(data=new_data, target='target')

Unnamed: 0,Description,Value
0,Session id,2901
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(112, 3)"
4,Transformed data shape,"(112, 3)"
5,Transformed train set shape,"(78, 3)"
6,Transformed test set shape,"(34, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


In [43]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.95,0.9858,0.95,0.95,0.95,0.9238,0.9238,0.082
lr,Logistic Regression,0.9482,0.9904,0.9482,0.9549,0.9465,0.9203,0.925,0.173
et,Extra Trees Classifier,0.9375,0.9904,0.9375,0.9542,0.9361,0.9052,0.9151,0.028
knn,K Neighbors Classifier,0.9357,0.9789,0.9357,0.9455,0.9336,0.9012,0.908,0.01
qda,Quadratic Discriminant Analysis,0.9357,0.9954,0.9357,0.9518,0.9321,0.9008,0.9116,0.004
lda,Linear Discriminant Analysis,0.9357,0.9954,0.9357,0.9455,0.9336,0.9012,0.908,0.005
rf,Random Forest Classifier,0.925,0.9846,0.925,0.9312,0.9243,0.8857,0.8897,0.029
nb,Naive Bayes,0.8839,0.979,0.8839,0.9101,0.8817,0.8242,0.8392,0.005
dt,Decision Tree Classifier,0.8839,0.9117,0.8839,0.9086,0.8808,0.8259,0.8399,0.004
svm,SVM - Linear Kernel,0.8732,0.0,0.8732,0.8726,0.8541,0.8069,0.8323,0.009


In [44]:
# Прогнозирование на новых данных
new_X_pca = pca.transform(X_test)
new_data = pd.DataFrame(np.concatenate((new_X_pca, y_test.reshape(-1, 1)), axis=1),
                        columns=['feature1', 'feature2', 'target'])
predictions = predict_model(best_model, data=new_data)

print(predictions)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9737,0.9962,0.9737,0.9756,0.9732,0.9576,0.9587


    feature1  feature2  target  prediction_label  prediction_score
0  -2.834881  0.538983     0.0                 0            0.9971
1   0.103754 -0.497286     1.0                 1            0.9921
2   0.562283 -0.159978     1.0                 1            0.9493
3   1.160491 -0.578199     2.0                 2            0.9408
4   0.974483  0.409830     1.0                 1            0.6125
5   0.389005 -0.410934     1.0                 1            0.9952
6   0.557099  0.195389     1.0                 1            0.9753
7   1.700075  0.039079     2.0                 2            0.9905
8  -2.841472  1.119448     0.0                 0            0.9971
9   0.108390 -0.061989     1.0                 1            0.9943
10 -0.556774 -0.355286     1.0                 1            0.9991
11  0.682306  0.471210     1.0                 1            0.9678
12 -2.964343 -0.148855     0.0                 0            0.9994
13 -0.960827 -0.998023     1.0                 1            0.