# Код для реализации основной части

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
import time
import shap

### Подготовка данных

In [None]:
# Read the pixel table from disk; later cells use its x, y, R, G, B,
# brightness and class columns.
df = pd.read_csv(filepath_or_buffer='road_pixels_with_brightness.csv')

In [None]:
# Cut away parts of the image: each entry below describes one axis-aligned
# (x, y) region, and only the rows that fall outside ALL of them are kept.
col_x, col_y = df['x'], df['y']
excluded_regions = [
    (col_y >= 800) & (col_x >= 0) & (col_x <= 500),
    (col_y >= 800) & (col_x >= 800) & (col_x <= 1500),
    (col_y >= 1000) & (col_x >= 1500) & (col_x <= 1700),
    (col_y >= 600) & (col_x >= 1000) & (col_x <= 1300),
    (col_y >= 0) & (col_y <= 400) & (col_x >= 0) & (col_x <= 600),
    (col_y >= 0) & (col_y <= 600) & (col_x >= 0) & (col_x <= 200),
    (col_y >= 0) & (col_y <= 400) & (col_x >= 1500),
    (col_y >= 0) & (col_y <= 600) & (col_x >= 1500),
    (col_y >= 0) & (col_y <= 800) & (col_x >= 1800),
    (col_y >= 0) & (col_y <= 200) & (col_x >= 0),
]

# OR the region masks together, then keep the complement.
drop_mask = excluded_regions[0]
for region in excluded_regions[1:]:
    drop_mask = drop_mask | region
df = df[~drop_mask]

In [None]:
# Check the class distribution: absolute counts first, then each count
# expressed as a fraction of the total number of rows.
class_counts = df['class'].value_counts()
print(class_counts)

class_ratios = class_counts.div(len(df))
print(class_ratios)

In [None]:
# Visualise the classes: pixel positions, class 0 in gray and class 1 in red.
# Each class subset is selected once instead of re-filtering per coordinate.
class0_pixels = df[df['class'] == 0]
class1_pixels = df[df['class'] == 1]

plt.figure(figsize=(10, 6))
plt.scatter(class0_pixels['x'], class0_pixels['y'],
            c='gray', s=1, label='Класс 0')
plt.scatter(class1_pixels['x'], class1_pixels['y'],
            c='red', s=10, label='Класс 1')
plt.xlabel('X координата')
plt.ylabel('Y координата')
plt.title('Визуализация пикселей по классам')
# Flip the y axis so that y grows downwards (image-style orientation).
plt.gca().invert_yaxis()
# The scatter labels are only shown once a legend is drawn; markerscale
# enlarges the tiny scatter markers inside the legend box.
plt.legend(loc='upper right', markerscale=4)
plt.show()

In [None]:
# Feature matrix / target vector, followed by a stratified 80/20 split
# with a fixed seed.
feature_columns = ['x', 'y', 'R', 'G', 'B', 'brightness']
X = df[feature_columns]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
    stratify=y,
)

In [None]:
# Print the relative class frequencies of both splits (train first, then test).
for caption, labels in (
    ("Распределение классов в тренировочной выборке:", y_train),
    ("Распределение классов в тестовой выборке:", y_test),
):
    print(caption)
    print(labels.value_counts(normalize=True))

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then re-used unchanged for the test split, so both splits are scaled
# with the same statistics and no test information leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Построение моделей и оценка качества

In [None]:
# Logistic regression: grid search over C / solver / max_iter, then a timed
# fit of a fresh model with the selected hyper-parameters and its evaluation
# on the test split.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300],
}
log_reg = LogisticRegression(class_weight='balanced', random_state=88)
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=3)
grid_search.fit(X_train, y_train)
print("Лучшие гиперпараметры: ", grid_search.best_params_)

# Build the final model from the search result rather than from hand-copied
# values, so it cannot drift from what the search above actually selected.
log_reg = LogisticRegression(**grid_search.best_params_,
                             class_weight='balanced', random_state=88)

# Time the fit and the label prediction separately.
start_time = time.time()
log_reg.fit(X_train, y_train)
training_time = time.time() - start_time

start_time = time.time()
y_pred_log_reg = log_reg.predict(X_test)
prediction_time = time.time() - start_time

accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
# ROC AUC is computed from the predicted probability of class 1.
roc_auc_log_reg = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])
print(f"Accuracy (Logistic Regression): {accuracy_log_reg}")
print(f"ROC AUC (Logistic Regression): {roc_auc_log_reg}")
print(f"Training Time: {training_time:.4f}")
print(f"Prediction Time: {prediction_time:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))

# Confusion matrix rendered as an annotated heat map.
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_log_reg,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Class 0', 'Class 1'],
    yticklabels=['Class 0', 'Class 1']
)
plt.title('Confusion Matrix Logistic Regression')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Wrap the train/test features back into labelled DataFrames and attach a
# 'class' column to each: the true labels for the training frame, and the
# logistic-regression predictions (not the true labels) for the test frame.
feature_names = ['x', 'y', 'R', 'G', 'B', 'brightness']

X_train_df = pd.DataFrame(X_train, columns=feature_names).reset_index(drop=True)
y_train_df = pd.DataFrame(y_train, columns=['class']).reset_index(drop=True)
train_df = pd.concat([X_train_df, y_train_df], axis=1)

X_test_df = pd.DataFrame(X_test, columns=feature_names).reset_index(drop=True)
y_test_df = pd.DataFrame(y_pred_log_reg, columns=['class']).reset_index(drop=True)
test_df = pd.concat([X_test_df, y_test_df], axis=1)

In [None]:
# Overlay of training pixels and test-set predictions.
# The two "class 0" layers draw every point of their frame as a background;
# the class-1 subsets are then drawn on top of them in a different colour.
# Each legend text names the colour that is actually used for its layer.
train_class1 = train_df[train_df['class'] == 1]
test_class1 = test_df[test_df['class'] == 1]

plt.figure(figsize=(18, 12))
plt.scatter(train_df['x'], train_df['y'], c='gray',
            s=0.01, label='Класс 0 (тренировочные данные) СЕРЫЙ')
plt.scatter(train_class1['x'], train_class1['y'],
            c='black', s=0.01, label='Класс 1 (тренировочные данные) ЧЕРНЫЙ')
plt.scatter(test_df['x'], test_df['y'], c='blue',
            s=0.01, label='Класс 0 (предсказание на тест) СИНИЙ')
plt.scatter(test_class1['x'], test_class1['y'],
            c='red', s=0.05, label='Класс 1 (предсказание на тест) КРАСНЫЙ')
plt.title('Визуализация пикселей с выделением классов')
plt.xlabel('x (стандартизованные)')
plt.ylabel('y (стандартизованные)')
# Flip the y axis so that y grows downwards (image-style orientation).
plt.gca().invert_yaxis()
plt.legend(loc='upper right')
plt.show()

In [None]:
# Support vector machine: randomised hyper-parameter search
# (50 sampled configurations, 3-fold cross-validation, accuracy scoring).
param_dist = {
    # C is drawn from a continuous uniform distribution on [0.1, 100.1].
    'C': uniform(loc=0.1, scale=100),
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.5, 1.0],
    'max_iter': [100, 1000, 5000],
}
# The final SVC in this file is trained with class_weight='balanced', so the
# search uses the same setting; otherwise the hyper-parameters would be tuned
# for a differently weighted model than the one that is evaluated.
# NOTE(review): any hard-coded "best" values copied from an earlier run of
# this search should be re-checked against the new output.
svm = SVC(random_state=88, class_weight='balanced')
random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    verbose=3,
    random_state=88
)
random_search.fit(X_train, y_train)
print("Лучшие гиперпараметры: ", random_search.best_params_)

In [None]:
# Final SVM: fixed hyper-parameters, timed fit / predict, then evaluation.
best_params = {
    'C': 85.09869730830853,
    'coef0': 0.0,
    'degree': 2,
    'gamma': 10,
    'kernel': 'rbf',
    'max_iter': 5000
}
best_svm = SVC(
    C=best_params['C'],
    coef0=best_params['coef0'],
    degree=best_params['degree'],
    gamma=best_params['gamma'],
    kernel=best_params['kernel'],
    max_iter=best_params['max_iter'],
    random_state=88,
    class_weight='balanced'
)

start_train = time.time()
best_svm.fit(X_train, y_train)
end_train = time.time()

# Only predict() is timed, like for the other models in this file; the
# decision_function() pass is needed just for ROC AUC and would otherwise
# inflate the reported prediction time with a second inference pass.
start_pred = time.time()
y_pred_svm = best_svm.predict(X_test)
end_pred = time.time()
decision_scores = best_svm.decision_function(X_test)

train_time = end_train - start_train
pred_time = end_pred - start_pred
print(f"Training Time: {train_time:.4f}")
print(f"Prediction Time: {pred_time:.4f}")
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy SVM: {accuracy_svm}")
# ROC AUC is computed from the signed decision scores.
roc_auc_svm = roc_auc_score(y_test, decision_scores)
print(f"ROC AUC SVM: {roc_auc_svm}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(y_test, y_pred_svm)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.title('Confusion Matrix SVM')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()
# The pixel visualisation code is analogous to the one above.

In [None]:
# k-nearest neighbours: exhaustive grid search over the neighbour count,
# the weighting scheme and the distance metric (5-fold CV, accuracy scoring).
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)
print(f"Лучшие параметры: {grid_search.best_params_}")

In [None]:
# Final KNN model: fixed hyper-parameters, timed fit / predict, evaluation.
best_params = {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
knn_best = KNeighborsClassifier(**best_params)

# Wall-clock time of fitting.
start_time = time.time()
knn_best.fit(X_train, y_train)
train_time = time.time() - start_time

# Wall-clock time of label prediction on the test split.
start_time = time.time()
y_pred_knn = knn_best.predict(X_test)
predict_time = time.time() - start_time

print(f"Training Time: {train_time}")
print(f"Prediction Time: {predict_time}")

# ROC AUC uses the predicted probability of class 1.
class1_proba_knn = knn_best.predict_proba(X_test)[:, 1]
accuracy_knn = accuracy_score(y_test, y_pred_knn)
roc_auc_knn = roc_auc_score(y_test, class1_proba_knn)
print(f"KNN Accuracy: {accuracy_knn}")
print(f"KNN ROC-AUC: {roc_auc_knn}")
print("Classification Report for KNN:")
print(classification_report(y_test, y_pred_knn))

# Confusion matrix rendered as an annotated heat map.
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
tick_names = ['Class 0', 'Class 1']
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_knn, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_names, yticklabels=tick_names)
plt.title('Confusion Matrix KNN')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

### Подведение итогов

In [None]:
# Feature-importance estimate with SHAP on a 1000-row sample of the test set.
# The sample serves both as the explainer's background data and as the set
# of points being explained.
X_test_sample = shap.sample(X_test, 1000)
# Explain the fitted model (knn_best). The bare `knn` object is only the
# template handed to GridSearchCV, which fits clones of it, so `knn` itself
# has never been fitted at this point and its predict() would fail.
explainer = shap.KernelExplainer(knn_best.predict, X_test_sample)
shap_values = explainer.shap_values(X_test_sample)
shap.summary_plot(shap_values, X_test_sample, feature_names=X.columns.tolist())

In [None]:
# Second experiment: one CSV file provides the training data and another one
# the test data; the same six feature columns and the 'class' target are used.
feature_cols = ['x', 'y', 'R', 'G', 'B', 'brightness']

df_1 = pd.read_csv('road_pixels_with_brightness1.csv')
df_2 = pd.read_csv('road_pixels_with_brightness2.csv')

X_train, y_train = df_1[feature_cols], df_1['class']
X_test, y_test = df_2[feature_cols], df_2['class']

In [None]:
# Hyper-parameter optimisation is analogous to the one shown above; the
# values below are fixed here. The model is fitted on the first file and
# evaluated on the second one.
best_params = {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
knn = KNeighborsClassifier(**best_params)

# Timed fit.
start_time = time.time()
knn.fit(X_train, y_train)
train_time = time.time() - start_time
print(f"Training Time: {train_time:.4f}")

# Timed label prediction.
start_time = time.time()
y_pred = knn.predict(X_test)
predict_time = time.time() - start_time
print(f"Prediction Time: {predict_time:.4f}")

accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy}")

# ROC AUC uses the predicted probability of class 1.
y_prob = knn.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {roc_auc}")

print("Отчет классификации:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heat map.
conf_matrix = confusion_matrix(y_test, y_pred)
axis_classes = ['Class 0', 'Class 1']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=axis_classes,
    yticklabels=axis_classes,
)
plt.xlabel('Предсказанный класс')
plt.ylabel('Истинный класс')
plt.title('Матрица ошибок')
plt.show()

In [None]:
# Scatter plot of the KNN predictions for the second data set.
# The plotting frame is rebuilt from the current X_test / y_pred; re-using the
# earlier `test_df` would show the logistic-regression predictions on the
# first data set instead of the results of the model evaluated just above.
pred_features = pd.DataFrame(
    X_test, columns=['x', 'y', 'R', 'G', 'B', 'brightness']
).reset_index(drop=True)
pred_labels = pd.DataFrame(y_pred, columns=['class'])
test_df = pd.concat([pred_features, pred_labels], axis=1)
test_class1 = test_df[test_df['class'] == 1]

plt.figure(figsize=(18, 12))
# All points are drawn in blue as a background; predicted class-1 points are
# drawn on top in red.
plt.scatter(test_df['x'], test_df['y'], c='blue', s=0.01,
            label='Класс 0 (предсказанные на тест) СИНИЙ')
plt.scatter(test_class1['x'], test_class1['y'],
            c='red', s=0.05, label='Класс 1 (предсказанные на тест) КРАСНЫЙ')
# No scaling is applied to this data set in the code shown for this section,
# so the axes are labelled as plain coordinates.
plt.xlabel('X координата')
plt.ylabel('Y координата')
# Flip the y axis so that y grows downwards (image-style orientation).
plt.gca().invert_yaxis()
plt.legend(loc='upper right')
plt.show()