In [13]:
import pandas as pd
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from geopy.distance import geodesic

In [4]:
# Load the OTP spreadsheets: training frame, test frame, and the submission file
# (presumably the test labels -- a 'TARGET' column is read from the merged test
# frame further down).
train = pd.read_excel(r'Z:\DATASETS\otp\train.xls')
test = pd.read_excel(r'Z:\DATASETS\otp\test.xls')
y_test = pd.read_excel(r'Z:\DATASETS\otp\submission.xls')

# Training labels kept as a standalone Series.
y_train = train['TARGET']

# Append the submission columns to the test frame side by side (aligned on the row index).
test = pd.concat([test, y_test], axis=1)

In [None]:
# Binary classification task scored by ROC AUC (lab 10).
task1 = Task('binary', metric='auc')

# Column roles: 'TARGET' is the label column of the training frame.
roles = {'target': 'TARGET'}

# LightAutoML preset restricted to the 'linear_l2' algorithm (linear models only).
automl1 = TabularAutoML(
    task=task1,
    timeout=300,   # time budget, seconds
    cpu_limit=4,   # the machine has between 4 and 8 cores
    general_params={'use_algos': [['linear_l2']]},
    reader_params={'n_jobs': 4, 'cv': 3, 'random_state': 42},
)

# Fit on the training frame; fit_predict also returns predictions for the training
# data (presumably out-of-fold, as the variable name suggests).
oof_pred = automl1.fit_predict(train_data=train, roles=roles, verbose=1)

# Separate the held-out labels from the features.
y_test = test['TARGET']
X_test = test.drop(columns='TARGET')

# Score the held-out rows and evaluate with ROC AUC on the first prediction column.
test_pred = automl1.predict(X_test)
auc_score = roc_auc_score(y_test, test_pred.data[:, 0])
print(f"\nTest AUC score: {auc_score:.4f}")

[17:43:10] Stdout logging level is INFO.
[17:43:10] Task: binary

[17:43:10] Start automl preset with listed constraints:
[17:43:10] - time: 300.00 seconds
[17:43:10] - CPU: 4 cores
[17:43:10] - memory: 16 GB

[17:43:10] [1mTrain data shape: (15223, 52)[0m

[17:43:11] Layer [1m1[0m train process start. Time left 299.07 secs
[17:43:12] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[17:43:15] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.6887462690424304[0m
[17:43:15] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[17:43:15] Time left 295.17 secs

[17:43:15] [1mLayer 1 training completed.[0m

[17:43:15] [1mAutoml preset training completed in 4.84 seconds[0m

[17:43:15] Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (3 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) 


Test AUC score: 0.6951


In [14]:
# Regression task: real-estate prices.
# NOTE: this cell rebinds `train` and `test`, replacing the frames loaded for the
# classification task; reload those before re-running the classification cell.

df2 = pd.read_csv(r'Z:\DATASETS\all_v2.csv')

df2 = df2[df2['region'] == 2661]  # the only filter: keep rows of a single region code

# Copy the splits so the column assignments below write to independent frames
# (assigning onto the objects returned by train_test_split may raise
# SettingWithCopyWarning).
train, test = train_test_split(df2, test_size=0.2, random_state=42)
train = train.copy()
test = test.copy()

# Age of each record in days, measured from ONE reference date taken from the
# training split. Using a separate max for each split would give the same feature
# a different origin in train and test. The dates are parsed before taking the
# max so the reference is the latest date chronologically, not the
# lexicographically largest string.
train_dates = pd.to_datetime(train['date'])
reference_date = train_dates.max()

train['date_difference'] = (reference_date - train_dates).dt.days
# May be negative for test rows dated after the latest training date.
test['date_difference'] = (reference_date - pd.to_datetime(test['date'])).dt.days

target_point = (59.938962, 30.315586)  # centre of St. Petersburg as (latitude, longitude)


def calculate_distance(row, target):
    """Return the geodesic distance in kilometres from a row's coordinates to ``target``.

    Args:
        row: Mapping or pandas Series exposing 'geo_lat' and 'geo_lon'.
        target: (latitude, longitude) pair to measure the distance to.

    Returns:
        The distance in kilometres as computed by ``geopy.distance.geodesic``.
    """
    point = (row['geo_lat'], row['geo_lon'])
    return geodesic(point, target).kilometers


# The helper is defined once and applied to both splits (it used to be defined twice,
# the second definition silently shadowing the first). Only the two coordinate columns
# are handed to apply, so each per-row Series stays small.
for frame in (train, test):
    frame['distance_to_the_center_km'] = frame[['geo_lat', 'geo_lon']].apply(
        calculate_distance, target=target_point, axis=1
    )

# Regression task scored by R2 ('mae', 'mse' or 'rmse' could be used instead).
task2 = Task('reg', metric='r2')

# Column roles: 'price' is the target; nothing is explicitly dropped.
roles = {'target': 'price', 'drop': []}

# LightAutoML preset for regression: linear model plus LightGBM (default and tuned).
automl = TabularAutoML(
    task=task2,
    timeout=600,   # 10 minutes for training
    cpu_limit=4,
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
    reader_params={'n_jobs': 4, 'cv': 5, 'random_state': 42},
)

# Train the model on the training split.
oof_pred = automl.fit_predict(train_data=train, roles=roles, verbose=1)

# Separate the held-out target from the features.
y_test = test['price']
X_test = test.drop(columns='price')

# Predict on the held-out split.
test_pred = automl.predict(X_test)

# Evaluate with R2 and MAPE.
predicted_prices = test_pred.data
r2 = r2_score(y_test, predicted_prices)
mape = mean_absolute_percentage_error(y_test, predicted_prices)

print(f"\nR2 score: {r2:.4f}")
print(f"MAPE: {mape:.4f}")

[18:11:45] Stdout logging level is INFO.
[18:11:45] Task: reg

[18:11:45] Start automl preset with listed constraints:
[18:11:45] - time: 600.00 seconds
[18:11:45] - CPU: 4 cores
[18:11:45] - memory: 16 GB

[18:11:45] [1mTrain data shape: (369456, 15)[0m



  self.data[i] = pd.to_datetime(
  self.data[i] = pd.to_datetime(


[18:12:26] Layer [1m1[0m train process start. Time left 559.02 secs
[18:12:45] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[18:12:50] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-0.30991482734680176[0m
[18:12:50] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[18:12:50] Time left 535.07 secs

[18:12:54] [1mSelector_LightGBM[0m fitting and predicting completed
[18:13:21] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[18:14:12] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.44658273458480835[0m
[18:14:12] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[18:14:12] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 94.17 secs


Optimization Progress:  10%|▉         | 10/101 [01:45<15:58, 10.53s/it, best_trial=8, best_value=0.447]

[18:15:57] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[18:15:57] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...





[18:16:43] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.45220017433166504[0m
[18:16:43] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[18:16:43] Time left 302.22 secs

[18:16:43] [1mLayer 1 training completed.[0m

[18:16:43] Blending: optimization starts with equal weights. Score = [1m0.3690357[0m
[18:16:43] Blending: iteration [1m0[0m: score = [1m0.4533717[0m, weights = [1m[0.         0.29349294 0.706507  ][0m
[18:16:43] Blending: no improvements for score. Terminated.

[18:16:43] Blending: best score = [1m0.4533717[0m, best weights = [1m[0.         0.29349294 0.706507  ][0m
[18:16:43] [1mAutoml preset training completed in 298.49 seconds[0m

[18:16:43] Model description:
Final prediction for new objects (level 0) = 
	 0.29349 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.70651 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) 



  self.data[i] = pd.to_datetime(



R2 score: 0.8502
MAPE: 0.5209
