In [3]:
# 01_create_datasets.py
from clearml import Task, Dataset
import pandas as pd
import numpy as np
import os
import shutil

# ======= 1. –°–æ–∑–¥–∞–µ–º Task –¥–ª—è —É–ø—Ä–∞–≤–ª–µ–Ω–∏—è –¥–∞—Ç–∞—Å–µ—Ç–∞–º–∏ =======
task = Task.init(
    project_name="Customer_Return_Prediction",
    task_name="Dataset Creation and Versioning",
    task_type=Task.TaskTypes.data_processing
)

# –õ–æ–≥–∏—Ä—É–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
task.set_parameter("dataset_versioning", {
    "raw_data_source": "transaction_.csv",
    "train_test_split": "temporal_split",
    "train_period": "until 2019-09-01",
    "test_period": "2019-09-01 to 2019-10-01"
})

# ======= 2. –í–ï–†–°–ò–Ø 1: –°—ã—Ä—ã–µ –¥–∞–Ω–Ω—ã–µ (–º–∏–Ω–∏–º–∞–ª—å–Ω–æ –æ—á–∏—â–µ–Ω–Ω—ã–µ) =======
print("üîπ –°–û–ó–î–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê V1: RAW DATA")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –∏ –º–∏–Ω–∏–º–∞–ª—å–Ω–æ –æ—á–∏—â–∞–µ–º –¥–∞–Ω–Ω—ã–µ
transaction = pd.read_csv('transaction_.csv')
rename_dict = {'clientID': 'client', 'trDte': 'visit_date', 'itemGroup': 'item_group'}
transaction = transaction.rename(columns=rename_dict)
transaction['visit_date'] = pd.to_datetime(transaction['visit_date'], format='%d.%m.%Y')

# –°–æ—Ö—Ä–∞–Ω—è–µ–º –æ—á–∏—â–µ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ
os.makedirs('datasets', exist_ok=True)
transaction.to_csv('datasets/raw_cleaned.csv', index=False)

# –°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç V1
dataset_v1 = Dataset.create(
    dataset_name="customer_data_raw",
    dataset_project="Customer_Return_Prediction",
    dataset_tags=["v1.0", "raw", "cleaned"]
)

# –ò—Å–ø—Ä–∞–≤–ª–µ–Ω–æ: –ø–µ—Ä–µ–¥–∞–µ–º —Å—Ç—Ä–æ–∫—É, –∞ –Ω–µ —Å–ø–∏—Å–æ–∫
dataset_v1.add_files('datasets/raw_cleaned.csv')
dataset_v1.set_metadata({
    "description": "–ú–∏–Ω–∏–º–∞–ª—å–Ω–æ –æ—á–∏—â–µ–Ω–Ω—ã–µ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–æ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ",
    "processing_steps": ["loading", "column_renaming", "date_parsing"],
    "rows": len(transaction),
    "columns": list(transaction.columns),
    "date_range": f"{transaction['visit_date'].min().date()} to {transaction['visit_date'].max().date()}"
})

dataset_v1.upload()
dataset_v1.finalize()

print(f"‚úÖ Dataset V1 —Å–æ–∑–¥–∞–Ω: {dataset_v1.id}")
task.upload_artifact("raw_data_sample", transaction.head(100))

# ======= 3. –í–ï–†–°–ò–Ø 2: –û–±—É—á–∞—é—â–∏–µ –∏ —Ç–µ—Å—Ç–æ–≤—ã–µ –≤—ã–±–æ—Ä–∫–∏ =======
print("\nüîπ –°–û–ó–î–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê V2: TRAIN/TEST SPLITS")

# –ò—Å–ø–æ–ª—å–∑—É–µ–º –≤–∞—à—É –ª–æ–≥–∏–∫—É –¥–ª—è —Å–æ–∑–¥–∞–Ω–∏—è train/test
def calculate_client_profile_at_date(df, observation_end_date):
    """–í–∞—à–∞ —Ñ—É–Ω–∫—Ü–∏—è –∏–∑ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ –∫–æ–¥–∞"""
    obs_date = pd.to_datetime(observation_end_date)
    visits = df[df['visit_date'] < obs_date].copy()
    
    if visits.empty:
        return pd.DataFrame()
    
    visits['is_weekend'] = visits['visit_date'].dt.dayofweek >= 5
    
    agg = visits.groupby('client').agg(
        last_visit=('visit_date', 'max'),
        visits=('visit_date', 'count'),
        unique_visits=('visit_date', 'nunique'),
        amount=('amount', 'sum'),
        quantity=('quantity', 'sum'),
        items=('item', 'nunique'),
        weekends=('is_weekend', 'sum')
    ).reset_index()
    
    agg['Recency'] = (obs_date - agg['last_visit']).dt.days
    agg['Frequency'] = agg['unique_visits']
    agg['Monetary'] = agg['amount']
    agg['avg_check'] = agg['amount'] / agg['Frequency']
    agg['avg_items'] = agg['quantity'] / agg['Frequency']
    
    last_amount = visits.loc[visits.groupby('client')['visit_date'].idxmax()]
    agg = agg.merge(last_amount[['client', 'amount']].rename(
        columns={'amount': 'last_visit_amount'}), on='client')
    
    return agg

def mark_events(df, result_start_date, result_end_date):
    """–í–∞—à–∞ —Ñ—É–Ω–∫—Ü–∏—è —Ä–∞–∑–º–µ—Ç–∫–∏ —Å–æ–±—ã—Ç–∏–π"""
    start_date = pd.to_datetime(result_start_date)
    end_date = pd.to_datetime(result_end_date)
    
    base_clients = df['client'].unique()
    period_mask = (df['visit_date'] >= start_date) & (df['visit_date'] < end_date)
    active_clients = df.loc[period_mask, 'client'].unique()
    
    return pd.DataFrame({
        'client': base_clients,
        'event': [1 if client in active_clients else 0 for client in base_clients]
    })

# –°–æ–∑–¥–∞–µ–º train –¥–∞–Ω–Ω—ã–µ
train_profile = calculate_client_profile_at_date(transaction, '2019-09-01')
train_events = mark_events(transaction, '2019-09-01', '2019-10-01')
train_df = train_profile.merge(train_events, on='client', how='inner')
train_df['event'] = train_df['event'].astype(int)
train_df = train_df.drop(columns=[col for col in train_df.columns if 'date' in col.lower()], errors='ignore')

# –°–æ–∑–¥–∞–µ–º test –¥–∞–Ω–Ω—ã–µ
test_profile = calculate_client_profile_at_date(transaction, '2019-10-01')
test_events = mark_events(transaction, '2019-10-01', '2019-11-01')
test_df = test_profile.merge(test_events, on='client', how='inner')
test_df['event'] = test_df['event'].astype(int)
test_df = test_df.drop(columns=[col for col in test_df.columns if 'date' in col.lower()], errors='ignore')

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ñ–∞–π–ª—ã
train_df.to_csv('datasets/train_data_v2.csv', index=False)
test_df.to_csv('datasets/test_data_v2.csv', index=False)

# –°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç V2 —Å –Ω–∞—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ–º –æ—Ç V1
dataset_v2 = Dataset.create(
    dataset_name="customer_data_train_test",
    dataset_project="Customer_Return_Prediction",
    dataset_tags=["v2.0", "processed", "train_test_split"],
    parent_datasets=[dataset_v1.id]  # –ù–∞—Å–ª–µ–¥—É–µ–º –æ—Ç V1!
)

# –ò—Å–ø—Ä–∞–≤–ª–µ–Ω–æ: –¥–æ–±–∞–≤–ª—è–µ–º —Ñ–∞–π–ª—ã –ø–æ –æ–¥–Ω–æ–º—É
dataset_v2.add_files('datasets/train_data_v2.csv')
dataset_v2.add_files('datasets/test_data_v2.csv')

# –ò–ª–∏ –º–æ–∂–Ω–æ –¥–æ–±–∞–≤–∏—Ç—å –≤—Å—é –ø–∞–ø–∫—É:
# dataset_v2.add_files('datasets/')

dataset_v2.set_metadata({
    "description": "–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ —Å train/test —Ä–∞–∑–±–∏–µ–Ω–∏–µ–º",
    "parent_dataset": dataset_v1.id,
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "features_count": len(train_df.columns) - 1,  # –±–µ–∑ target
    "target_distribution_train": {
        "returned": int(train_df['event'].sum()),
        "not_returned": int(len(train_df) - train_df['event'].sum()),
        "return_rate": float(train_df['event'].mean())
    },
    "target_distribution_test": {
        "returned": int(test_df['event'].sum()),
        "not_returned": int(len(test_df) - test_df['event'].sum()),
        "return_rate": float(test_df['event'].mean())
    }
})

dataset_v2.upload()
dataset_v2.finalize()

print(f"‚úÖ Dataset V2 —Å–æ–∑–¥–∞–Ω: {dataset_v2.id}")
print(f"   –ù–∞—Å–ª–µ–¥—É–µ—Ç –æ—Ç: {dataset_v1.id}")

# ======= 4. –õ–æ–≥–∏—Ä—É–µ–º —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –≤ Task =======
task.upload_artifact("train_data_sample", train_df.head(100))
task.upload_artifact("test_data_sample", test_df.head(100))

stats = {
    "datasets_created": {
        "v1_raw": dataset_v1.id,
        "v2_train_test": dataset_v2.id
    },
    "data_statistics": {
        "raw_rows": len(transaction),
        "train_rows": len(train_df),
        "test_rows": len(test_df),
        "train_features": list(train_df.columns),
        "train_target_balance": f"{train_df['event'].mean():.1%}",
        "test_target_balance": f"{test_df['event'].mean():.1%}"
    }
}

task.upload_artifact("dataset_statistics", stats)

print("\n" + "="*60)
print("‚úÖ –í–°–ï –î–ê–¢–ê–°–ï–¢–´ –°–û–ó–î–ê–ù–´ –ò –í–ï–†–°–ò–û–ù–ò–†–û–í–ê–ù–´!")
print("="*60)
print(f"V1 (Raw):      {dataset_v1.id}")
print(f"V2 (Train/Test): {dataset_v2.id}")
print(f"Lineage: V1 ‚Üí V2")

task.close()

üîπ –°–û–ó–î–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê V1: RAW DATA
ClearML results page: https://app.clear.ml/projects/c6b27df2c1d343f5b8ded148b93dfe5e/experiments/039ed9d78d9e49b5a7efbd604c8ce0b5/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/c6b27df2c1d343f5b8ded148b93dfe5e/experiments/039ed9d78d9e49b5a7efbd604c8ce0b5
Displaying metadata in the UI is only supported for pandas Dataframes for now. Skipping!
Uploading dataset changes (1 files compressed to 9.9 MiB) to https://files.clear.ml


‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 100% | 9.90/9.9 MB [00:02<00:00,  3.84MB/s]: 


File compression and upload completed: total size 9.9 MiB, 1 chunk(s) stored (average size 9.9 MiB)
‚úÖ Dataset V1 —Å–æ–∑–¥–∞–Ω: 039ed9d78d9e49b5a7efbd604c8ce0b5

üîπ –°–û–ó–î–ê–ù–ò–ï –î–ê–¢–ê–°–ï–¢–ê V2: TRAIN/TEST SPLITS
ClearML results page: https://app.clear.ml/projects/6c22464c761a410b84f5e312a13b6663/experiments/3520fae4f5044740872128c239c09ff9/output/log
ClearML dataset page: https://app.clear.ml/datasets/simple/6c22464c761a410b84f5e312a13b6663/experiments/3520fae4f5044740872128c239c09ff9
Displaying metadata in the UI is only supported for pandas Dataframes for now. Skipping!
Uploading dataset changes (2 files compressed to 3.86 MiB) to https://files.clear.ml
File compression and upload completed: total size 3.86 MiB, 1 chunk(s) stored (average size 3.86 MiB)
‚úÖ Dataset V2 —Å–æ–∑–¥–∞–Ω: 3520fae4f5044740872128c239c09ff9
   –ù–∞—Å–ª–µ–¥—É–µ—Ç –æ—Ç: 039ed9d78d9e49b5a7efbd604c8ce0b5

‚úÖ –í–°–ï –î–ê–¢–ê–°–ï–¢–´ –°–û–ó–î–ê–ù–´ –ò –í–ï–†–°–ò–û–ù–ò–†–û–í–ê–ù–´!
V1 (Raw):      039ed9d

In [4]:
# 02_logistic_regression_experiment.py
from clearml import Task, Dataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import pickle
import joblib
import os

# ======= 1. –°–æ–∑–¥–∞–µ–º Task –¥–ª—è —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ =======
task = Task.init(
    project_name="Customer_Return_Prediction",
    task_name="Logistic Regression Experiment",
    task_type=Task.TaskTypes.training
)

# ======= 2. –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç V2 —á–µ—Ä–µ–∑ ClearML =======
print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ ClearML Dataset...")
dataset_v2 = Dataset.get(
    dataset_name="customer_data_train_test",
    dataset_project="Customer_Return_Prediction",
    dataset_tags=["v2.0", "processed", "train_test_split"],
    only_completed=True
)

# –°–∫–∞—á–∏–≤–∞–µ–º –¥–∞–Ω–Ω—ã–µ –ª–æ–∫–∞–ª—å–Ω–æ
local_path = dataset_v2.get_local_copy()
print(f"–î–∞–Ω–Ω—ã–µ —Å–∫–∞—á–∞–Ω—ã –≤: {local_path}")

# –ó–∞–≥—Ä—É–∂–∞–µ–º train/test —Ñ–∞–π–ª—ã
train_path = os.path.join(local_path, 'train_data_v2.csv')
test_path = os.path.join(local_path, 'test_data_v2.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(f"Train data: {train_df.shape}")
print(f"Test data: {test_df.shape}")

# ======= 3. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö =======
X_train = train_df.drop(columns=['client', 'event'], errors='ignore')
y_train = train_df['event']
X_test = test_df.drop(columns=['client', 'event'], errors='ignore')
y_test = test_df['event']

# –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —á–∏—Å–ª–æ–≤—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

# –°–∏–Ω—Ö—Ä–æ–Ω–∏–∑–∞—Ü–∏—è –∫–æ–ª–æ–Ω–æ–∫
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# ======= 4. –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –º–æ–¥–µ–ª–∏ =======
hyperparameters = {
    "model_type": "LogisticRegression",
    "penalty": "l2",           # L2 —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è
    "C": 1.0,                  # –û–±—Ä–∞—Ç–Ω–∞—è —Å–∏–ª–∞ —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏–∏
    "solver": "lbfgs",         # –ê–ª–≥–æ—Ä–∏—Ç–º –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏
    "max_iter": 1000,          # –ú–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∏—Ç–µ—Ä–∞—Ü–∏–π
    "random_state": 42,
    "class_weight": None,      # –í–µ—Å–∞ –∫–ª–∞—Å—Å–æ–≤
    "fit_intercept": True      # –î–æ–±–∞–≤–ª—è—Ç—å intercept
}

# –õ–æ–≥–∏—Ä—É–µ–º –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –≤ ClearML (3.4)
task.connect(hyperparameters)
print("‚úÖ –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã –≤ ClearML")

# ======= 5. –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ =======
print("\nüîß –û–±—É—á–µ–Ω–∏–µ Logistic Regression...")
model = LogisticRegression(
    penalty=hyperparameters["penalty"],
    C=hyperparameters["C"],
    solver=hyperparameters["solver"],
    max_iter=hyperparameters["max_iter"],
    random_state=hyperparameters["random_state"],
    class_weight=hyperparameters["class_weight"],
    fit_intercept=hyperparameters["fit_intercept"]
)

model.fit(X_train, y_train)

# ======= 6. –ü—Ä–æ–≥–Ω–æ–∑–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ –æ—Ü–µ–Ω–∫–∞ =======
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# –í—ã—á–∏—Å–ª—è–µ–º –º–µ—Ç—Ä–∏–∫–∏
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = (y_pred == y_test).mean()
precision = ((y_pred == 1) & (y_test == 1)).sum() / max((y_pred == 1).sum(), 1)
recall = ((y_pred == 1) & (y_test == 1)).sum() / max(y_test.sum(), 1)
f1 = 2 * precision * recall / max(precision + recall, 1e-10)

# –õ–æ–≥–∏—Ä—É–µ–º –º–µ—Ç—Ä–∏–∫–∏ –≤ ClearML (3.5)
task.get_logger().report_scalar(
    title="Model Metrics",
    series="ROC-AUC",
    value=roc_auc,
    iteration=0
)

task.get_logger().report_scalar(
    title="Model Metrics",
    series="Accuracy",
    value=accuracy,
    iteration=0
)

task.get_logger().report_scalar(
    title="Model Metrics", 
    series="F1-Score",
    value=f1,
    iteration=0
)

print(f"\nüìä –ú–ï–¢–†–ò–ö–ò –ú–û–î–ï–õ–ò:")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# ======= 7. –í–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è =======
# ROC-–∫—Ä–∏–≤–∞—è
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Model', alpha=0.5)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
task.get_logger().report_matplotlib_figure(
    title="ROC Curve",
    series="Logistic Regression",
    figure=plt.gcf(),
    iteration=0
)
plt.close()

# –ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
task.get_logger().report_matplotlib_figure(
    title="Confusion Matrix",
    series="Logistic Regression", 
    figure=plt.gcf(),
    iteration=0
)
plt.close()

# ======= 8. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –∫–∞–∫ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç–∞ (3.3) =======
print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
os.makedirs('models', exist_ok=True)
model_filename = 'models/logistic_regression_model.pkl'

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —á–µ—Ä–µ–∑ joblib (–ª—É—á—à–µ –¥–ª—è sklearn –º–æ–¥–µ–ª–µ–π)
joblib.dump(model, model_filename)
print(f"–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: {model_filename}")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –∫–∞–∫ –∞—Ä—Ç–µ—Ñ–∞–∫—Ç –≤ ClearML
task.upload_artifact(
    name="logistic_regression_model",
    artifact_object=model_filename,
    metadata={
        "model_type": "LogisticRegression",
        "hyperparameters": hyperparameters,
        "metrics": {
            "roc_auc": roc_auc,
            "accuracy": accuracy,
            "f1_score": f1
        },
        "dataset_version": dataset_v2.id
    }
)

# –¢–∞–∫–∂–µ —Å–æ—Ö—Ä–∞–Ω—è–µ–º —á–µ—Ä–µ–∑ pickle –¥–ª—è –ø–æ–ª–Ω–æ—Ç—ã
with open('models/logistic_regression_model_pickle.pkl', 'wb') as f:
    pickle.dump(model, f)

# ======= 9. –í–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ =======
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': abs(model.coef_[0])
}).sort_values('coefficient', ascending=False)

task.upload_artifact("feature_importance", feature_importance)

# ======= 10. –ò—Ç–æ–≥–æ–≤–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è =======
print("\n" + "="*60)
print("‚úÖ –≠–ö–°–ü–ï–†–ò–ú–ï–ù–¢ –ó–ê–í–ï–†–®–ï–ù!")
print("="*60)
print(f"Task ID: {task.id}")
print(f"Dataset used: {dataset_v2.id}")
print(f"Model saved: {model_filename}")
print(f"ROC-AUC: {roc_auc:.4f}")

task.close()

Can't get url information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages


ClearML Task: created new task id=58f1a07bd9f74a22881b7e2417b1469b


Can't get branch information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages
Can't get commit information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages
Can't get diff information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages


ClearML results page: https://app.clear.ml/projects/ebbc67863e744623a4f3e7490e757670/experiments/58f1a07bd9f74a22881b7e2417b1469b/output/log
üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ ClearML Dataset...
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring



clamping frac to range [0, 1]

‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 100% | 9.90/9.9 MB [00:01<00:00,  9.46MB/s]: 


–î–∞–Ω–Ω—ã–µ —Å–∫–∞—á–∞–Ω—ã –≤: C:/Users/Ksenia/.clearml/cache/storage_manager/datasets/ds_3520fae4f5044740872128c239c09ff9
Train data: (39906, 15)
Test data: (41196, 15)
‚úÖ –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã –≤ ClearML

üîß –û–±—É—á–µ–Ω–∏–µ Logistic Regression...



lbfgs failed to converge after 1000 iteration(s) (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




üìä –ú–ï–¢–†–ò–ö–ò –ú–û–î–ï–õ–ò:
ROC-AUC: 0.7980
Accuracy: 0.8274
Precision: 0.6385
Recall: 0.1966
F1-Score: 0.3006

üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...
–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: models/logistic_regression_model.pkl

‚úÖ –≠–ö–°–ü–ï–†–ò–ú–ï–ù–¢ –ó–ê–í–ï–†–®–ï–ù!
Task ID: 58f1a07bd9f74a22881b7e2417b1469b
Dataset used: 3520fae4f5044740872128c239c09ff9
Model saved: models/logistic_regression_model.pkl
ROC-AUC: 0.7980


In [None]:
# 03_xgboost_experiment.py
from clearml import Task, Dataset
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import joblib
import os

# ======= 1. –°–æ–∑–¥–∞–µ–º Task –¥–ª—è —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ =======
task = Task.init(
    project_name="Customer_Return_Prediction", 
    task_name="XGBoost Experiment",
    task_type=Task.TaskTypes.training
)

# ======= 2. –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç V2 =======
print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ ClearML Dataset...")
dataset_v2 = Dataset.get(
    dataset_name="customer_data_train_test",
    dataset_project="Customer_Return_Prediction",
    dataset_tags=["v2.0", "processed", "train_test_split"],
    only_completed=True
)

local_path = dataset_v2.get_local_copy()
train_path = os.path.join(local_path, 'train_data_v2.csv')
test_path = os.path.join(local_path, 'test_data_v2.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# ======= 3. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö =======
X_train = train_df.drop(columns=['client', 'event'], errors='ignore')
y_train = train_df['event']
X_test = test_df.drop(columns=['client', 'event'], errors='ignore')
y_test = test_df['event']

X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# ======= 4. –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã XGBoost =======
hyperparameters = {
    "model_type": "XGBClassifier",
    "n_estimators": 150,           # –£–≤–µ–ª–∏—á–∏–ª–∏ —Å 100 (–≤ –ø—Ä–æ—à–ª–æ–º –∫–æ–¥–µ)
    "max_depth": 5,                # –£–≤–µ–ª–∏—á–∏–ª–∏ —Å 3
    "learning_rate": 0.05,         # –£–º–µ–Ω—å—à–∏–ª–∏ —Å 0.1
    "subsample": 0.8,              # –î–æ–±–∞–≤–∏–ª–∏ subsample
    "colsample_bytree": 0.8,       # –î–æ–±–∞–≤–∏–ª–∏ colsample
    "reg_alpha": 0.1,              # L1 —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è
    "reg_lambda": 1.0,             # L2 —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è
    "random_state": 42,
    "eval_metric": "logloss",
    "use_label_encoder": False
}

# –õ–æ–≥–∏—Ä—É–µ–º –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã (3.4)
task.connect(hyperparameters)
print("‚úÖ –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã XGBoost –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã")

# ======= 5. –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ =======
print("\nüîß –û–±—É—á–µ–Ω–∏–µ XGBoost...")
model = XGBClassifier(
    n_estimators=hyperparameters["n_estimators"],
    max_depth=hyperparameters["max_depth"],
    learning_rate=hyperparameters["learning_rate"],
    subsample=hyperparameters["subsample"],
    colsample_bytree=hyperparameters["colsample_bytree"],
    reg_alpha=hyperparameters["reg_alpha"],
    reg_lambda=hyperparameters["reg_lambda"],
    random_state=hyperparameters["random_state"],
    eval_metric=hyperparameters["eval_metric"],
    use_label_encoder=hyperparameters["use_label_encoder"]
)

model.fit(X_train, y_train)

# ======= 6. –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ =======
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = (y_pred == y_test).mean()

# –õ–æ–≥–∏—Ä—É–µ–º –º–µ—Ç—Ä–∏–∫–∏ (3.5)
task.get_logger().report_scalar(
    title="Model Metrics",
    series="ROC-AUC",
    value=roc_auc,
    iteration=0
)

task.get_logger().report_scalar(
    title="Model Metrics",
    series="Accuracy", 
    value=accuracy,
    iteration=0
)

print(f"\nüìä –ú–ï–¢–†–ò–ö–ò XGBoost:")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# ======= 7. –°—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å Logistic Regression =======
# –ú–æ–∂–Ω–æ –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–µ—Ç—Ä–∏–∫–∏ –∏–∑ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
print("\nüîç –°–†–ê–í–ù–ï–ù–ò–ï –° LOGISTIC REGRESSION:")
print("(–í —Ä–µ–∞–ª—å–Ω–æ–º —Å—Ü–µ–Ω–∞—Ä–∏–∏ –Ω—É–∂–Ω–æ –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–µ—Ç—Ä–∏–∫–∏ –∏–∑ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ Task)")
print("XGBoost –æ–±—ã—á–Ω–æ –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç –ª—É—á—à–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –Ω–∞ —Å–ª–æ–∂–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö")

# ======= 8. –í–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è =======
# ROC-–∫—Ä–∏–≤–∞—è
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'XGBoost (AUC = {roc_auc:.3f})', linewidth=2, color='green')
plt.plot([0, 1], [0, 1], 'k--', label='Random Model', alpha=0.5)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - XGBoost')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
task.get_logger().report_matplotlib_figure(
    title="ROC Curve",
    series="XGBoost",
    figure=plt.gcf(),
    iteration=0
)
plt.close()

# –í–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
plt.barh(importance_df['feature'][:15], importance_df['importance'][:15])
plt.xlabel('Feature Importance')
plt.title('Top 15 Feature Importance - XGBoost')
plt.gca().invert_yaxis()
task.get_logger().report_matplotlib_figure(
    title="Feature Importance",
    series="XGBoost",
    figure=plt.gcf(),
    iteration=0
)
plt.close()

# ======= 9. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ (3.3) =======
print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ XGBoost –º–æ–¥–µ–ª–∏...")
model_filename = 'models/xgboost_model.pkl'
joblib.dump(model, model_filename)

task.upload_artifact(
    name="xgboost_model",
    artifact_object=model_filename,
    metadata={
        "model_type": "XGBClassifier",
        "hyperparameters": hyperparameters,
        "metrics": {
            "roc_auc": roc_auc,
            "accuracy": accuracy
        },
        "dataset_version": dataset_v2.id,
        "top_features": importance_df['feature'].head(5).tolist()
    }
)

# ======= 10. –í—ã–≤–æ–¥—ã –¥–ª—è –ø–æ—è—Å–Ω–∏—Ç–µ–ª—å–Ω–æ–π –∑–∞–ø–∏—Å–∫–∏ =======
print("\n" + "="*60)
print("üìù –î–õ–Ø –ü–û–Ø–°–ù–ò–¢–ï–õ–¨–ù–û–ô –ó–ê–ü–ò–°–ö–ò:")
print("="*60)
print("\n–ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –≤–∞—Ä—å–∏—Ä–æ–≤–∞–ª–∏—Å—å:")
print("1. n_estimators: 100 ‚Üí 150 (—É–≤–µ–ª–∏—á–µ–Ω–∏–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –¥–µ—Ä–µ–≤—å–µ–≤)")
print("2. max_depth: 3 ‚Üí 5 (—É–≤–µ–ª–∏—á–µ–Ω–∏–µ –≥–ª—É–±–∏–Ω—ã –¥–µ—Ä–µ–≤—å–µ–≤)")
print("3. learning_rate: 0.1 ‚Üí 0.05 (—É–º–µ–Ω—å—à–µ–Ω–∏–µ —Å–∫–æ—Ä–æ—Å—Ç–∏ –æ–±—É—á–µ–Ω–∏—è)")
print("4. –î–æ–±–∞–≤–ª–µ–Ω—ã —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏–∏: reg_alpha, reg_lambda")
print("5. –î–æ–±–∞–≤–ª–µ–Ω—ã: subsample, colsample_bytree")

print("\n–ö–∞–∫ –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–Ω–æ –≤ ClearML:")
print("- –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω Task.connect() –¥–ª—è –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤")
print("- –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –≤–∏–¥–Ω—ã –≤–æ –≤–∫–ª–∞–¥–∫–µ Configuration")
print("- –ö–∞–∂–¥—ã–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç –∏–º–µ–µ—Ç —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –Ω–∞–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤")

print("\n–°—Ä–∞–≤–Ω–∏—Ç–µ–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑ (–Ω—É–∂–Ω–æ –≤—ã–ø–æ–ª–Ω–∏—Ç—å –æ–±–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞):")
print("1. Logistic Regression: –ø—Ä–æ—â–µ, –∏–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∏—Ä—É–µ–º, –±—ã—Å—Ç—Ä–µ–µ")
print("2. XGBoost: —Å–ª–æ–∂–Ω–µ–µ, –æ–±—ã—á–Ω–æ –ª—É—á—à–µ —Ç–æ—á–Ω–æ—Å—Ç—å, –º–µ–¥–ª–µ–Ω–Ω–µ–µ –æ–±—É—á–µ–Ω–∏–µ")
print("3. –°—Ä–∞–≤–Ω–∏—Ç—å ROC-AUC –∏–∑ –æ–±–æ–∏—Ö —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–æ–≤")
print("4. –ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å –≤–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ –æ–±–µ–∏—Ö –º–æ–¥–µ–ª—è—Ö")

print("\n" + "="*60)
print("‚úÖ –≠–ö–°–ü–ï–†–ò–ú–ï–ù–¢ XGBoost –ó–ê–í–ï–†–®–ï–ù!")
print("="*60)

task.close()

Can't get url information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages


ClearML Task: created new task id=eb2fabf954da4fd1bac6f2017e55231f


Can't get branch information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages
Can't get commit information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages
Can't get diff information for git repo in C:\Users\Ksenia\AppData\Roaming\Python\Python311\site-packages


ClearML results page: https://app.clear.ml/projects/ebbc67863e744623a4f3e7490e757670/experiments/eb2fabf954da4fd1bac6f2017e55231f/output/log
üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ ClearML Dataset...
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring
‚úÖ –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã XGBoost –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω—ã

üîß –û–±—É—á–µ–Ω–∏–µ XGBoost...



Parameters: { "use_label_encoder" } are not used.





üìä –ú–ï–¢–†–ò–ö–ò XGBoost:
ROC-AUC: 0.8075
Accuracy: 0.8337

üîç –°–†–ê–í–ù–ï–ù–ò–ï –° LOGISTIC REGRESSION:
(–í —Ä–µ–∞–ª—å–Ω–æ–º —Å—Ü–µ–Ω–∞—Ä–∏–∏ –Ω—É–∂–Ω–æ –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–µ—Ç—Ä–∏–∫–∏ –∏–∑ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ Task)
XGBoost –æ–±—ã—á–Ω–æ –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç –ª—É—á—à–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –Ω–∞ —Å–ª–æ–∂–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö



No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.




üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ XGBoost –º–æ–¥–µ–ª–∏...

üìù –î–õ–Ø –ü–û–Ø–°–ù–ò–¢–ï–õ–¨–ù–û–ô –ó–ê–ü–ò–°–ö–ò:

–ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –≤–∞—Ä—å–∏—Ä–æ–≤–∞–ª–∏—Å—å:
1. n_estimators: 100 ‚Üí 150 (—É–≤–µ–ª–∏—á–µ–Ω–∏–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –¥–µ—Ä–µ–≤—å–µ–≤)
2. max_depth: 3 ‚Üí 5 (—É–≤–µ–ª–∏—á–µ–Ω–∏–µ –≥–ª—É–±–∏–Ω—ã –¥–µ—Ä–µ–≤—å–µ–≤)
3. learning_rate: 0.1 ‚Üí 0.05 (—É–º–µ–Ω—å—à–µ–Ω–∏–µ —Å–∫–æ—Ä–æ—Å—Ç–∏ –æ–±—É—á–µ–Ω–∏—è)
4. –î–æ–±–∞–≤–ª–µ–Ω—ã —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏–∏: reg_alpha, reg_lambda
5. –î–æ–±–∞–≤–ª–µ–Ω—ã: subsample, colsample_bytree

–ö–∞–∫ –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–Ω–æ –≤ ClearML:
- –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω Task.connect() –¥–ª—è –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
- –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –≤–∏–¥–Ω—ã –≤–æ –≤–∫–ª–∞–¥–∫–µ Configuration
- –ö–∞–∂–¥—ã–π —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç –∏–º–µ–µ—Ç —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –Ω–∞–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤

–°—Ä–∞–≤–Ω–∏—Ç–µ–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑ (–Ω—É–∂–Ω–æ –≤—ã–ø–æ–ª–Ω–∏—Ç—å –æ–±–∞ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞):
1. Logistic Regress

Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000014A86C80DD0>: Failed to establish a new connection: [WinError 10065] –°–¥–µ–ª–∞–Ω–∞ –ø–æ–ø—ã—Ç–∫–∞ –≤—ã–ø–æ–ª–Ω–∏—Ç—å –æ–ø–µ—Ä–∞—Ü–∏—é –Ω–∞ —Å–æ–∫–µ—Ç–µ –¥–ª—è –Ω–µ–¥–æ—Å—Ç—É–ø–Ω–æ–≥–æ —Ö–æ—Å—Ç–∞')': /v2.23/tasks.ping
Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000014AE1499050>: Failed to resolve 'api.clear.ml' ([Errno 11001] getaddrinfo failed)")': /v2.23/tasks.ping
Retrying (Retry(total=236, connect=236, read=240, redirect=240, status=240)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000014A82B7A810>: Failed to resolve 'api.clear.ml' ([Errno 11001] getaddrinfo failed)")': /v2.23/tasks.ping
Retrying (Retry(total=235, connect=235