In [1701]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [1702]:
orders = pd.read_csv("data/orders.csv")
train = pd.read_csv("data/train_target.csv")
october = pd.read_csv("data/clients_promo_october.csv")
mobile = pd.read_csv("data/mobile_events.csv")
test = pd.read_csv("data/test.csv")
submit = pd.read_csv("data/submit.csv")

In [1703]:
orders['Date'] = pd.to_datetime(orders['Date'])

In [1704]:
# Установим текущую дату (для примера 2023-11-01)
current_date = pd.to_datetime('2023-11-01')

# Функция для создания признаков из дат заказов для одного пользователя
def create_features_for_user(df, current_date):
    features = {}
    ClientUUId = df['ClientUUId'].iloc[0]
    features['ClientUUId'] = ClientUUId

    # Количество заказов за последний месяц
    last_month_orders = df[df['Date'] >= current_date - pd.DateOffset(months=1)]
    features['num_orders_last_month'] = len(last_month_orders)

    # Средний интервал времени между заказами
    df = df.sort_values('Date')
    df['days_since_last_order'] = df['Date'].diff().dt.days
    features['mean_days_between_orders'] = df['days_since_last_order'].mean()

    # Время с момента последнего заказа
    last_order_date = df['Date'].max()
    features['days_since_last_order'] = (current_date - last_order_date).days

    # Время от первого до последнего заказа
    first_order_date = df['Date'].min()
    features['days_between_first_and_last_order'] = (last_order_date - first_order_date).days

    # Количество заказов в рабочие дни
    df['weekday'] = df['Date'].dt.weekday  # День недели (0 - понедельник, 6 - воскресенье)
    workdays_orders = df[df['weekday'].isin([0, 1, 2, 3, 4])]  # Рабочие дни (понедельник - пятница)
    features['num_orders_workdays'] = len(workdays_orders)

    # Время с момента последнего использования промокода
    if df['apply_promo'].sum() == 0:
        features['days_since_last_promo'] = 1000
    else:
        last_promo_date = df[df['apply_promo'] == 1]['Date'].max()
        features['days_since_last_promo'] = (current_date - last_promo_date).days

    # Промокоды за последний месяц
    last_month_promo_orders = last_month_orders[last_month_orders['apply_promo'] == 1]
    features['promo_used_last_month'] = int(len(last_month_promo_orders) > 0)

    return features

# Применение функции к каждому пользователю
user_groups = orders.groupby('ClientUUId')
features_list = [create_features_for_user(group, current_date) for name, group in user_groups]

# Создание DataFrame с признаками
features_df = pd.DataFrame(features_list)

features_df.to_csv("data/date_features.csv", index=False)

In [None]:
dates = pd.read_csv('data/date_features.csv')

In [1705]:
mobile = pd.get_dummies(mobile, columns=['EventName', 'Platform'], dtype=int, prefix='')

In [1706]:
# Группируем по пользователю и считаем сумму по каждому указанному столбцу
mobile = mobile.groupby('ClientUUId').agg(
    count=('VisitToken', 'count'),
    _add_to_cart_sum=('_add_to_cart', 'sum'),
    
    param1=('_apply_personal_offer', 'mean'),
    obm1=('_open_bonusaction', 'mean'),
    
    _apply_personal_offer_sum=('_apply_personal_offer', 'sum'),
    _close_app_sum=('_close_app', 'sum'),
    _create_order_sum=('_create_order', 'sum'),
    _open_app_sum=('_open_app', 'sum'),
    _open_bonusaction_sum=('_open_bonusaction', 'sum'),
    _open_product_card_sum=('_open_product_card', 'sum'),
    _remove_from_cart_sum=('_remove_from_cart', 'sum'),
    _screen_cart_sum=('_screen_cart', 'sum'),
    _screen_menu_sum=('_screen_menu', 'sum'),
    _screen_profile_sum=('_screen_profile', 'sum'),
    _android_sum=('_android', 'sum'),
    _ios_sum=('_ios', 'sum')
).reset_index()

In [1707]:
# колонки, которые ухудшают скор для некоторых моделей

xg_cols = []  
cat_cols = ['aaa', 'bbb']

In [1708]:
train['n_promos'] = train.groupby('ClientUUId')['LocalEndDate'].transform('count')
test['n_promos'] = train.groupby('ClientUUId')['LocalEndDate'].transform('count')

train['avg_OrderPrice'] = train.groupby('ClientUUId')['OrderPrice'].transform('mean')
train['min_OrderPrice'] = train.groupby('ClientUUId')['OrderPrice'].transform('min')
train['avg_disc'] = train.groupby('ClientUUId')['Discount'].transform('mean')
train['max_disc'] = train.groupby('ClientUUId')['Discount'].transform('max')


test['avg_OrderPrice'] = test.groupby('ClientUUId')['OrderPrice'].transform('mean')
test['min_OrderPrice'] = test.groupby('ClientUUId')['OrderPrice'].transform('min')
test['avg_disc'] = test.groupby('ClientUUId')['Discount'].transform('mean')
test['max_disc'] = test.groupby('ClientUUId')['Discount'].transform('max')


train['use_for_delivery'] = (train['OrderType'] == '1,2,3').astype(int)
train.drop(['OrderType'], axis=1, inplace=True)
test['use_for_delivery'] = (test['OrderType'] == '1,2,3').astype(int)
test.drop(['OrderType'], axis=1, inplace=True)


train['avg_del'] = train.groupby('ClientUUId')['use_for_delivery'].transform('mean')
train['max_del'] = train.groupby('ClientUUId')['use_for_delivery'].transform('max')
test['avg_del'] = test.groupby('ClientUUId')['use_for_delivery'].transform('mean')
test['max_del'] = test.groupby('ClientUUId')['use_for_delivery'].transform('max')


train['a'] = train.OrderPrice > train.min_OrderPrice
train['b'] = train.Discount < train.max_disc
train['c'] = train.use_for_delivery < train.max_del
test['a'] = test.OrderPrice > test.min_OrderPrice
test['b'] = test.Discount < test.max_disc
test['c'] = test.use_for_delivery < test.max_del

In [1709]:
orders[['price_with_discont', 'price']] = orders.groupby(['OrderUUId'])[['ProductTotalPrice', 'MenuPrice']].transform(sum)

In [1710]:
prices = orders.groupby(['ClientUUId'])['MenuPrice'].agg([
    'mean',
    'std'
]).reset_index()
cats = pd.get_dummies(orders[['ClientUUId', 'CategoryId']], columns=['CategoryId'], dtype=int).groupby(['ClientUUId']).mean()

In [1711]:
orders.OrderType = (orders.OrderType == 1).astype(int)
orders.OrderState = (orders.OrderState == 4).astype(int)

In [1712]:
orders['in_delivery'] = (orders.deliverySectorId == 0).astype(int)

In [1713]:
orders.drop(['ProductTotalPrice', 'MenuPrice', 'CategoryId', 'ProductUUId', 'addressId', 'deliverySectorId'], axis=1, inplace=True)

In [1714]:
orders.drop_duplicates(inplace=True)

In [1715]:
orders = pd.get_dummies(orders, columns=['OrderPaymentType'], prefix='pay')

orders['discont'] = orders.price - orders.price_with_discont

In [1716]:
orders = orders.groupby('ClientUUId').agg(
    avg_discont=('discont', 'mean'),
    sum_discont=('discont', 'sum'),
    aaa=('in_delivery', 'mean'),
    bbb=('in_delivery', 'sum'),
    apply_promo_mean1=('apply_promo', 'mean'),
    
    ClientOrderNumber_max=('ClientOrderNumber', 'max'),
    ClientOrderNumber_min=('ClientOrderNumber', 'min'),
    ClientOrderNumber_avg=('ClientOrderNumber', 'mean'),
    OrderState_mean=('OrderState', 'mean'),
    OrderType_mean=('OrderType', 'mean'),
    pay_0_count=('pay_0', 'sum'),
    pay_1_count=('pay_1', 'sum'),
    pay_2_count=('pay_2', 'sum'),
    OrderTotalPrice_mean=('OrderTotalPrice', 'mean'),
    OrderTotalPrice_std=('OrderTotalPrice', 'std'),
    OrderTotalPrice_sum=('OrderTotalPrice', 'sum'),
    price_mean=('price', 'mean'),
    price_sum=('price', 'sum'),
    apply_promo_mean=('apply_promo', 'mean'),
    apply_promo_sum=('apply_promo', 'sum'),
).reset_index()


In [1717]:
orders = pd.merge(pd.merge(orders, cats, on='ClientUUId'), prices, on='ClientUUId')

In [1718]:
train = train.rename(columns={'apply_promo': 'target'})
test = test.rename(columns={'apply_promo': 'target'})
test.head()

Unnamed: 0,ClientUUId,Id,LocalBeginDate,LocalEndDate,OrderPrice,Discount,n_promos,avg_OrderPrice,min_OrderPrice,avg_disc,max_disc,use_for_delivery,avg_del,max_del,a,b,c
0,000D3A20F23EA95811E7C0A95563344E,7,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,2,799.0,799,200.0,200,0,0.0,0,False,False,False
1,000D3A20F23EA95811E7C7892A0CE261,5,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,699,200,2,699.0,699,200.0,200,0,0.0,0,False,False,False
2,000D3A20F23EA95811E7CD686C396528,6,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,20,2,799.0,799,20.0,20,0,0.0,0,False,False,False
3,000D3A20F23EA95911E7CEA8C574EDAE,5,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,2,1024.0,799,200.0,200,0,0.5,1,False,False,True
4,000D3A20F23EA95911E7D4F05C59C978,7,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,2,799.0,799,200.0,200,0,0.0,0,False,False,False


In [1719]:
df = pd.merge(
    pd.merge(
        pd.merge(
            train, orders, on='ClientUUId', how='left'),
        mobile, on='ClientUUId', how='left'),
    dates.iloc[:, [0, 4, 5, 7]], on='ClientUUId', how='left')
df_test = pd.merge(
    pd.merge(
        pd.merge(
            test, orders, on='ClientUUId', how='left'), 
        mobile, on='ClientUUId', how='left'),
    dates.iloc[:, [0, 4, 5, 7]], on='ClientUUId', how='left')

In [1720]:
cat_cols.append('days_between_first_and_last_order')
cat_cols.append('num_orders_workdays')
xg_cols.append('promo_used_last_month')

In [1721]:
df = pd.get_dummies(df, columns=['Id'], dtype=int)
df_test = pd.get_dummies(df_test, columns=['Id'], dtype=int)

In [1722]:
df.drop(['LocalBeginDate', 'LocalEndDate', 'ClientUUId'], axis=1, inplace=True)
df_test.drop(['LocalBeginDate', 'LocalEndDate', 'ClientUUId'], axis=1, inplace=True)

In [1723]:
df = df.fillna(0)
df_test = df_test.fillna(0)

In [1724]:
X = df.drop(['target'], axis=1)
y = df['target']


In [1725]:
lin_cols = ['days_between_first_and_last_order',
                                         'num_orders_workdays',
                                         'promo_used_last_month']

In [1726]:
n_splits = 6
lin_models = []
test_scores = []
train_scores = []
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=52)

lin_X = X.drop(lin_cols, axis=1)
for train_index, test_index in kf.split(X=lin_X, y=y):
    X_train, X_test = lin_X.iloc[train_index], lin_X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    


    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Сделаем предсказания на тестовой выборке
    lin_test = lin_reg.predict(X_test)
    lin_train = lin_reg.predict(X_train)
    
    test_score = roc_auc_score(y_train, lin_train)
    train_score = roc_auc_score(y_test, lin_test)

    lin_models.append(lin_reg)
    test_scores.append(test_score)
    train_scores.append(train_score)


print("mean score --", np.mean(test_scores, dtype="float16"), np.std(test_scores).round(4))
print("mean score --", np.mean(train_scores, dtype="float16"), np.std(train_scores).round(4))
print("difference:", np.mean(train_scores, dtype="float16") - np.mean(test_scores, dtype="float16"))


ans = np.array([0]*df_test.shape[0])

    
for n, model in enumerate(lin_models):
    ans = ans + model.predict(df_test.drop(lin_cols, axis=1))
    
ans = pd.DataFrame(ans, columns = ['apply_promo'])
ans.to_csv('lin.csv', index=False)

mean score -- 0.7886 0.0028
mean score -- 0.779 0.0178
difference: -0.009766


In [1727]:
n_splits = 5
xg_models2 = []
test_scores = []
train_scores = []
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=52)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

xg_X = X.drop(xg_cols2, axis=1)

for train_index, test_index in kf.split(X=xg_X, y=y):
    X_train, X_test = xg_X.iloc[train_index], xg_X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.7, random_state=42)
    

    
    xgb = XGBClassifier(
        eval_metric="auc",
        n_estimators=100000,
        random_state=52, 
        verbosity=0, 
        scale_pos_weight=scale_pos_weight,
        # booster='dart',
        # eta = 0.04, 
        max_depth = 2, 
        gamma = 0.9,
        min_child_weight=13,
        # colsample_bylevel=0.5,
        # colsample_bytree=0.9,
        # colsample_bynode=0.35,
        subsample=0.9,
        reg_lambda=0.5,
        reg_alpha=0.5,
        early_stopping_rounds=40
    )
    xgb.fit(
        X_train, y_train, 
        eval_set=[[X_val, y_val]], 
        verbose=0
    ) 
    xg_models2.append([xgb, (0, xgb.best_iteration + 1)])

    xgb_test_preds = xgb.predict_proba(X_test,
                                  iteration_range=(0, xgb.best_iteration + 1))
    xgb_train_preds = xgb.predict_proba(X_train,
                                   iteration_range=(0, xgb.best_iteration + 1))
    
    test_score = roc_auc_score(y_test, xgb_test_preds[:, 1])
    train_score = roc_auc_score(y_train, xgb_train_preds[:, 1])
    test_scores.append(test_score)
    train_scores.append(train_score)

print("mean score --", np.mean(test_scores, dtype="float16"), np.std(test_scores).round(4))
print("mean score --", np.mean(train_scores, dtype="float16"), np.std(train_scores).round(4))
print("difference:", np.mean(train_scores, dtype="float16") - np.mean(test_scores, dtype="float16"))


ans = np.array([0]*df_test.shape[0])

for n, model in enumerate(xg_models2):
    ans = ans + model[0].predict_proba(df_test.drop(xg_cols2, axis=1), 
                                       iteration_range=model[1])[:, 1]
    
ans = pd.DataFrame(ans, columns = ['apply_promo'])
ans.to_csv('best.csv', index=False)

mean score -- 0.8594 0.0225
mean score -- 0.9307 0.0676
difference: 0.0713


In [1728]:
n_splits = 5
xg_models = []
test_scores = []
train_scores = []
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=52)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

xg_X = X.drop(xg_cols, axis=1)

for train_index, test_index in kf.split(X=xg_X, y=y):
    X_train, X_test = xg_X.iloc[train_index], xg_X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.7, random_state=42)
    

    
    xgb = XGBClassifier(
        eval_metric="auc",
        n_estimators=100000,
        random_state=52, 
        verbosity=0, 
        scale_pos_weight=scale_pos_weight,
        # booster='dart',
        # eta = 0.04, 
        max_depth = 3, 
        gamma = 0.9,
        min_child_weight=13,
        # colsample_bylevel=0.5,
        colsample_bytree=0.5,
        # colsample_bynode=0.25,
        subsample=0.9,
        reg_lambda=0.5,  #<-------------------------------------
        reg_alpha=0.5,
        early_stopping_rounds=40
    )
    xgb.fit(
        X_train, y_train, 
        eval_set=[[X_val, y_val]], 
        verbose=0
    ) 
    xg_models.append([xgb, (0, xgb.best_iteration + 1)])

    xgb_test_preds = xgb.predict_proba(X_test,
                                  iteration_range=(0, xgb.best_iteration + 1))
    xgb_train_preds = xgb.predict_proba(X_train,
                                   iteration_range=(0, xgb.best_iteration + 1))
    
    test_score = roc_auc_score(y_test, xgb_test_preds[:, 1])
    train_score = roc_auc_score(y_train, xgb_train_preds[:, 1])
    test_scores.append(test_score)
    train_scores.append(train_score)

print("mean score --", np.mean(test_scores, dtype="float16"), np.std(test_scores).round(4))
print("mean score --", np.mean(train_scores, dtype="float16"), np.std(train_scores).round(4))
print("difference:", np.mean(train_scores, dtype="float16") - np.mean(test_scores, dtype="float16"))


ans = np.array([0]*df_test.shape[0])

for n, model in enumerate(xg_models):
    ans = ans + model[0].predict_proba(df_test.drop(xg_cols, axis=1), 
                                       iteration_range=model[1])[:, 1]
    
ans = pd.DataFrame(ans, columns = ['apply_promo'])
ans.to_csv('xg.csv', index=False)

mean score -- 0.8813 0.0168
mean score -- 0.991 0.0062
difference: 0.10986


In [1729]:
n_splits = 5
cat_models = []
test_scores = []
train_scores = []
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=52)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

cat_X = X.drop(cat_cols, axis=1)


for train_index, test_index in kf.split(X=cat_X, y=y):

    
    X_train, X_test = cat_X.iloc[train_index], cat_X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.7, random_state=42)
    

    
    
    clf = CatBoostClassifier(iterations=2000,
                             thread_count=-1,
                             random_seed=42,
                             eta = 0.3,
                             eval_metric='AUC',
                             l2_leaf_reg=5,
                             min_data_in_leaf=15,
                             colsample_bylevel=0.5,
                             max_bin=20,
                             scale_pos_weight=scale_pos_weight,
                             subsample =0.8,
                             depth = 2
                            )

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        use_best_model=True,
        early_stopping_rounds=50,
    )

    cat_models.append(clf)
    clf_test_preds = clf.predict_proba(X_test)
    clf_train_preds = clf.predict_proba(X_train)
    
    test_score = roc_auc_score(y_test, [i[1] for i in clf_test_preds])
    train_score = roc_auc_score(y_train, [i[1] for i in clf_train_preds])
    test_scores.append(test_score)
    train_scores.append(train_score)
    # print('test score', test_score)
    # print('train score', train_score)
    # print()

print("mean score --", np.mean(test_scores, dtype="float16"), np.std(test_scores).round(4))
print("mean score --", np.mean(train_scores, dtype="float16"), np.std(train_scores).round(4))
print("difference:", np.mean(train_scores, dtype="float16") - np.mean(test_scores, dtype="float16"))


ans = np.array([0]*df_test.shape[0])


for n, model in enumerate(cat_models):
    ans = ans + model.predict_proba(df_test.drop(cat_cols, axis=1))[:, 1]
    
ans = pd.DataFrame(ans, columns = ['apply_promo'])
ans.to_csv('cat.csv', index=False)

mean score -- 0.8506 0.0103
mean score -- 0.947 0.0403
difference: 0.0962


In [1730]:
a = pd.read_csv('cat.csv')
c = pd.read_csv('xg.csv')
d = pd.read_csv('best.csv')
b = pd.read_csv('lin.csv')


ans = 1.2 * (a/a.mean()) + 1 * (b/b.mean()) + (c/c.mean()) + 1.8 * (d/d.mean())

ans.to_csv('new.csv', index=False)