# Курсовая работа. Мегафон.

Необходимо построить алгоритм, который для каждой пары пользователь-услуга определит вероятность
подключения услуги.

- id - идентификатор абонента
- vas_id - подключаемая услуга
- buy_time - время покупки, представлено в формате timestamp, для работы с этим столбцом понадобится функция datetime.fromtimestamp из модуля datetime
- target - целевая переменная, где 1 означает подключение услуги, 0 - абонент не подключил услугу соответственно. 

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from imblearn.pipeline import Pipeline as SM_Pipeline
from imblearn.pipeline import make_pipeline as SM_make_pipeline
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report, f1_score, confusion_matrix, make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier, Pool

RANDOM_STATE = 21

In [2]:
def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')    
    return df

In [3]:
def balance_df_by_target(df, target_name, method='over'):
    assert method in ['over', 'under', 'tomek', 'smote'], 'Неверный метод сэмплирования'
    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1
    if method == 'over':
        for i in range(disbalance_coeff):
            sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
            df = df.append(sample, ignore_index=True)
            
    elif method == 'under':
        df_ = df.copy()
        df = df_[df_[target_name] == minor_class_name]
        tmp = df_[df_[target_name] == major_class_name]
        df = df.append(tmp.iloc[
            np.random.randint(0, tmp.shape[0], target_counts[minor_class_name])
        ], ignore_index=True)

    elif method == 'tomek':
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        X_tomek, y_tomek = tl.fit_sample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_tomek, y_tomek], axis=1)
    
    elif method == 'smote':
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(sampling_strategy=0.4)
        X_smote, y_smote = smote.fit_resample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_smote, y_smote], axis=1)

    return df.sample(frac=1)

In [4]:
def get_features(X: pd.DataFrame):
    assert isinstance(X, pd.DataFrame)
    
    X_nunique = X.apply(lambda x: x.nunique(dropna=False))

    f_init = set(X_nunique.index.tolist())
    f_const = set(X_nunique[X_nunique == 1].index.tolist())
    f_numeric = (X.fillna(0).astype(int).sum() - X.fillna(0).sum()).abs()
    f_numeric = set(f_numeric[f_numeric > 0].index.tolist())
    f_other = f_init - (f_numeric | f_const)
    f_binary = set(X.loc[:, f_other].columns[(
        (X.loc[:, f_other].max() == 1) & \
        (X.loc[:, f_other].min() == 0) & \
        (X.loc[:, f_other].isnull().sum() == 0)
    )])
    f_other = f_other - f_binary
    f_categorical = set(X_nunique.loc[f_other][X_nunique.loc[f_other] <= 10].index.tolist())
    f_other = f_other - f_categorical
    f_numeric = f_numeric | f_other
    f_all = f_binary | f_categorical | f_numeric

    print('f_init:', len(f_init))
    print('f_const:', len(f_const))
    print('f_binary:', len(f_binary))
    print('f_categorical:', len(f_categorical))
    print('f_numeric:', len(f_numeric))
    
    assert(len(f_init) == len(f_const) + len(f_binary) + len(f_numeric) + len(f_categorical))

    return list(f_binary), list(f_categorical), list(f_numeric), list(f_all)

In [5]:
def run_grid_search(estimator, X, y, params_grid, cv, scoring='roc_auc'):
    gsc = GridSearchCV(estimator, params_grid, scoring=scoring, cv=cv, n_jobs=-1)

    gsc.fit(X, y)
    print("Best %s score: %.2f" % (scoring, gsc.best_score_))
    print()
    print("Best parameters set found on development set:")
    print()
    print(gsc.best_params_)
    print()
    print("Grid scores on development set:")
    print()

    for i, params in enumerate(gsc.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (gsc.cv_results_['mean_test_score'][i], gsc.cv_results_['std_test_score'][i] * 2, params))

    print()
    
    return gsc

## 1. Загрузка данных и снижение занимаемой памяти.

In [6]:
df_train = pd.read_csv('data_train.csv', index_col='Unnamed: 0')
df_train = reduce_mem_usage(df_train)
df_train.head()

Unnamed: 0,id,vas_id,buy_time,target
0,540968,8.0,1537131600,0.0
1,1454121,4.0,1531688400,0.0
2,2458816,1.0,1534107600,0.0
3,3535012,5.0,1535922000,0.0
4,1693214,1.0,1535922000,0.0


In [7]:
df_test = pd.read_csv('data_test.csv', index_col='Unnamed: 0')
df_test = reduce_mem_usage(df_test)
df_test.head()

Unnamed: 0,id,vas_id,buy_time
0,3130519,2.0,1548018000
1,2000860,4.0,1548018000
2,1099444,2.0,1546808400
3,1343255,5.0,1547413200
4,1277040,2.0,1546808400


In [8]:
df_train.shape, df_test.shape

((831653, 4), (71231, 3))

In [9]:
# df_big = pd.read_csv('features.csv', chunksize=100000, iterator=True, sep='\t')
# df_big

In [10]:
# df_list = []
# all_users = pd.concat([df_train, df_test]).id.unique()

# for dfb in df_big:
#     merge = dfb['id'].isin(all_users)
#     dfb_filter = dfb[merge]
#     df_list.append(dfb_filter)

In [11]:
# df_features = df_list[0]
# for number in range(1, len(df_list)):
#     df_features = pd.concat([df_features, df_list[number]], axis=0)

In [12]:
# df_features.shape

In [13]:
# df_features.to_csv('short_features.csv')
df_features = pd.read_csv('short_features.csv')

In [14]:
df_features.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

In [15]:
df_features

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
0,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,...,-977.373846,-613.770792,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0
1,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,...,-977.373846,-613.770792,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
2,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,...,-925.373846,-561.770792,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0
3,2071522,1544994000,-94.939971,-363.699112,-108.880786,-411.226798,-114.298246,-432.33179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
4,2075318,1533502800,-75.639971,669.690888,-89.580786,732.343202,-94.998246,736.65821,-16.08618,782.383903,...,-501.373846,-242.770792,-25.996269,-37.630448,-167.747724,-14.832889,2.305572,-4.175933,-0.45614,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902384,3513869,1548018000,82.370029,-123.429112,155.939214,-88.526798,150.521754,-109.63179,-16.08618,-65.076097,...,-928.373846,-570.770792,-23.996269,-37.630448,-271.747724,-22.832889,-0.694428,-12.175933,-0.45614,0.0
902385,3516552,1547413200,-96.799971,-116.519112,-110.740786,-169.126798,-116.158246,-190.23179,-16.08618,226.583903,...,-975.373846,-613.770792,-25.996269,-37.630448,-5.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
902386,3517434,1548018000,-96.799971,-284.349112,-100.740786,-274.796798,-106.158246,-295.90179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
902387,3519714,1546808400,167.280029,110.140888,153.339214,57.533202,147.921754,36.42821,-2.00618,-50.996097,...,-977.373846,-613.770792,-14.996269,7.369552,-180.747724,-19.832889,0.305572,-12.175933,-0.45614,0.0


In [16]:
df_features = reduce_mem_usage(df_features)

## 2. Соединение датасетов, генерация фичей

In [17]:
df_train['buy_time'] = df_train['buy_time'].apply(lambda x: datetime.fromtimestamp(x))
df_test['buy_time'] = df_test['buy_time'].apply(lambda x: datetime.fromtimestamp(x))
df_features['buy_time'] = df_features['buy_time'].apply(lambda x: datetime.fromtimestamp(x))

In [18]:
df_train = df_train.sort_values(by='buy_time')
df_test = df_test.sort_values(by='buy_time')
df_features = df_features.sort_values(by='buy_time')

In [19]:
# df_train.loc[df_train.id == 2101246]

In [20]:
# df_features.loc[df_features.id == 2101246]

In [21]:
# копируем признак - так как он затрется при merge_asof
df_features['buy_time_copy'] = df_features['buy_time']

In [22]:
train = pd.merge_asof(df_train, df_features, on="buy_time", by="id", direction='nearest')

In [23]:
test = pd.merge_asof(df_test, df_features, on="buy_time", by="id", direction='nearest')

In [24]:
del df_features, df_train, df_test

In [25]:
# добавим разницу в днях между предложением и покупкой (или непокупкой) как признак
train['diff'] = (train['buy_time'] - train['buy_time_copy']).dt.days
test['diff'] = (test['buy_time'] - test['buy_time_copy']).dt.days

In [26]:
# добавим день, месяц, год совершения покупки (или непокупки) как признак
train['year'] = train['buy_time'].map(lambda x: x.year)
train['month'] = train['buy_time'].map(lambda x: x.month)
train['day'] = train['buy_time'].map(lambda x: x.day)

In [27]:
test['year'] = test['buy_time'].map(lambda x: x.year)
test['month'] = test['buy_time'].map(lambda x: x.month)
test['day'] = test['buy_time'].map(lambda x: x.day)

In [28]:
# удалим колонки в формате datetime
train.drop(['buy_time_copy', 'buy_time'], axis=1, inplace=True)
test.drop(['buy_time_copy', 'buy_time'], axis=1, inplace=True)

In [29]:
train['vas_id'] = train['vas_id'].astype('int')
test['vas_id'] = test['vas_id'].astype('int')

In [30]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [31]:
# Сохранение датасетов
train.to_csv('train_processed.csv', index=False)
test.to_csv('test_processed.csv', index=False)

## 3. Оценка важности признков, baseline

In [32]:
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

In [33]:
X = train.drop(columns='target')
y = train['target']

In [34]:
# Разобьем датасет на тренировочную и валидационную выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, stratify=y, random_state=RANDOM_STATE)

In [35]:
# посмотрим дисбаланс классов
y_train.value_counts()[1] / y_train.value_counts()[0]

0.07801461778762914

In [36]:
y_test.value_counts()[1] / y_test.value_counts()[0]

0.07801590044936052

In [37]:
# Делаем балансировку датасета по методу smote
TARGET_NAME = 'target'
train_balanced = balance_df_by_target(train, TARGET_NAME, method='smote')

In [38]:
X_bal = train_balanced.drop(columns='target')
y_bal = train_balanced['target']

In [39]:
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_bal, y_bal, shuffle=True, test_size=0.3, stratify=y_bal, random_state=RANDOM_STATE)

In [40]:
y_train_bal.value_counts()[1] / y_train_bal.value_counts()[0]

0.3999985185925889

In [41]:
X_train_pca = pd.DataFrame(MinMaxScaler().fit_transform(X_train_bal))

In [42]:
break_even = [0.8, 0.9, 0.95, 0.975, 0.99, 0.999]

pca = PCA(n_components=45)
X_train_pca = pca.fit_transform(X_train_pca)
var_vector = np.cumsum(pca.explained_variance_ratio_)

for level in break_even:
    temp_list = []
    for n in var_vector:
        if n<=level: temp_list.append(n)
    print(f'required number of components for {level*100}% of total VAR is {len(temp_list)}')

required number of components for 80.0% of total VAR is 15
required number of components for 90.0% of total VAR is 22
required number of components for 95.0% of total VAR is 28
required number of components for 97.5% of total VAR is 34
required number of components for 99.0% of total VAR is 41
required number of components for 99.9% of total VAR is 45


In [43]:
f_binary, f_categorical, f_numeric, f_all = get_features(X_train)

f_init: 259
f_const: 6
f_binary: 0
f_categorical: 2
f_numeric: 251


In [44]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame does not contain the following columns: %s" % cols_error)

- Базовая модель - Логистическая Регрессия.
- солвер SAGA, оптимизированный для больших датасетов
- в качестве метода масштабирования - MinMaxScaler
- метрика f1_score(macro)
- балансировка классов SMOTE библиотеки imblearn (oversampling)

In [45]:
base_pipeline = SM_make_pipeline(
    FeatureSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            FeatureSelector(columns=f_numeric),
            StandardScaler(),
        )),
        ("categorical_features", make_pipeline(
            FeatureSelector(columns=f_categorical),
            OneHotEncoder(handle_unknown='ignore')
        ))
    ]),
    SMOTE(sampling_strategy=0.4, n_jobs=-1),
    LogisticRegression(random_state=RANDOM_STATE, solver='liblinear'),
    verbose=True
)

In [46]:
base_pipeline.fit(X_train, y_train)

[Pipeline] ... (step 1 of 4) Processing featureselector, total=   0.1s
[Pipeline] ...... (step 2 of 4) Processing featureunion, total=   9.5s
[Pipeline] ............. (step 3 of 4) Processing smote, total=11.5min
[Pipeline]  (step 4 of 4) Processing logisticregression, total= 9.6min


Pipeline(steps=[('featureselector',
                 FeatureSelector(columns=['25', '72', '100', '117', '196',
                                          '195', 'diff', '215', '181', '227',
                                          '59', '20', '46', '234', '14', '175',
                                          '178', '53', '240', '74', '176',
                                          '209', '110', '93', '131', '18', '61',
                                          '242', '127', '96', ...])),
                ('featureunion',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('featureselector',
                                                                  FeatureSelec...
                                                                                           '96', ...])),
                                                                 ('standardscaler',
                                               

In [47]:
predict = base_pipeline.predict(X_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         0.0       0.97      0.91      0.94    231440
         1.0       0.38      0.70      0.49     18056

    accuracy                           0.90    249496
   macro avg       0.68      0.80      0.72    249496
weighted avg       0.93      0.90      0.91    249496



## 4. Обучение других моделей

Если добавляю снижение размерности - то вместе со временем просчета модели очень резко снижается macro avg.

In [48]:
pipe_lr_svd_l2 = SM_make_pipeline(
    FeatureSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            FeatureSelector(columns=f_numeric),
            StandardScaler(),
        )),
        ("categorical_features", make_pipeline(
            FeatureSelector(columns=f_categorical),
            OneHotEncoder(handle_unknown='ignore')
        ))
    ]),
    TruncatedSVD(n_components=45),
    SMOTE(sampling_strategy=0.4, n_jobs=-1),
    LogisticRegression(penalty='l2', solver='saga', random_state=RANDOM_STATE, 
                                  class_weight='balanced', n_jobs=-1),
    verbose=True
)

In [49]:
pipe_lr_svd_l2.fit(X_train, y_train)
predict_pca = pipe_lr_svd_l2.predict(X_test)
print(classification_report(y_test, predict_pca))

[Pipeline] ... (step 1 of 5) Processing featureselector, total=   0.1s
[Pipeline] ...... (step 2 of 5) Processing featureunion, total=   9.5s
[Pipeline] ...... (step 3 of 5) Processing truncatedsvd, total=  37.9s
[Pipeline] ............. (step 4 of 5) Processing smote, total=  25.7s




[Pipeline]  (step 5 of 5) Processing logisticregression, total=  48.5s
              precision    recall  f1-score   support

         0.0       0.94      0.54      0.68    231440
         1.0       0.08      0.53      0.14     18056

    accuracy                           0.54    249496
   macro avg       0.51      0.53      0.41    249496
weighted avg       0.87      0.54      0.64    249496



#### GradientBoostingClassifier

In [50]:
GB_pipeline = make_pipeline(
    FeatureSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            FeatureSelector(columns=f_numeric),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            FeatureSelector(columns=f_categorical),
            OneHotEncoder(handle_unknown='ignore')
        ))
    ]),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    verbose=True
)

In [51]:
%time
GB_pipeline.fit(X_train, y_train)

Wall time: 0 ns
[Pipeline] ... (step 1 of 3) Processing featureselector, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing featureunion, total=   9.5s
[Pipeline]  (step 3 of 3) Processing gradientboostingclassifier, total=20.2min


Pipeline(steps=[('featureselector',
                 FeatureSelector(columns=['25', '72', '100', '117', '196',
                                          '195', 'diff', '215', '181', '227',
                                          '59', '20', '46', '234', '14', '175',
                                          '178', '53', '240', '74', '176',
                                          '209', '110', '93', '131', '18', '61',
                                          '242', '127', '96', ...])),
                ('featureunion',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('featureselector',
                                                                  FeatureSelec...
                                                                                           '209',
                                                                                           '110',
                                        

In [52]:
predict = GB_pipeline.predict(X_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97    231440
         1.0       0.69      0.25      0.37     18056

    accuracy                           0.94    249496
   macro avg       0.82      0.62      0.67    249496
weighted avg       0.93      0.94      0.92    249496



#### CatBoostClassifier

In [53]:
train_data = Pool(data=X_train,
                   label=y_train)

eval_dataset = Pool(X_test,
                     label=y_test)

cb_model = CatBoostClassifier(iterations=2500,
                           depth=10,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           eval_metric='AUC',
                           early_stopping_rounds=50,
                           grow_policy='Depthwise',
                           random_state=42,
                           thread_count=4,
                           use_best_model=True,
                           verbose=True)

In [54]:
cb_model.fit(train_data, eval_set=eval_dataset, logging_level='Silent')

<catboost.core.CatBoostClassifier at 0x1a1e77552e0>

In [55]:
preds_class = cb_model.predict(X_test)
print(classification_report(y_test, preds_class))

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97    231440
         1.0       0.66      0.28      0.40     18056

    accuracy                           0.94    249496
   macro avg       0.80      0.64      0.68    249496
weighted avg       0.93      0.94      0.93    249496



### Проведем подбор параметров для CatBoost, используем кросс-валидцию

In [56]:
kfold_cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [57]:
model_catb = CatBoostClassifier(cat_features=['id', 'vas_id'],
                                      silent=True, random_state=42)

In [58]:
param_grid = {
    "learning_rate": [0.1],
    "depth": [3],
    "iterations": [300],
    'l2_leaf_reg': [5, 15, 25]
}

In [59]:
# %%time
# catb_gsc = run_grid_search(model_catb, X_train, y_train, param_grid, kfold_cv)

In [60]:
model_catb = CatBoostClassifier(n_estimators=200,
                                 max_depth=3, 
                                 l2_leaf_reg=15,
                                 learning_rate=0.1,
                                 cat_features=['id', 'vas_id'], 
                                 silent=True, 
                                 random_state=RANDOM_STATE)

In [61]:
%%time
model_catb.fit(X_train, y_train)

Wall time: 32.9 s


<catboost.core.CatBoostClassifier at 0x1a1e7010520>

In [62]:
%%time
catb_pred = model_catb.predict(X_test)

Wall time: 262 ms


In [63]:
print(classification_report(y_test, catb_pred))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97    231440
         1.0       0.92      0.32      0.48     18056

    accuracy                           0.95    249496
   macro avg       0.93      0.66      0.73    249496
weighted avg       0.95      0.95      0.94    249496



### 5. Сохранение финальной модели.

In [64]:
with open('model.pickle', 'wb') as f:
    pickle.dump(model_catb, f, protocol=pickle.HIGHEST_PROTOCOL)

### 6. Предсказание на test выборке

In [65]:
with open('model.pickle', 'rb') as f:
    model_catb =  pickle.load(f)

In [66]:
# подготовленный тестовый датасет взят из пункта 2
test_preds = model_catb.predict(test)

In [67]:
df_test = pd.read_csv('data_test.csv', index_col='Unnamed: 0')
df_test = reduce_mem_usage(df_test)
df_test.head()

Unnamed: 0,id,vas_id,buy_time
0,3130519,2.0,1548018000
1,2000860,4.0,1548018000
2,1099444,2.0,1546808400
3,1343255,5.0,1547413200
4,1277040,2.0,1546808400


In [68]:
df_test['target'] = test_preds
df_test.head()

Unnamed: 0,id,vas_id,buy_time,target
0,3130519,2.0,1548018000,0.0
1,2000860,4.0,1548018000,0.0
2,1099444,2.0,1546808400,1.0
3,1343255,5.0,1547413200,0.0
4,1277040,2.0,1546808400,0.0


In [69]:
df_test.to_csv('answers_test.csv', index=False)