# Базовое решение для задачи B

Необходимо предсказать объем трат по каждой из 184 категорий на каждый день следующего месяца. Итоговый файл должен содержать предсказания по 184 * 30 = 5520 объектам. Объем трат в конкретной категории считается как сумма всех расходных транзакций в текущей категории по всем пользователям.

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

Читаем входные файлы с данными

In [2]:
# Load the raw transaction log; later cells read its `amount`, `tr_datetime` and `mcc_code` columns.
transactions = pd.read_csv('transactions.csv')

Берём расходные транзакции и формируем тестовую выборку

In [5]:
# Expense transactions are the rows with a negative amount; copy them so new columns can be added safely.
is_expense = transactions.amount < 0
train_transactions = transactions[is_expense].copy()

# The first whitespace-separated token of tr_datetime is the integer day number.
day_token = train_transactions.tr_datetime.str.split().str[0]
train_transactions['day'] = day_token.astype(int)

In [6]:
# Test grid: one row per (mcc_code, day) pair for the 30 days that follow the last training day.
test_codes = train_transactions.mcc_code.unique()
future_days = np.arange(1, 31) + train_transactions.day.max()

# mcc_code varies slowest, day fastest: every code is paired with each future day in turn.
test_transactions = pd.DataFrame(
    {
        'mcc_code': np.repeat(test_codes, len(future_days)),
        'day': np.tile(future_days, len(test_codes)),
    },
    columns=['mcc_code', 'day'],
)

In [7]:
# Training grid: one row per (mcc_code, day) pair over every code and every day seen in the expenses.
grid_codes = train_transactions.mcc_code.unique()
grid_days = train_transactions.day.unique()

# mcc_code varies slowest, day fastest: every code is paired with each observed day in turn.
train_grid = pd.DataFrame(
    {
        'mcc_code': np.repeat(grid_codes, len(grid_days)),
        'day': np.tile(grid_days, len(grid_codes)),
    },
    columns=['mcc_code', 'day'],
)

Добавляем признаки, относящиеся к дате.

In [8]:
# Derive calendar-like features from the integer day number, in place, for all three tables.
# A "week" is 7 days and a "month" is 30 days; *_num is the period index, *_day the offset inside it.
for tr_table in (train_transactions, test_transactions, train_grid):
    day_number = tr_table['day']
    for prefix, period in (('week', 7), ('month', 30)):
        tr_table[prefix + '_num'] = day_number // period
        tr_table[prefix + '_day'] = day_number % period

In [13]:
# `tr_table` is the variable left over from the date-feature loop above, so it refers to train_grid here.
tr_table.head()

Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day
0,4814,0,0,0,0,0
1,4814,1,0,1,0,1
2,4814,2,0,2,0,2
3,4814,3,0,3,0,3
4,4814,4,0,4,0,4


In [9]:
# Total expense amount per (day, mcc_code); the date features are grouped along so they survive the aggregation.
group_keys = ['day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code']
daily_totals = train_transactions.groupby(group_keys)[['amount']].sum().reset_index()

# Left-join onto the full grid so that (mcc_code, day) pairs without any transaction are kept;
# their missing amount is replaced by 0.
train_transactions = train_grid.merge(daily_totals, how='left').fillna(0)

Для каждого mcc_code получим данные о сумме транзакций за каждый день. Используем для обучения только временные признаки.

In [10]:
# Lag features. For every (mcc_code, month_num, month_day) row, the column
# amount_day_{day_shift}_{month_shift} holds log(1 + spend) of the same mcc_code taken
# `month_shift` 30-day months earlier, at month_day - day_shift. Rows with no matching
# source row (including shifted month_day values that fall outside 0..29) get 0.
shift_keys = ['month_num', 'month_day', 'mcc_code']

# Loop invariants, computed once: the join keys and the log-volume of every training row.
# The merges below keep the row order and never touch these columns, so taking them up
# front avoids copying the ever-growing feature table on every iteration.
shift_base = train_transactions[shift_keys].copy()
log_volume = np.log(-train_transactions['amount'] + 1)

# unique() returns values in order of appearance, so sort before dropping the first
# (smallest) month: this guarantees the zero shift is the one removed whenever the data
# starts in month 0. A zero shift with day_shift == 0 would copy the target into the features.
month_shifts = np.sort(train_transactions.month_num.unique())[1:]

for day_shift in [-1, 0, 1]:
    for month_shift in month_shifts:
        feature_name = 'amount_day_{}_{}'.format(day_shift, month_shift)

        # Move each source row to the (month, day-of-month) position it should describe.
        train_shift = shift_base.copy()
        train_shift['month_num'] += month_shift
        train_shift['month_day'] += day_shift
        train_shift[feature_name] = log_volume

        train_transactions = pd.merge(train_transactions, train_shift,
                                      on=shift_keys, how='left').fillna(0)
        test_transactions = pd.merge(test_transactions, train_shift,
                                     on=shift_keys, how='left').fillna(0)

In [14]:
# Preview the first rows of the training table, now including the amount_day_* lag columns.
train_transactions.head()

Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,amount,amount_day_-1_1,amount_day_-1_2,amount_day_-1_3,...,amount_day_1_6,amount_day_1_7,amount_day_1_8,amount_day_1_9,amount_day_1_10,amount_day_1_11,amount_day_1_12,amount_day_1_13,amount_day_1_14,amount_day_1_15
0,4814,0,0,0,0,0,-11098744.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,0,1,0,1,-7881825.53,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,2,0,2,-6777480.45,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,3,0,3,-9277943.73,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,4,0,4,-9999757.21,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Обучаемся на всех данных

In [11]:
# Offset added to the (positive) spend before taking the log; the prediction cell subtracts it again.
shift = 500

# One-hot encode mcc_code in both tables.
train = pd.get_dummies(train_transactions, columns=['mcc_code'])
test = pd.get_dummies(test_transactions, columns=['mcc_code'])

# Every column except the target is used as a feature.
c = train.columns.difference(['amount'])

# Amounts are negative for expenses, so shift - amount equals spend + shift.
log_target = np.log(shift - train['amount'])

clf = LinearRegression()
clf.fit(train[c], log_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Предсказываем объём трат для тестовых данных и создаём файл с ответом

In [12]:
# Predict the log-scale target for the test grid and undo the log/shift transform used in training.
# NOTE(review): the volume is negative whenever the predicted log value is below log(shift) —
# confirm whether the submission should be clipped at 0.
log_prediction = clf.predict(test[c])
test_transactions['volume'] = np.e ** log_prediction - shift

# Write the submission: one row per (mcc_code, day) with the predicted volume.
submission = test_transactions[['mcc_code', 'day', 'volume']]
submission.to_csv('baseline_b.csv', index=False)