In [59]:
import pandas as pd
import numpy as np
import scipy.sparse as spsp
import scipy.sparse.linalg as spsplin

## Загрузить транзакции

In [36]:
transactions_file = "hackathon_data/avk_hackathon_data_transactions.csv"
useful_cols = ['party_rk', 'transaction_dttm', 'transaction_amt_rur', 'merchant_group_rk', 'category']
transactions = pd.read_csv(transactions_file, usecols=useful_cols, parse_dates=['transaction_dttm'])
print(transactions.shape)
transactions.head()

(11987617, 5)


Unnamed: 0,party_rk,transaction_dttm,transaction_amt_rur,merchant_group_rk,category
0,20337,2019-01-01,84.0,,Сувениры
1,63404,2019-01-01,410.0,725.0,Фаст Фуд
2,24789,2019-01-01,701.44,,Супермаркеты
3,57970,2019-01-01,6203.7,454.0,Дом/Ремонт
4,12232,2019-01-01,734.53,878.0,Супермаркеты


## Загрузить данные о возрасте

In [37]:
socdem_file = "hackathon_data/avk_hackathon_data_party_x_socdem.csv"
useful_cols = ['party_rk', 'age']
socdem = pd.read_csv(socdem_file, usecols=useful_cols)
print(socdem.shape)
socdem.head()

(50000, 2)


Unnamed: 0,party_rk,age
0,61243,70.0
1,66535,25.0
2,83721,55.0
3,88238,35.0
4,57179,30.0


## Объединить таблицы и выбрать людей в возрасте 60+

In [42]:
df = pd.merge(transactions, socdem, on='party_rk').dropna(subset=['merchant_group_rk'])
df.drop(df[df.age < 60].index, inplace=True)
df.drop(columns='age', inplace=True)
print(df.shape)
df.head()

(344202, 5)


Unnamed: 0,party_rk,transaction_dttm,transaction_amt_rur,merchant_group_rk,category
19767,29815,2019-01-01,757.03,341.0,Супермаркеты
19775,29815,2019-01-01,314.0,341.0,Супермаркеты
19779,29815,2019-01-10,1470.61,341.0,Супермаркеты
19784,29815,2019-01-15,53.9,878.0,Супермаркеты
19792,29815,2019-01-17,251.0,589.0,Связь/Телеком


## Просуммировать затраты за месяц для каждой сети магазинов

In [46]:
monthly = pd.Grouper(key='transaction_dttm',freq='M')
df_monthly = df.groupby([monthly, 'party_rk', 'merchant_group_rk', 'category']).agg({'transaction_amt_rur': 'sum'})
print(df_monthly.shape)
df_monthly.head()

(133367, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,transaction_amt_rur
transaction_dttm,party_rk,merchant_group_rk,category,Unnamed: 4_level_1
2019-01-31,8,205.0,Красота,3935.0
2019-01-31,30,243.0,Финансовые услуги,5100.0
2019-01-31,30,999.0,Супермаркеты,520.25
2019-01-31,30,1211.0,Аптеки,1257.0
2019-01-31,30,2259.0,Финансовые услуги,11.0


## Найти средние месячные затраты для каждой сети магазинов

In [56]:
df_monthly = df_monthly.groupby(['party_rk', 'merchant_group_rk', 'category']).agg({'transaction_amt_rur': 'mean'})
df_monthly.reset_index(inplace=True)
print(df_monthly.shape)
df_monthly.head()

(53577, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,transaction_amt_rur
party_rk,merchant_group_rk,category,Unnamed: 3_level_1
8,100.0,Разные товары,311.0
8,113.0,Дом/Ремонт,69.0
8,205.0,Красота,3935.0
8,341.0,Супермаркеты,466.0
8,610.0,Супермаркеты,324.0


In [None]:
data_matrix = spsp.csr_matrix((df_monthly.transaction_amt_rur.values, 
                              (df_monthly.party_rk.values, ratings.movieid.values)))

In [64]:
df_monthly.party_rk.values.min()

8

In [65]:

df_monthly.head()

Unnamed: 0,party_rk,merchant_group_rk,category,transaction_amt_rur
0,8,100.0,Разные товары,311.0
1,8,113.0,Дом/Ремонт,69.0
2,8,205.0,Красота,3935.0
3,8,341.0,Супермаркеты,466.0
4,8,610.0,Супермаркеты,324.0


In [53]:
df1 = pd.DataFrame({'party_rk':         [1,            1,            2,            2],
                    'transaction_dttm': ['2019-01-08', '2019-01-09', '2019-01-08', '2019-02-08'],
                    'transaction_amt_rur': [100.,      250.,         300.,         600.],
                    'merchant_group_rk': [1,           1,            1,            1]})
df1.transaction_dttm = pd.to_datetime(df1.transaction_dttm)
df1.head()

Unnamed: 0,party_rk,transaction_dttm,transaction_amt_rur,merchant_group_rk
0,1,2019-01-08,100.0,1
1,1,2019-01-09,250.0,1
2,2,2019-01-08,300.0,1
3,2,2019-02-08,600.0,1


In [54]:
monthly = pd.Grouper(key='transaction_dttm',freq='M')
df1_monthly = df1.groupby([monthly, 'party_rk', 'merchant_group_rk']).agg({'transaction_amt_rur': 'sum'})
df1_monthly.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,transaction_amt_rur
transaction_dttm,party_rk,merchant_group_rk,Unnamed: 3_level_1
2019-01-31,1,1,350.0
2019-01-31,2,1,300.0
2019-02-28,2,1,600.0


In [55]:
df1_monthly.groupby(['party_rk', 'merchant_group_rk']).agg({'transaction_amt_rur': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,transaction_amt_rur
party_rk,merchant_group_rk,Unnamed: 2_level_1
1,1,350.0
2,1,450.0
