In [1]:
import pandas as pd
import numpy as np
import os.path

In [2]:
df = pd.read_csv('data/transactions_train.csv')

In [3]:
df.shape

(31788324, 5)

In [4]:
df.tail()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1
31788323,2020-09-22,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,898573003,0.033881,2


In [5]:
df['t_dat'].max()

'2020-09-22'

In [6]:
df['t_dat'].unique()

array(['2018-09-20', '2018-09-21', '2018-09-22', '2018-09-23',
       '2018-09-24', '2018-09-25', '2018-09-26', '2018-09-27',
       '2018-09-28', '2018-09-29', '2018-09-30', '2018-10-01',
       '2018-10-02', '2018-10-03', '2018-10-04', '2018-10-05',
       '2018-10-06', '2018-10-07', '2018-10-08', '2018-10-09',
       '2018-10-10', '2018-10-11', '2018-10-12', '2018-10-13',
       '2018-10-14', '2018-10-15', '2018-10-16', '2018-10-17',
       '2018-10-18', '2018-10-19', '2018-10-20', '2018-10-21',
       '2018-10-22', '2018-10-23', '2018-10-24', '2018-10-25',
       '2018-10-26', '2018-10-27', '2018-10-28', '2018-10-29',
       '2018-10-30', '2018-10-31', '2018-11-01', '2018-11-02',
       '2018-11-03', '2018-11-04', '2018-11-05', '2018-11-06',
       '2018-11-07', '2018-11-08', '2018-11-09', '2018-11-10',
       '2018-11-11', '2018-11-12', '2018-11-13', '2018-11-14',
       '2018-11-15', '2018-11-16', '2018-11-17', '2018-11-18',
       '2018-11-19', '2018-11-20', '2018-11-21', '2018-

In [10]:
len(df['customer_id'].unique())

1362281

In [7]:
validation_week = ['2020-09-09', '2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13', '2020-09-14', '2020-09-15']
test_week = ['2020-09-16', '2020-09-17', '2020-09-18', '2020-09-19', '2020-09-20', '2020-09-21', '2020-09-22']

In [8]:
df[df['t_dat'].isin(test_week)].shape

(240311, 5)

In [9]:
df[df['t_dat'].isin(validation_week)].shape

(255241, 5)

In [12]:
all_weeks = list(df['t_dat'].unique())
train_weeks = [w for w in all_weeks if w not in validation_week+test_week]

In [13]:
len(all_weeks)

734

In [14]:
len(train_weeks)

720

In [16]:
df[df['t_dat'].isin(validation_week)].to_csv('data/validation_data.csv', index=False)
df[df['t_dat'].isin(test_week)].to_csv('data/test_data.csv', index=False)

In [17]:
df[df['t_dat'].isin(all_weeks)].to_csv('data/train_data.csv.zip', 
                                       compression={'method': 'zip', 'archive_name': 'train_data.csv'}, 
                                       index=False)

In [18]:
test_df = df[df['t_dat'].isin(test_week)]

In [19]:
test_df['article_id'].value_counts()

924243001    852
924243002    635
918522001    609
923758001    592
866731001    552
            ... 
767798001      1
768915002      1
578472001      1
762286001      1
864339003      1
Name: article_id, Length: 17986, dtype: int64

In [20]:
top = test_df['article_id'].value_counts().reset_index()
top[:12]

Unnamed: 0,index,article_id
0,924243001,852
1,924243002,635
2,918522001,609
3,923758001,592
4,866731001,552
5,909370001,537
6,751471001,526
7,915529003,495
8,915529005,491
9,448509014,490


In [22]:
weekly_popular = top[:12]['index'].values
weekly_popular

array([924243001, 924243002, 918522001, 923758001, 866731001, 909370001,
       751471001, 915529003, 915529005, 448509014, 762846027, 714790020])

In [24]:
recs = ' '.join(map(str, weekly_popular))
recs

'924243001 924243002 918522001 923758001 866731001 909370001 751471001 915529003 915529005 448509014 762846027 714790020'

In [26]:
customers = pd.read_csv('data/customers.csv')
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [27]:
baseline = pd.DataFrame(customers['customer_id'])

In [28]:
baseline['prediction'] = recs
baseline.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,924243001 924243002 918522001 923758001 866731...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,924243001 924243002 918522001 923758001 866731...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,924243001 924243002 918522001 923758001 866731...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,924243001 924243002 918522001 923758001 866731...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,924243001 924243002 918522001 923758001 866731...


In [29]:
baseline.to_csv('data/baseline_prediction.csv', index=False)

In [None]:
# Score: 0.0000