In [10]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from hnmchallenge.dataset import Dataset
from hnmchallenge.stratified_dataset import StratifiedDataset
from hnmchallenge.filtered_dataset import FilterdDataset
from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.evaluation.python_evaluation import map_at_k
from hnmchallenge.constant import *

In [11]:
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [12]:
dataset = FilterdDataset()
dr = DataReader()

In [13]:
fd = dr.get_filtered_full_data()

In [14]:
fd

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,0,0,0.008458,2
1,2018-09-20,0,0,0.008458,2
2,2018-09-20,1,1,0.033881,1
3,2018-09-20,2,2,0.021593,1
4,2018-09-20,3,3,0.022017,2
...,...,...,...,...,...
14361083,2020-09-22,198005,20308,0.059305,2
14361084,2020-09-22,198005,16094,0.042356,2
14361085,2020-09-22,798833,21707,0.043203,1
14361086,2020-09-22,395317,4994,0.006763,1


In [15]:
fd = fd.drop_duplicates([DEFAULT_USER_COL, DEFAULT_ITEM_COL])

In [16]:
fd

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,0,0,0.008458,2
2,2018-09-20,1,1,0.033881,1
3,2018-09-20,2,2,0.021593,1
4,2018-09-20,3,3,0.022017,2
6,2018-09-20,4,4,0.032186,2
...,...,...,...,...,...
14361082,2020-09-22,59779,9279,0.042356,2
14361083,2020-09-22,198005,20308,0.059305,2
14361085,2020-09-22,798833,21707,0.043203,1
14361086,2020-09-22,395317,4994,0.006763,1


In [17]:
count_mb = fd.groupby(DEFAULT_ITEM_COL).count()
feature = count_mb.reset_index()[[DEFAULT_ITEM_COL, "t_dat"]].rename(
            columns={"t_dat": "popularity"})

In [18]:
feature

Unnamed: 0,article_id,popularity
0,0,3196
1,1,2074
2,2,1042
3,3,1595
4,4,3517
...,...,...
22064,22064,1
22065,22065,1
22066,22066,1
22067,22067,1


In [19]:
feature["popularity_score"]=(feature["popularity"]-feature["popularity"].min())/(feature["popularity"].max()-feature["popularity"].min())

In [20]:
feature

Unnamed: 0,article_id,popularity,popularity_score
0,0,3196,0.099070
1,1,2074,0.064279
2,2,1042,0.032279
3,3,1595,0.049426
4,4,3517,0.109023
...,...,...,...
22064,22064,1,0.000000
22065,22065,1,0.000000
22066,22066,1,0.000000
22067,22067,1,0.000000


In [21]:
feature['rank'] = feature['popularity_score'].rank(ascending=False,method='min').astype(int)

In [22]:
feature

Unnamed: 0,article_id,popularity,popularity_score,rank
0,0,3196,0.099070,540
1,1,2074,0.064279,1182
2,2,1042,0.032279,3093
3,3,1595,0.049426,1754
4,4,3517,0.109023,461
...,...,...,...,...
22064,22064,1,0.000000,21716
22065,22065,1,0.000000,21716
22066,22066,1,0.000000,21716
22067,22067,1,0.000000,21716


In [23]:
df=dr.get_filtered_full_data()

In [24]:
df= df.drop_duplicates([DEFAULT_USER_COL, DEFAULT_ITEM_COL])

In [25]:
df

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,0,0,0.008458,2
2,2018-09-20,1,1,0.033881,1
3,2018-09-20,2,2,0.021593,1
4,2018-09-20,3,3,0.022017,2
6,2018-09-20,4,4,0.032186,2
...,...,...,...,...,...
14361082,2020-09-22,59779,9279,0.042356,2
14361083,2020-09-22,198005,20308,0.059305,2
14361085,2020-09-22,798833,21707,0.043203,1
14361086,2020-09-22,395317,4994,0.006763,1


In [26]:
df["last_buy"] = df.groupby(DEFAULT_USER_COL)["t_dat"].transform(max)
df["first_buy"] = df.groupby(DEFAULT_USER_COL)["t_dat"].transform(min)
df["time_score"] = (df["t_dat"] - df["first_buy"]) / (
            df["last_buy"] - df["first_buy"]
        )

In [27]:
df=df.fillna(1)#df["tdiff"] =1/ ((df["last_buy"]-df["t_dat"]).dt.days) 

In [28]:
df

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score
0,2018-09-20,0,0,0.008458,2,2020-04-15,2018-09-20,0.0
2,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0
3,2018-09-20,2,2,0.021593,1,2020-03-10,2018-09-20,0.0
4,2018-09-20,3,3,0.022017,2,2019-09-28,2018-09-20,0.0
6,2018-09-20,4,4,0.032186,2,2018-09-20,2018-09-20,1.0
...,...,...,...,...,...,...,...,...
14361082,2020-09-22,59779,9279,0.042356,2,2020-09-22,2018-10-04,1.0
14361083,2020-09-22,198005,20308,0.059305,2,2020-09-22,2018-11-23,1.0
14361085,2020-09-22,798833,21707,0.043203,1,2020-09-22,2019-12-02,1.0
14361086,2020-09-22,395317,4994,0.006763,1,2020-09-22,2019-03-08,1.0


In [29]:
df["rank_time"] = df.groupby(DEFAULT_USER_COL)["time_score"].rank(ascending=False,method='min').astype(int)

In [30]:
df

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score,rank_time
0,2018-09-20,0,0,0.008458,2,2020-04-15,2018-09-20,0.0,11
2,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0,21
3,2018-09-20,2,2,0.021593,1,2020-03-10,2018-09-20,0.0,26
4,2018-09-20,3,3,0.022017,2,2019-09-28,2018-09-20,0.0,4
6,2018-09-20,4,4,0.032186,2,2018-09-20,2018-09-20,1.0,1
...,...,...,...,...,...,...,...,...,...
14361082,2020-09-22,59779,9279,0.042356,2,2020-09-22,2018-10-04,1.0,1
14361083,2020-09-22,198005,20308,0.059305,2,2020-09-22,2018-11-23,1.0,1
14361085,2020-09-22,798833,21707,0.043203,1,2020-09-22,2019-12-02,1.0,1
14361086,2020-09-22,395317,4994,0.006763,1,2020-09-22,2019-03-08,1.0,1


In [31]:
df.loc[df[DEFAULT_USER_COL]==1]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score,rank_time
2,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0,21
324870,2018-11-11,1,482,0.030153,2,2020-08-12,2018-09-20,0.075145,19
324871,2018-11-11,1,759,0.030153,2,2020-08-12,2018-09-20,0.075145,19
821881,2019-01-31,1,2006,0.033881,2,2020-08-12,2018-09-20,0.192197,18
1521318,2019-04-18,1,339,0.020492,2,2020-08-12,2018-09-20,0.303468,17
6310076,2020-02-06,1,6018,0.025407,1,2020-08-12,2018-09-20,0.728324,16
6357466,2020-02-08,1,7534,0.008458,2,2020-08-12,2018-09-20,0.731214,12
6357468,2020-02-08,1,6419,0.022864,1,2020-08-12,2018-09-20,0.731214,12
6357469,2020-02-08,1,7628,0.030492,2,2020-08-12,2018-09-20,0.731214,12
6357470,2020-02-08,1,981,0.038119,1,2020-08-12,2018-09-20,0.731214,12


In [32]:
feature

Unnamed: 0,article_id,popularity,popularity_score,rank
0,0,3196,0.099070,540
1,1,2074,0.064279,1182
2,2,1042,0.032279,3093
3,3,1595,0.049426,1754
4,4,3517,0.109023,461
...,...,...,...,...
22064,22064,1,0.000000,21716
22065,22065,1,0.000000,21716
22066,22066,1,0.000000,21716
22067,22067,1,0.000000,21716


In [33]:
final=pd.merge(df, feature, on =DEFAULT_ITEM_COL, how='left')

In [34]:
final

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score,rank_time,popularity,popularity_score,rank
0,2018-09-20,0,0,0.008458,2,2020-04-15,2018-09-20,0.0,11,3196,0.099070,540
1,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0,21,2074,0.064279,1182
2,2018-09-20,2,2,0.021593,1,2020-03-10,2018-09-20,0.0,26,1042,0.032279,3093
3,2018-09-20,3,3,0.022017,2,2019-09-28,2018-09-20,0.0,4,1595,0.049426,1754
4,2018-09-20,4,4,0.032186,2,2018-09-20,2018-09-20,1.0,1,3517,0.109023,461
...,...,...,...,...,...,...,...,...,...,...,...,...
12106975,2020-09-22,59779,9279,0.042356,2,2020-09-22,2018-10-04,1.0,1,1396,0.043256,2112
12106976,2020-09-22,198005,20308,0.059305,2,2020-09-22,2018-11-23,1.0,1,161,0.004961,12140
12106977,2020-09-22,798833,21707,0.043203,1,2020-09-22,2019-12-02,1.0,1,6,0.000155,20450
12106978,2020-09-22,395317,4994,0.006763,1,2020-09-22,2019-03-08,1.0,1,822,0.025457,4039


In [35]:
final.loc[final[DEFAULT_USER_COL]==1]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score,rank_time,popularity,popularity_score,rank
1,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0,21,2074,0.064279,1182
272583,2018-11-11,1,482,0.030153,2,2020-08-12,2018-09-20,0.075145,19,887,0.027473,3721
272584,2018-11-11,1,759,0.030153,2,2020-08-12,2018-09-20,0.075145,19,340,0.010512,8370
681809,2019-01-31,1,2006,0.033881,2,2020-08-12,2018-09-20,0.192197,18,5401,0.167442,183
1252469,2019-04-18,1,339,0.020492,2,2020-08-12,2018-09-20,0.303468,17,3211,0.099535,535
5245677,2020-02-06,1,6018,0.025407,1,2020-08-12,2018-09-20,0.728324,16,1563,0.048434,1804
5285422,2020-02-08,1,7534,0.008458,2,2020-08-12,2018-09-20,0.731214,12,1167,0.036155,2682
5285423,2020-02-08,1,6419,0.022864,1,2020-08-12,2018-09-20,0.731214,12,232,0.007163,10311
5285424,2020-02-08,1,7628,0.030492,2,2020-08-12,2018-09-20,0.731214,12,586,0.01814,5627
5285425,2020-02-08,1,981,0.038119,1,2020-08-12,2018-09-20,0.731214,12,911,0.028217,3601


In [36]:
user=dr.get_filtered_full_data()

In [37]:
user = user.drop_duplicates([DEFAULT_USER_COL])

In [38]:
user['temp']=1

In [39]:
user=user[[DEFAULT_USER_COL,'temp']]

In [40]:
user

Unnamed: 0,customer_id,temp
0,0,1
2,1,1
3,2,1
4,3,1
6,4,1
...,...,...
14360981,1136201,1
14361007,1136202,1
14361039,1136203,1
14361045,1136204,1


In [41]:
feature_100=feature[feature['rank']<=100]

In [42]:
feature_100['temp']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_100['temp']=1


In [43]:
final1=pd.merge(user, feature_100, on='temp')

In [44]:
final1

Unnamed: 0,customer_id,temp,article_id,popularity,popularity_score,rank
0,0,1,5,10904,0.338078,34
1,0,1,7,16694,0.517612,11
2,0,1,8,8351,0.258915,67
3,0,1,11,22571,0.699845,4
4,0,1,19,16854,0.522574,10
...,...,...,...,...,...,...
113620595,1136205,1,3765,7826,0.242636,82
113620596,1136205,1,3830,7753,0.240372,84
113620597,1136205,1,3899,8393,0.260217,65
113620598,1136205,1,4475,7542,0.233829,95


In [45]:
final1=final1.drop('temp', axis=1)

In [46]:
final1

Unnamed: 0,customer_id,article_id,popularity,popularity_score,rank
0,0,5,10904,0.338078,34
1,0,7,16694,0.517612,11
2,0,8,8351,0.258915,67
3,0,11,22571,0.699845,4
4,0,19,16854,0.522574,10
...,...,...,...,...,...
113620595,1136205,3765,7826,0.242636,82
113620596,1136205,3830,7753,0.240372,84
113620597,1136205,3899,8393,0.260217,65
113620598,1136205,4475,7542,0.233829,95


In [47]:
final1[final1['customer_id']==1]

Unnamed: 0,customer_id,article_id,popularity,popularity_score,rank
100,1,5,10904,0.338078,34
101,1,7,16694,0.517612,11
102,1,8,8351,0.258915,67
103,1,11,22571,0.699845,4
104,1,19,16854,0.522574,10
...,...,...,...,...,...
195,1,3765,7826,0.242636,82
196,1,3830,7753,0.240372,84
197,1,3899,8393,0.260217,65
198,1,4475,7542,0.233829,95


In [48]:
final

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,last_buy,first_buy,time_score,rank_time,popularity,popularity_score,rank
0,2018-09-20,0,0,0.008458,2,2020-04-15,2018-09-20,0.0,11,3196,0.099070,540
1,2018-09-20,1,1,0.033881,1,2020-08-12,2018-09-20,0.0,21,2074,0.064279,1182
2,2018-09-20,2,2,0.021593,1,2020-03-10,2018-09-20,0.0,26,1042,0.032279,3093
3,2018-09-20,3,3,0.022017,2,2019-09-28,2018-09-20,0.0,4,1595,0.049426,1754
4,2018-09-20,4,4,0.032186,2,2018-09-20,2018-09-20,1.0,1,3517,0.109023,461
...,...,...,...,...,...,...,...,...,...,...,...,...
12106975,2020-09-22,59779,9279,0.042356,2,2020-09-22,2018-10-04,1.0,1,1396,0.043256,2112
12106976,2020-09-22,198005,20308,0.059305,2,2020-09-22,2018-11-23,1.0,1,161,0.004961,12140
12106977,2020-09-22,798833,21707,0.043203,1,2020-09-22,2019-12-02,1.0,1,6,0.000155,20450
12106978,2020-09-22,395317,4994,0.006763,1,2020-09-22,2019-03-08,1.0,1,822,0.025457,4039


In [49]:
final=final.drop(['popularity_score','rank','popularity','t_dat','price','sales_channel_id','last_buy','first_buy'], axis=1)

In [50]:
final2=pd.merge(final,final1,on=[DEFAULT_USER_COL,DEFAULT_ITEM_COL],how='outer')

In [51]:
final[final[DEFAULT_USER_COL]==1]

Unnamed: 0,customer_id,article_id,time_score,rank_time
1,1,1,0.0,21
272583,1,482,0.075145,19
272584,1,759,0.075145,19
681809,1,2006,0.192197,18
1252469,1,339,0.303468,17
5245677,1,6018,0.728324,16
5285422,1,7534,0.731214,12
5285423,1,6419,0.731214,12
5285424,1,7628,0.731214,12
5285425,1,981,0.731214,12


In [52]:
final2[final2[DEFAULT_USER_COL]==1]

Unnamed: 0,customer_id,article_id,time_score,rank_time,popularity,popularity_score,rank
1,1,1,0.000000,21.0,,,
272583,1,482,0.075145,19.0,,,
272584,1,759,0.075145,19.0,,,
681809,1,2006,0.192197,18.0,,,
1252469,1,339,0.303468,17.0,,,
...,...,...,...,...,...,...,...
12107169,1,3765,,,7826.0,0.242636,82.0
12107170,1,3830,,,7753.0,0.240372,84.0
12107171,1,3899,,,8393.0,0.260217,65.0
12107172,1,4475,,,7542.0,0.233829,95.0


In [53]:
mask = final2.isnull().any(axis=1)

In [54]:
final3=final2[~mask]

In [55]:
final3[final3[DEFAULT_USER_COL]==1]

Unnamed: 0,customer_id,article_id,time_score,rank_time,popularity,popularity_score,rank
6058205,1,1482,0.789017,8.0,25485.0,0.790202,3.0
7025576,1,2997,0.843931,6.0,7999.0,0.248,78.0
8390855,1,3161,0.910405,5.0,7422.0,0.230109,96.0


In [56]:
final2=final2.fillna(0)

In [57]:
final2

Unnamed: 0,customer_id,article_id,time_score,rank_time,popularity,popularity_score,rank
0,0,0,0.0,11.0,0.0,0.000000,0.0
1,1,1,0.0,21.0,0.0,0.000000,0.0
2,2,2,0.0,26.0,0.0,0.000000,0.0
3,3,3,0.0,4.0,0.0,0.000000,0.0
4,4,4,1.0,1.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...
124619397,1136205,3765,0.0,0.0,7826.0,0.242636,82.0
124619398,1136205,3830,0.0,0.0,7753.0,0.240372,84.0
124619399,1136205,3899,0.0,0.0,8393.0,0.260217,65.0
124619400,1136205,4475,0.0,0.0,7542.0,0.233829,95.0


In [58]:
final2=final2.drop(['rank_time','rank'],axis=1)

In [59]:
a=0.95
eps=1e-6
final2['score']=a*final2['time_score']+(1-a+eps)*final2['popularity_score']

In [60]:
final2

Unnamed: 0,customer_id,article_id,time_score,popularity,popularity_score,score
0,0,0,0.0,0.0,0.000000,0.000000
1,1,1,0.0,0.0,0.000000,0.000000
2,2,2,0.0,0.0,0.000000,0.000000
3,3,3,0.0,0.0,0.000000,0.000000
4,4,4,1.0,0.0,0.000000,0.950000
...,...,...,...,...,...,...
124619397,1136205,3765,0.0,7826.0,0.242636,0.012132
124619398,1136205,3830,0.0,7753.0,0.240372,0.012019
124619399,1136205,3899,0.0,8393.0,0.260217,0.013011
124619400,1136205,4475,0.0,7542.0,0.233829,0.011692


In [61]:
final2["rank"] = final2.groupby(DEFAULT_USER_COL)["score"].rank(ascending=False,method='min').astype(int)

In [62]:
final2

Unnamed: 0,customer_id,article_id,time_score,popularity,popularity_score,score,rank
0,0,0,0.0,0.0,0.000000,0.000000,108
1,1,1,0.0,0.0,0.000000,0.000000,118
2,2,2,0.0,0.0,0.000000,0.000000,122
3,3,3,0.0,0.0,0.000000,0.000000,104
4,4,4,1.0,0.0,0.000000,0.950000,1
...,...,...,...,...,...,...,...
124619397,1136205,3765,0.0,7826.0,0.242636,0.012132,88
124619398,1136205,3830,0.0,7753.0,0.240372,0.012019,90
124619399,1136205,3899,0.0,8393.0,0.260217,0.013011,71
124619400,1136205,4475,0.0,7542.0,0.233829,0.011692,101


In [63]:
final2=final2.drop(["time_score","popularity","popularity_score"],axis=1)

In [64]:
final2

Unnamed: 0,customer_id,article_id,score,rank
0,0,0,0.000000,108
1,1,1,0.000000,118
2,2,2,0.000000,122
3,3,3,0.000000,104
4,4,4,0.950000,1
...,...,...,...,...
124619397,1136205,3765,0.012132,88
124619398,1136205,3830,0.012019,90
124619399,1136205,3899,0.013011,71
124619400,1136205,4475,0.011692,101


In [65]:
cutoff = final2.groupby(DEFAULT_USER_COL).size().values

In [66]:
i=0
filter_indices = []
for cut in cutoff:
    filter_indices.extend(range(i, i+12))
    i=cut

In [67]:
final_df = final2.loc[filter_indices]

In [68]:
final_df

Unnamed: 0,customer_id,article_id,score,rank
0,0,0,0.0000,108
1,1,1,0.0000,118
2,2,2,0.0000,122
3,3,3,0.0000,104
4,4,4,0.9500,1
...,...,...,...,...
108,84,94,0.0000,104
109,85,95,0.0000,107
110,86,96,0.0000,126
111,87,64,0.0000,168


In [69]:
final_final_df = final_df.drop(["score","rank"] ,axis=1)

In [70]:
from hnmchallenge.submission_handler import SubmissionHandler
sh = SubmissionHandler()     

In [71]:
sh.create_submission_filtered_data([final_final_df], sub_name="Time_Pop")

OSError: Cannot save file into a non-existent directory: '/raid/home/aayush/git/HnMChallenge/submission'