In [58]:
from collections import defaultdict, Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import joblib


In [59]:
# Raw event log, addressed relative to the notebook's working directory.
_events_dir, _events_file = "data", "events.csv"
EVENTS_PATH = os.path.join(_events_dir, _events_file)

In [60]:
# Load the event log; the first CSV column is consumed as the row index, so it
# does not appear among the DataFrame's columns.
events = pd.read_csv(EVENTS_PATH, index_col=0)

In [61]:
# Peek at the first rows to sanity-check what was loaded.
events.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,...,prices,src,is_test,fake_impressions,fake_prices,reversed_clickout_step,clickout_step,clickout_max_step,dt,is_val
0,RE84XPQB447X,90fdaae2c58dc,1541030423,1,search for poi,Disneyland Paris,US,"Marne-la-Vallée, France",desktop,Best Value,...,,train,0,8887828|8414618|1231946|7323680|5125642|415471...,98|119|105|98|145|120|156|145|180|125|168|138|...,2,1,2,2018-11-01,0
1,WDPJ442S0Q1Z,cf0c96d88d2ef,1541030432,1,search for destination,"Birmingham, United Kingdom",UK,"Birmingham, United Kingdom",mobile,,...,,train,0,449896|147730|12522|152022|12544|12531|12516|2...,117|127|117|101|125|70|124|95|115|73|127|123|1...,3,1,3,2018-11-01,0
2,PCYB9NJCV2IS,47ba72122b04c,1541030442,1,search for destination,"Lisbon, Portugal",US,"Lisbon, Portugal",mobile,,...,,train,0,,,1,1,1,2018-11-01,0
3,RE84XPQB447X,90fdaae2c58dc,1541030454,2,interaction item rating,149255,US,"Marne-la-Vallée, France",desktop,,...,,train,0,8887828|8414618|1231946|7323680|5125642|415471...,98|119|105|98|145|120|156|145|180|125|168|138|...,4,1,4,2018-11-01,0
4,ORQ6U0KAMD2Q,73aaba45abc79,1541030468,1,interaction item rating,69865,US,"Red Lodge, USA",desktop,,...,,train,0,69511|69865|3867636|8258580|2217944|824256|893...,55|89|61|70|84|141|106|212|75|123|92,5,1,5,2018-11-01,0


In [62]:
# Keep only the "clickout item" rows and drop down to a plain NumPy array
# (one row per clickout, cells in the same order as `events.columns`).
is_clickout = events["action_type"].eq("clickout item")
clickout_events = events.loc[is_clickout].to_numpy()

In [63]:
# List the column labels available on `events`.
events.columns

Index(['user_id', 'session_id', 'timestamp', 'step', 'action_type',
       'reference', 'platform', 'city', 'device', 'current_filters',
       'impressions', 'prices', 'src', 'is_test', 'fake_impressions',
       'fake_prices', 'reversed_clickout_step', 'clickout_step',
       'clickout_max_step', 'dt', 'is_val'],
      dtype='object')

In [64]:
# Collect the prices at which every item was displayed in a clickout.
#   impression_occurences: item_id (str) -> list of every price seen (duplicates kept)
#   records:               flat list of (item_id, price) pairs (duplicates kept)
impression_occurences = defaultdict(list)
records = list()

# `clickout_events` is a plain NumPy array whose cells follow the column order
# of `events`, so they have to be addressed by position. Resolve the positions
# from the column names instead of hard-coding them, so a change in the CSV
# layout cannot make this loop silently read the wrong column.
impressions_col = events.columns.get_loc("impressions")
prices_col = events.columns.get_loc("prices")

for co in tqdm(clickout_events):
    raw_items, raw_prices = co[impressions_col], co[prices_col]
    # A missing cell is not a string (pandas loads it as float NaN) and has no
    # .split(); skip such rows instead of crashing half-way through.
    if not isinstance(raw_items, str) or not isinstance(raw_prices, str):
        continue
    # Both fields are "|"-separated lists that are paired up position by position.
    for item, price in zip(raw_items.split("|"), raw_prices.split("|")):
        price = int(price)
        impression_occurences[item].append(price)
        records.append((item, price))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87286/87286 [00:02<00:00, 38213.98it/s]


In [65]:
# One row per distinct (item_id, price) pair, ordered by item and then by price.
# The original positional index is kept as-is (it is neither reset nor reused).
item_prices = (
    pd.DataFrame.from_records(records, columns=["item_id", "price"])
    .drop_duplicates()
    .sort_values(["item_id", "price"])
)

In [74]:
# Rank every price within its own item. Ties (if any) receive the highest rank
# of the tied group because of method="max".
price_by_item = item_prices.groupby("item_id")["price"]

# Compute all three rankings first, then attach them as new columns.
rank_cheapest_first = price_by_item.rank(method="max", ascending=True)
rank_priciest_first = price_by_item.rank(method="max", ascending=False)
rank_cheapest_first_pct = price_by_item.rank(method="max", ascending=True, pct=True)

item_prices["ascending_price_rank"] = rank_cheapest_first
item_prices["descending_price_rank"] = rank_priciest_first
item_prices["ascending_price_rank_pct"] = rank_cheapest_first_pct

# Per-item summary over every recorded price (duplicates included):
# lowest price, highest price and how many prices were recorded.
# NOTE(review): the summary spans everything stored in `impression_occurences`;
# no time- or split-based filtering happens here.
aggreagate_feature_recs = [
    (item_id, min(seen_prices), max(seen_prices), len(seen_prices))
    for item_id, seen_prices in tqdm(impression_occurences.items())
]

aggreagate_features = pd.DataFrame.from_records(
    aggreagate_feature_recs,
    columns=["item_id", "min_price", "max_price", "price_count"],
)



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 319679/319679 [00:00<00:00, 1019373.51it/s]


In [75]:
# Attach the per-item aggregates to every (item_id, price) row.
#
# The cell overwrites `item_prices`, so running it a second time used to merge
# the aggregates onto a frame that already had them, yielding `min_price_x` /
# `min_price_y` style columns and breaking later lookups of "min_price".
# Dropping any aggregate columns left over from a previous run first makes the
# cell safe to re-execute; on a first run the drop is a no-op.
_aggregate_cols = [col for col in aggreagate_features.columns if col != "item_id"]

item_prices = pd.merge(
    item_prices.drop(columns=_aggregate_cols, errors="ignore"),
    aggreagate_features,
    on="item_id",
    how="inner",
    # Each item must have exactly one aggregate row; fail loudly otherwise
    # instead of silently duplicating price rows.
    validate="many_to_one",
)



In [77]:
# Derived price features, all relative to the item's own min/max price.
lowest_price = item_prices["min_price"]
highest_price = item_prices["max_price"]

# How far this row's price sits above the item's minimum (1.0 == at the minimum).
item_prices["price_relative_to_min"] = item_prices["price"].div(lowest_price)
# Absolute and relative spread between the item's highest and lowest price.
item_prices["price_range"] = highest_price.sub(lowest_price)
item_prices["price_range_div"] = highest_price.div(lowest_price)

In [78]:
# Preview the first rows of the finished feature table.
item_prices.head()

Unnamed: 0,item_id,price,ascending_price_rank,descending_price_rank,ascending_price_rank_pct,min_price,max_price,price_count,price_relative_to_min,price_range,price_range_div
0,100000,232,1.0,1.0,1.0,232,232,1,1.0,0,1.0
1,10000022,38,1.0,1.0,1.0,38,38,1,1.0,0,1.0
2,1000005,66,1.0,4.0,0.25,66,82,7,1.0,16,1.242424
3,1000005,75,2.0,3.0,0.5,66,82,7,1.136364,16,1.242424
4,1000005,81,3.0,2.0,0.75,66,82,7,1.227273,16,1.242424


In [79]:
# Persist the feature table as CSV; the row index is left out of the file.
item_prices_path = os.path.join("data", "item_prices.csv")
item_prices.to_csv(item_prices_path, index=False)

Warning: the max-price column built in this section ends up including future data (i.e. it leaks information), so use it with great care.