In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings as wn
import sklearn.preprocessing as skpre
import category_encoders as ce

# Install a global "ignore" filter: no warning category is reported for the rest of the session.
# NOTE(review): this also hides any deprecation / chained-assignment warnings that later cells
# might raise — consider narrowing the filter to specific categories.
wn.simplefilter( "ignore" )

In [2]:
# Output directory: the feature CSVs generated by the cells below are written under this path.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
loc_ftr = r"D:\FacundoTorraca\Documents\TP2_ML\FeaturesComp\FeaturesSC"

## <span style="color:yellow"> **Preparamos el set de entrenamiento para sacar features** </span> 

In [3]:
# Input directory holding the click CSVs loaded in the next cell.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
loc_ts = r"D:\FacundoTorraca\Documents\TP2_ML\Training Sets"

In [4]:
# Click logs: the 21_23 file feeds the training features, the 24_26 file the test features.
clk_trn = pd.read_csv( f"{loc_ts}\\clk_21_23.csv" )
clk_tst = pd.read_csv( f"{loc_ts}\\clk_24_26.csv" )

## <span style="color:yellow"> **Preparamos el ref_hash de cada ventana** </span> 

In [5]:
# Input directory holding the label / target CSVs loaded in the next cell.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
loc_lb = r"D:\FacundoTorraca\Documents\TP2_ML\Labels"

In [6]:
# Label tables: they provide the ref_hash rows (and, for training, the 24_26_sc column) used below.
lb_ins_trn = pd.read_csv( f"{loc_lb}\\label_ins_24_26.csv" )
lb_ins_tst = pd.read_csv( f"{loc_lb}\\target.csv" )

In [7]:
# One-column frames with the device identifiers; every feature table below is left-merged onto them.
rh_trn = lb_ins_trn.loc[ :, ["ref_hash"] ]
rh_tst = lb_ins_tst.loc[ :, ["ref_hash"] ]

# One-column frame with the 24_26_sc values (presumably the training target — confirm).
tg_trn = lb_ins_trn.loc[ :, ["24_26_sc"] ]

## <span style="color:yellow"> **=======================================================================================================** </span> 

## <span style="color:green"> **Cantidad de Clicks por dispositivo en la ventana previa a la conversion** </span>

In [8]:
# Feature "cant_clk": number of click rows per device in each click window.
#
# The counts are built with groupby().size() rather than
# value_counts().to_frame().reset_index().rename(...): since pandas 2.0 the latter yields the
# columns "ref_hash" / "count" (instead of "index" / "ref_hash"), so a rename map keyed on
# "index" / "ref_hash" renames the key column itself and the merge on "ref_hash" fails.
# groupby().size() produces the same "ref_hash" / "cant_clk" layout on every pandas version.
clk_count_trn = clk_trn.groupby( "ref_hash" ).size().rename( "cant_clk" ).reset_index()
clk_count_tst = clk_tst.groupby( "ref_hash" ).size().rename( "cant_clk" ).reset_index()

# Left merge: every row of the label frame is kept, in its original order.
cant_clk_trn = rh_trn.merge( clk_count_trn, how = "left", on = "ref_hash" )
cant_clk_tst = rh_tst.merge( clk_count_tst, how = "left", on = "ref_hash" )

# Devices that do not appear in the click log come out of the merge as NaN: they have 0 clicks.
cant_clk_trn = cant_clk_trn.fillna( 0 )
cant_clk_tst = cant_clk_tst.fillna( 0 )

cant_clk_trn.to_csv( loc_ftr + "\\cant_clk_trn.csv", index = False )
cant_clk_tst.to_csv( loc_ftr + "\\cant_clk_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta el primer click en esa ventana** </span> 

Le asignamos a cada dispositivo cuánto tiempo, desde el inicio de su ventana (21-23 para entrenamiento, 24-26 para test), tardó en realizar su primer click.

In [9]:
# Feature "time_to_frt_evt": seconds from the start of the window to a device's first click.

# Earliest click per device: order by date ascending and keep the first row of each ref_hash.
first_click_rh_ins_trn = clk_trn.loc[ :, ["ref_hash", "date"] ].sort_values( "date" ).drop_duplicates( subset = "ref_hash", keep = "first" )
first_click_rh_ins_tst = clk_tst.loc[ :, ["ref_hash", "date"] ].sort_values( "date" ).drop_duplicates( subset = "ref_hash", keep = "first" )

# Elapsed seconds measured from midnight of the window's first day
# (2019-04-21 for training, 2019-04-24 for test); the raw date column is then discarded.
first_click_rh_ins_trn = (
    first_click_rh_ins_trn
    .assign( time_to_frt_evt = ( pd.to_datetime( first_click_rh_ins_trn["date"] ) - dt.datetime( 2019, 4, 21 ) ).dt.total_seconds() )
    .drop( columns = "date" )
)
first_click_rh_ins_tst = (
    first_click_rh_ins_tst
    .assign( time_to_frt_evt = ( pd.to_datetime( first_click_rh_ins_tst["date"] ) - dt.datetime( 2019, 4, 24 ) ).dt.total_seconds() )
    .drop( columns = "date" )
)

# NaN after the left merge means the device has no click in the window;
# those rows are filled with 3 days expressed in seconds.
frst_clk_trn = rh_trn.merge( first_click_rh_ins_trn, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )
frst_clk_tst = rh_tst.merge( first_click_rh_ins_tst, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )

frst_clk_trn.to_csv( loc_ftr + "\\frst_clk_trn.csv", index = False )
frst_clk_tst.to_csv( loc_ftr + "\\frst_clk_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta el ultimo click en esa ventana** </span> 

In [10]:
# Feature "time_to_lst_evt": seconds from the start of the window to a device's last click.

# Latest click per device: order by date descending and keep the first row of each ref_hash.
last_click_rh_ins_trn = clk_trn.loc[ :, ["ref_hash", "date"] ].sort_values( "date", ascending = False ).drop_duplicates( subset = "ref_hash", keep = "first" )
last_click_rh_ins_tst = clk_tst.loc[ :, ["ref_hash", "date"] ].sort_values( "date", ascending = False ).drop_duplicates( subset = "ref_hash", keep = "first" )

# Elapsed seconds measured from midnight of the window's first day
# (2019-04-21 for training, 2019-04-24 for test); the raw date column is then discarded.
last_click_rh_ins_trn = (
    last_click_rh_ins_trn
    .assign( time_to_lst_evt = ( pd.to_datetime( last_click_rh_ins_trn["date"] ) - dt.datetime( 2019, 4, 21 ) ).dt.total_seconds() )
    .drop( columns = "date" )
)
last_click_rh_ins_tst = (
    last_click_rh_ins_tst
    .assign( time_to_lst_evt = ( pd.to_datetime( last_click_rh_ins_tst["date"] ) - dt.datetime( 2019, 4, 24 ) ).dt.total_seconds() )
    .drop( columns = "date" )
)

# NaN after the left merge means the device has no click in the window;
# those rows are filled with 3 days expressed in seconds.
# NOTE(review): this is the same fill value used for the first-click feature — confirm it is
# the intended encoding of "no click" for a last-click time as well.
last_clk_trn = rh_trn.merge( last_click_rh_ins_trn, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )
last_clk_tst = rh_tst.merge( last_click_rh_ins_tst, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )

last_clk_trn.to_csv( loc_ftr + "\\last_clk_trn.csv", index = False )
last_clk_tst.to_csv( loc_ftr + "\\last_clk_tst.csv", index = False )

## <span style="color:green"> **Recibio clicks entre 16 hs y 20 hs (Tarde)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [11]:
# Feature "clk_16_20": 1 if the device received at least one click between 16:00 and 20:59, else 0.
#
# Notes on the implementation:
#   * The flag is `count > 0`. A `> 1` threshold would mark devices with exactly one click in
#     the slot as 0, contradicting the "received clicks in this slot" definition.
#   * The click hour is parsed once per data set and kept in a separate Series, so
#     clk_trn / clk_tst are never modified (no temporary column to add and delete).

# Hour of day (0-23) of every click.
hour_trn = pd.to_datetime( clk_trn["date"] ).dt.hour
hour_tst = pd.to_datetime( clk_tst["date"] ).dt.hour

# Number of clicks inside the slot per device; between() is inclusive on both ends.
hour_mode_trn = hour_trn.between( 16, 20 ).groupby( clk_trn["ref_hash"] ).sum().rename( "clk_16_20" ).reset_index()
hour_mode_tst = hour_tst.between( 16, 20 ).groupby( clk_tst["ref_hash"] ).sum().rename( "clk_16_20" ).reset_index()

# Left merge keeps every row of the label frame; devices without clicks get NaN here.
clk_after_trn = rh_trn.merge( hour_mode_trn, how = "left", on = "ref_hash" )
clk_after_tst = rh_tst.merge( hour_mode_tst, how = "left", on = "ref_hash" )

# NaN > 0 evaluates to False, so devices without clicks end up with 0.
clk_after_trn["clk_16_20"] = ( clk_after_trn["clk_16_20"] > 0 ).astype( 'int8' )
clk_after_tst["clk_16_20"] = ( clk_after_tst["clk_16_20"] > 0 ).astype( 'int8' )

clk_after_trn.to_csv( loc_ftr + "\\clk_aftr_trn.csv", index = False )
clk_after_tst.to_csv( loc_ftr + "\\clk_aftr_tst.csv", index = False )

## <span style="color:green"> **Recibio clicks entre 21 hs y 3 hs (Noche)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [12]:
# Feature "clk_21_3": 1 if the device received at least one click between 21:00 and 03:59, else 0.
#
# Notes on the implementation:
#   * The flag is `count > 0`. A `> 1` threshold would mark devices with exactly one click in
#     the slot as 0, contradicting the "received clicks in this slot" definition.
#   * The click hour is parsed once per data set and kept in a separate Series, so
#     clk_trn / clk_tst are never modified (no temporary column to add and delete).

# Hour of day (0-23) of every click.
hour_trn = pd.to_datetime( clk_trn["date"] ).dt.hour
hour_tst = pd.to_datetime( clk_tst["date"] ).dt.hour

# The night slot wraps around midnight: hours 21-23 or 0-3.
night_trn = ( hour_trn > 20 ) | ( hour_trn < 4 )
night_tst = ( hour_tst > 20 ) | ( hour_tst < 4 )

# Number of clicks inside the slot per device.
hour_mode_trn = night_trn.groupby( clk_trn["ref_hash"] ).sum().rename( "clk_21_3" ).reset_index()
hour_mode_tst = night_tst.groupby( clk_tst["ref_hash"] ).sum().rename( "clk_21_3" ).reset_index()

# Left merge keeps every row of the label frame; devices without clicks get NaN here.
clk_night_trn = rh_trn.merge( hour_mode_trn, how = "left", on = "ref_hash" )
clk_night_tst = rh_tst.merge( hour_mode_tst, how = "left", on = "ref_hash" )

# NaN > 0 evaluates to False, so devices without clicks end up with 0.
clk_night_trn["clk_21_3"] = ( clk_night_trn["clk_21_3"] > 0 ).astype( 'int8' )
clk_night_tst["clk_21_3"] = ( clk_night_tst["clk_21_3"] > 0 ).astype( 'int8' )

clk_night_trn.to_csv( loc_ftr + "\\clk_nght_trn.csv", index = False )
clk_night_tst.to_csv( loc_ftr + "\\clk_nght_tst.csv", index = False )

## <span style="color:green"> **Recibio clicks entre 11 hs y 15 hs (Medio dia)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [13]:
# Feature "clk_11_15": 1 if the device received at least one click between 11:00 and 15:59, else 0.
#
# Notes on the implementation:
#   * The flag is `count > 0`. A `> 1` threshold would mark devices with exactly one click in
#     the slot as 0, contradicting the "received clicks in this slot" definition.
#   * The click hour is parsed once per data set and kept in a separate Series, so
#     clk_trn / clk_tst are never modified (no temporary column to add and delete).

# Hour of day (0-23) of every click.
hour_trn = pd.to_datetime( clk_trn["date"] ).dt.hour
hour_tst = pd.to_datetime( clk_tst["date"] ).dt.hour

# Number of clicks inside the slot per device; between() is inclusive on both ends.
hour_mode_trn = hour_trn.between( 11, 15 ).groupby( clk_trn["ref_hash"] ).sum().rename( "clk_11_15" ).reset_index()
hour_mode_tst = hour_tst.between( 11, 15 ).groupby( clk_tst["ref_hash"] ).sum().rename( "clk_11_15" ).reset_index()

# Left merge keeps every row of the label frame; devices without clicks get NaN here.
clk_midday_trn = rh_trn.merge( hour_mode_trn, how = "left", on = "ref_hash" )
clk_midday_tst = rh_tst.merge( hour_mode_tst, how = "left", on = "ref_hash" )

# NaN > 0 evaluates to False, so devices without clicks end up with 0.
clk_midday_trn["clk_11_15"] = ( clk_midday_trn["clk_11_15"] > 0 ).astype( 'int8' )
clk_midday_tst["clk_11_15"] = ( clk_midday_tst["clk_11_15"] > 0 ).astype( 'int8' )

clk_midday_trn.to_csv( loc_ftr + "\\clk_mday_trn.csv", index = False )
clk_midday_tst.to_csv( loc_ftr + "\\clk_mday_tst.csv", index = False )

## <span style="color:green"> **Recibio clicks entre 4 hs y 10 hs (Mañana)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [14]:
# Feature "clk_4_10": 1 if the device received at least one click between 04:00 and 10:59, else 0.
#
# Notes on the implementation:
#   * The flag overwrites the "clk_4_10" column, like the other time-slot features. Writing it
#     to a separate "ins_4_10" column would leave the raw (NaN-containing) count in the CSV
#     next to a flag carrying an installs-style name.
#     NOTE(review): any downstream reader that expects an "ins_4_10" column in
#     clk_morn_*.csv must be updated to read "clk_4_10".
#   * The flag is `count > 0`. A `> 1` threshold would mark devices with exactly one click in
#     the slot as 0, contradicting the "received clicks in this slot" definition.
#   * The click hour is parsed once per data set and kept in a separate Series, so
#     clk_trn / clk_tst are never modified (no temporary column to add and delete).

# Hour of day (0-23) of every click.
hour_trn = pd.to_datetime( clk_trn["date"] ).dt.hour
hour_tst = pd.to_datetime( clk_tst["date"] ).dt.hour

# Number of clicks inside the slot per device; between() is inclusive on both ends.
hour_mode_trn = hour_trn.between( 4, 10 ).groupby( clk_trn["ref_hash"] ).sum().rename( "clk_4_10" ).reset_index()
hour_mode_tst = hour_tst.between( 4, 10 ).groupby( clk_tst["ref_hash"] ).sum().rename( "clk_4_10" ).reset_index()

# Left merge keeps every row of the label frame; devices without clicks get NaN here.
clk_morn_trn = rh_trn.merge( hour_mode_trn, how = "left", on = "ref_hash" )
clk_morn_tst = rh_tst.merge( hour_mode_tst, how = "left", on = "ref_hash" )

# NaN > 0 evaluates to False, so devices without clicks end up with 0.
clk_morn_trn["clk_4_10"] = ( clk_morn_trn["clk_4_10"] > 0 ).astype( 'int8' )
clk_morn_tst["clk_4_10"] = ( clk_morn_tst["clk_4_10"] > 0 ).astype( 'int8' )

clk_morn_trn.to_csv( loc_ftr + "\\clk_morn_trn.csv", index = False )
clk_morn_tst.to_csv( loc_ftr + "\\clk_morn_tst.csv", index = False )