In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings as wn
import sklearn.preprocessing as skpre
import category_encoders as ce

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment warnings) raised by the cells below.
wn.simplefilter( "ignore" )

In [3]:
# Output directory where every feature CSV produced by this notebook is written.
# NOTE(review): absolute, machine-specific Windows path; must be adapted to run elsewhere.
loc_ftr = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Features\FeaturesSC"

## <span style="color:yellow"> **Preparamos el set de entrenamiento para sacar features** </span> 

In [4]:
# Directory that holds the event CSVs (one file per time window) read in the next cell.
# NOTE(review): absolute, machine-specific Windows path; must be adapted to run elsewhere.
loc_ts = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Training Sets"

In [5]:
# Load the raw event logs of the two windows used to build features.
evt_18_20, evt_21_23 = (
    pd.read_csv( loc_ts + file_name )
    for file_name in ( "\\evt_18_20.csv", "\\evt_21_23.csv" )
)

## <span style="color:yellow"> **Preparamos el ref_hash de cada ventana** </span> 

In [6]:
# Directory that holds the label CSVs read in the next cell.
# NOTE(review): absolute, machine-specific Windows path; must be adapted to run elsewhere.
loc_lb = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Labels"

In [7]:
# Load the label tables of the two target windows.
lb_ins_21_23, lb_ins_24_26 = (
    pd.read_csv( loc_lb + file_name )
    for file_name in ( "\\label_ins_21_23.csv", "\\label_ins_24_26.csv" )
)

In [8]:
# Device ids (ref_hash) of each set; every feature below is left-merged onto these frames.
rh_trn = lb_ins_21_23.loc[ :, ["ref_hash"] ]
rh_tst = lb_ins_24_26.loc[ :, ["ref_hash"] ]

# Target column of each set, kept as single-column DataFrames.
tg_trn = lb_ins_21_23.loc[ :, ["21_23_sc"] ]
tg_tst = lb_ins_24_26.loc[ :, ["24_26_sc"] ]

## <span style="color:yellow"> **=======================================================================================================** </span> 

## <span style="color:green"> **Horario del primer evento en esa ventana** </span> 

Al aplicarle la transformación senoidal para darle periodicidad a la hora, la predicción del algoritmo empeoró, por lo que decidimos no agregarla.

In [8]:
# Feature: hour of day (0-23) of each device's first event inside its window.
# Devices without events keep NaN after the left merge (left as-is on purpose).
hr_f_evt_18_20 = rh_trn.copy()
hr_f_evt_21_23 = rh_tst.copy()

# Earliest event per device: sort by date and keep the first row of every ref_hash.
first_evt_hour_18_20 = evt_18_20[ ["ref_hash","date"] ].sort_values( "date" ).drop_duplicates( subset = "ref_hash", keep = "first" )
first_evt_hour_21_23 = evt_21_23[ ["ref_hash","date"] ].sort_values( "date" ).drop_duplicates( subset = "ref_hash", keep = "first" )

# Only the hour is needed here; the elapsed-seconds column that used to be computed
# and dropped immediately was dead work and has been removed.
first_evt_hour_18_20["hour_frt_evt"] = pd.to_datetime( first_evt_hour_18_20["date"] ).dt.hour
first_evt_hour_21_23["hour_frt_evt"] = pd.to_datetime( first_evt_hour_21_23["date"] ).dt.hour

first_evt_hour_18_20 = first_evt_hour_18_20.drop( columns = "date" )
first_evt_hour_21_23 = first_evt_hour_21_23.drop( columns = "date" )

hr_f_evt_18_20 = hr_f_evt_18_20.merge( first_evt_hour_18_20, how = "left", on = "ref_hash" )
hr_f_evt_21_23 = hr_f_evt_21_23.merge( first_evt_hour_21_23, how = "left", on = "ref_hash" )

# A sinusoidal transform of the hour, sin( hour * pi / 24 ), was tried and discarded
# because it made the model's predictions worse.

hr_f_evt_18_20.to_csv( loc_ftr + "\\hr_f_evt_trn.csv", index = False )
hr_f_evt_21_23.to_csv( loc_ftr + "\\hr_f_evt_tst.csv", index = False )

Si bien hay varios valores NaN en el feature, lo decidimos probar de igual manera, ya que los algoritmos de boosting aceptan valores NaNs.

* https://datascience.stackexchange.com/questions/15305/how-does-xgboost-learn-what-are-the-inputs-for-missing-values/15306#15306
* The procedure is described in [their paper, section 3.4: Sparsity aware split-finding](https://arxiv.org/pdf/1603.02754v3.pdf).


## <span style="color:green"> **Cantidad de Eventos por dispositivo en la ventana previa a la conversion** </span>

In [9]:
# Feature: number of events each device produced in its window (0 when it has none).
cant_evt_18_20 = rh_trn.copy()
cant_evt_21_23 = rh_tst.copy()

# groupby(...).size() yields explicit, version-independent column names. The previous
# value_counts().to_frame().reset_index().rename(...) chain depended on the pre-2.0
# naming of value_counts output and loses the "ref_hash" merge key on pandas >= 2.0.
evt_count_18_20 = evt_18_20.groupby( "ref_hash" ).size().rename( "cant_evt" ).reset_index()
evt_count_21_23 = evt_21_23.groupby( "ref_hash" ).size().rename( "cant_evt" ).reset_index()

# Devices absent from the event log get NaN from the left merge -> 0 events.
cant_evt_18_20 = cant_evt_18_20.merge( evt_count_18_20, how = "left", on = "ref_hash" ).fillna( 0 )
cant_evt_21_23 = cant_evt_21_23.merge( evt_count_21_23, how = "left", on = "ref_hash" ).fillna( 0 )

cant_evt_18_20.to_csv( loc_ftr + "\\cant_evt_trn.csv", index = False )
cant_evt_21_23.to_csv( loc_ftr + "\\cant_evt_tst.csv", index = False )

## <span style="color:green"> **Cantidad de eventos atribuidas por dispositivo** </span> 

In [10]:
# Feature: number of attributed events per device (0 when the device has no events).
cevt_atr_18_20 = rh_trn.copy()
cevt_atr_21_23 = rh_tst.copy()

# Turn the "attributed" flag into 1/0 (any truthy value counts as 1) and add it up per device.
cant_atr_evt_18_20 = (
    evt_18_20[["ref_hash", "attributed"]]
    .assign( attributed = evt_18_20["attributed"].map( lambda flag: 1 if flag else 0 ) )
    .groupby( "ref_hash" )
    .sum()
)
cant_atr_evt_21_23 = (
    evt_21_23[["ref_hash", "attributed"]]
    .assign( attributed = evt_21_23["attributed"].map( lambda flag: 1 if flag else 0 ) )
    .groupby( "ref_hash" )
    .sum()
)

# The aggregated frames are indexed by ref_hash; merge resolves "ref_hash" against that index level.
cevt_atr_18_20 = cevt_atr_18_20.merge( cant_atr_evt_18_20, how = "left", on = "ref_hash" )
cevt_atr_18_20 = cevt_atr_18_20.fillna( 0 )
cevt_atr_21_23 = cevt_atr_21_23.merge( cant_atr_evt_21_23, how = "left", on = "ref_hash" )
cevt_atr_21_23 = cevt_atr_21_23.fillna( 0 )

cevt_atr_18_20.to_csv( loc_ftr + "\\cevt_atr_trn.csv", index = False )
cevt_atr_21_23.to_csv( loc_ftr + "\\cevt_atr_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta el primer evento en esa ventana** </span> 

Le asignamos a cada dispositivo cuánto tiempo tardó, dentro de su ventana (18-20 para entrenamiento, 21-23 para test), en realizar su primer evento.

In [11]:
# Feature: seconds elapsed between the start of the window and the device's first event.
frst_evt_18_20 = rh_trn.copy()
frst_evt_21_23 = rh_tst.copy()

# Time origin of each window.
window_start_18_20 = dt.datetime( 2019, 4, 18 )
window_start_21_23 = dt.datetime( 2019, 4, 21 )

# Earliest event per device: sort by date and keep the first row of every ref_hash.
first_event_rh_ins_18_20 = evt_18_20[ ["ref_hash","date"] ].sort_values( by = "date" ).drop_duplicates( "ref_hash", keep = "first" )
first_event_rh_ins_21_23 = evt_21_23[ ["ref_hash","date"] ].sort_values( by = "date" ).drop_duplicates( "ref_hash", keep = "first" )

# Replace the timestamp by its offset (in seconds) from the window start.
first_event_rh_ins_18_20 = first_event_rh_ins_18_20.assign(
    time_to_frt_evt = ( pd.to_datetime( first_event_rh_ins_18_20["date"] ) - window_start_18_20 ).dt.total_seconds()
).drop( columns = "date" )
first_event_rh_ins_21_23 = first_event_rh_ins_21_23.assign(
    time_to_frt_evt = ( pd.to_datetime( first_event_rh_ins_21_23["date"] ) - window_start_21_23 ).dt.total_seconds()
).drop( columns = "date" )

# Rows left as NaN by the merge (device has no event in the window) are filled
# with 3 days expressed in seconds.
frst_evt_18_20 = frst_evt_18_20.merge( first_event_rh_ins_18_20, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )
frst_evt_21_23 = frst_evt_21_23.merge( first_event_rh_ins_21_23, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )

frst_evt_18_20.to_csv( loc_ftr + "\\frst_evt_trn.csv", index = False )
frst_evt_21_23.to_csv( loc_ftr + "\\frst_evt_tst.csv", index = False )

## <span style="color:green"> **Tipo de evento mas realizado por el dispositivo** </span> 

#### <span style="color:orange"> **Mean Encoding** </span> (Lo codificamos haciendo la cantidad de eventos que se realizaron de ese tipo sobre el total de eventos realizados)

In [12]:
# Feature: for each device, the share of ALL events in the window that belong to
# the device's most frequent event kind.
kind_evt_18_20 = rh_trn.copy()
kind_evt_21_23 = rh_tst.copy()

# Number of events per (device, kind) pair.
main_kind_evt_18_20 = evt_18_20.groupby( ["ref_hash","kind"] )["kind"].count().reset_index( name = "cant_evt" )
main_kind_evt_21_23 = evt_21_23.groupby( ["ref_hash","kind"] )["kind"].count().reset_index( name = "cant_evt" )

# After the ascending sort the last row of every device is its most frequent kind.
main_kind_evt_18_20 = main_kind_evt_18_20.sort_values( by = ["ref_hash", "cant_evt"], ascending = True ).drop_duplicates( subset = "ref_hash", keep = "last" ).drop( columns = "cant_evt" )
main_kind_evt_21_23 = main_kind_evt_21_23.sort_values( by = ["ref_hash", "cant_evt"], ascending = True ).drop_duplicates( subset = "ref_hash", keep = "last" ).drop( columns = "cant_evt" )

kind_evt_18_20 = kind_evt_18_20.merge( main_kind_evt_18_20, how = "left", on = "ref_hash" )
kind_evt_21_23 = kind_evt_21_23.merge( main_kind_evt_21_23, how = "left", on = "ref_hash" )

# Total number of events of every kind in the window.
cant_kind_evt_18_20 = evt_18_20.groupby( "kind" )["ref_hash"].count().reset_index( name = "cant_kind" )
cant_kind_evt_21_23 = evt_21_23.groupby( "kind" )["ref_hash"].count().reset_index( name = "cant_kind" )

# Swap the kind label for its global count.
kind_evt_18_20 = kind_evt_18_20.merge( cant_kind_evt_18_20, how = "left", on = "kind" ).drop( columns = "kind" )
kind_evt_21_23 = kind_evt_21_23.merge( cant_kind_evt_21_23, how = "left", on = "kind" ).drop( columns = "kind" )

# Normalise by the number of events in the window.
total_evt_18_20 = len( evt_18_20 )
total_evt_21_23 = len( evt_21_23 )
kind_evt_18_20["kind_mean"] = kind_evt_18_20["cant_kind"] / total_evt_18_20
kind_evt_21_23["kind_mean"] = kind_evt_21_23["cant_kind"] / total_evt_21_23

# Missing values are filled with (number of rows without a value) / (number of events).
fill_18_20 = kind_evt_18_20["kind_mean"].isnull().sum() / total_evt_18_20
fill_21_23 = kind_evt_21_23["kind_mean"].isnull().sum() / total_evt_21_23
kind_evt_18_20 = kind_evt_18_20.fillna( fill_18_20 ).drop( columns = "cant_kind" )
kind_evt_21_23 = kind_evt_21_23.fillna( fill_21_23 ).drop( columns = "cant_kind" )

kind_evt_18_20.to_csv( loc_ftr + "\\kind_evt_trn.csv", index = False )
kind_evt_21_23.to_csv( loc_ftr + "\\kind_evt_tst.csv", index = False )

## <span style="color:green"> **Aplicaciones que mas realizaron eventos cada usuario** </span>

#### <span style="color:orange"> **Mean Encoding** </span> (Usamos el promedio de la cantidad de veces que es la app principal de algún dispositivo)

In [14]:
# Feature: each device's main application (the one with most events), encoded as the
# share of devices in the set that have that same main application.
# NOTE: the output column keeps the name "application_id" even though it holds the share.
mapp_evt_18_20 = rh_trn.copy()
mapp_evt_21_23 = rh_tst.copy()

# Number of events per (device, application) pair.
app_mas_evt_18_20 = evt_18_20.groupby( by = ["ref_hash","application_id"] ).agg( {"application_id":"count"} ).rename( columns = {"application_id":"cant_evt"} ).reset_index()
app_mas_evt_21_23 = evt_21_23.groupby( by = ["ref_hash","application_id"] ).agg( {"application_id":"count"} ).rename( columns = {"application_id":"cant_evt"} ).reset_index()

# After the ascending sort the last row of every device is its most used application.
app_mas_evt_18_20 = app_mas_evt_18_20.sort_values( by = ["ref_hash", "cant_evt"], ascending = True ).drop_duplicates( subset = "ref_hash", keep = "last" ).drop( columns = "cant_evt" )
app_mas_evt_21_23 = app_mas_evt_21_23.sort_values( by = ["ref_hash", "cant_evt"], ascending = True ).drop_duplicates( subset = "ref_hash", keep = "last" ).drop( columns = "cant_evt" )

mapp_evt_18_20 = mapp_evt_18_20.merge( app_mas_evt_18_20, how = "left", on = "ref_hash" )
mapp_evt_21_23 = mapp_evt_21_23.merge( app_mas_evt_21_23, how = "left", on = "ref_hash" )

# Share of rows whose main application is each id (value_counts ignores NaN, so
# devices without events stay NaN after the mapping).
app_share_18_20 = mapp_evt_18_20["application_id"].value_counts() / len( mapp_evt_18_20 )
app_share_21_23 = mapp_evt_21_23["application_id"].value_counts() / len( mapp_evt_21_23 )
mapp_evt_18_20["application_id"] = mapp_evt_18_20["application_id"].map( app_share_18_20 )
mapp_evt_21_23["application_id"] = mapp_evt_21_23["application_id"].map( app_share_21_23 )

# Devices without events are treated as one extra category and receive its own share.
# The result is assigned back explicitly: calling fillna( inplace = True ) on a selected
# column is chained assignment and does not update the frame under pandas Copy-on-Write.
no_app_share_18_20 = mapp_evt_18_20["application_id"].isnull().sum() / len( mapp_evt_18_20 )
no_app_share_21_23 = mapp_evt_21_23["application_id"].isnull().sum() / len( mapp_evt_21_23 )
mapp_evt_18_20["application_id"] = mapp_evt_18_20["application_id"].fillna( no_app_share_18_20 )
mapp_evt_21_23["application_id"] = mapp_evt_21_23["application_id"].fillna( no_app_share_21_23 )

mapp_evt_18_20.to_csv( loc_ftr + "\\mapp_evt_trn.csv", index = False )
mapp_evt_21_23.to_csv( loc_ftr + "\\mapp_evt_tst.csv", index = False )

## <span style="color:green"> **Cantidad de eventos realizados con WiFi** </span>  

In [15]:
# Feature: number of events each device made over WiFi (0 when the device has no events).
wifi_evt_18_20 = rh_trn.copy()
wifi_evt_21_23 = rh_tst.copy()

# Turn the "wifi" flag into 1/0 (any truthy value counts as 1) and add it up per device.
cant_wifi_evt_18_20 = (
    evt_18_20[["ref_hash", "wifi"]]
    .assign( wifi = evt_18_20["wifi"].map( lambda flag: 1 if flag else 0 ) )
    .groupby( "ref_hash" )
    .sum()
)
cant_wifi_evt_21_23 = (
    evt_21_23[["ref_hash", "wifi"]]
    .assign( wifi = evt_21_23["wifi"].map( lambda flag: 1 if flag else 0 ) )
    .groupby( "ref_hash" )
    .sum()
)

# The aggregated frames are indexed by ref_hash; merge resolves "ref_hash" against that index level.
wifi_evt_18_20 = wifi_evt_18_20.merge( cant_wifi_evt_18_20, how = "left", on = "ref_hash" )
wifi_evt_18_20 = wifi_evt_18_20.fillna( 0 )
wifi_evt_21_23 = wifi_evt_21_23.merge( cant_wifi_evt_21_23, how = "left", on = "ref_hash" )
wifi_evt_21_23 = wifi_evt_21_23.fillna( 0 )

wifi_evt_18_20.to_csv( loc_ftr + "\\wifi_evt_trn.csv", index = False )
wifi_evt_21_23.to_csv( loc_ftr + "\\wifi_evt_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta el último evento en esa ventana** </span> 

In [17]:
# Feature: seconds elapsed between the start of the window and the device's last event.
last_evt_18_20 = rh_trn.copy()
last_evt_21_23 = rh_tst.copy()

# Time origin of each window.
window_start_18_20 = dt.datetime( 2019, 4, 18 )
window_start_21_23 = dt.datetime( 2019, 4, 21 )

# Latest event per device: sort by date descending and keep the first row of every ref_hash.
last_event_rh_ins_18_20 = evt_18_20[ ["ref_hash","date"] ].sort_values( by = "date", ascending = False ).drop_duplicates( "ref_hash", keep = "first" )
last_event_rh_ins_21_23 = evt_21_23[ ["ref_hash","date"] ].sort_values( by = "date", ascending = False ).drop_duplicates( "ref_hash", keep = "first" )

# Replace the timestamp by its offset (in seconds) from the window start.
last_event_rh_ins_18_20 = last_event_rh_ins_18_20.assign(
    time_to_lst_evt = ( pd.to_datetime( last_event_rh_ins_18_20["date"] ) - window_start_18_20 ).dt.total_seconds()
).drop( columns = "date" )
last_event_rh_ins_21_23 = last_event_rh_ins_21_23.assign(
    time_to_lst_evt = ( pd.to_datetime( last_event_rh_ins_21_23["date"] ) - window_start_21_23 ).dt.total_seconds()
).drop( columns = "date" )

# Rows left as NaN by the merge (device has no event in the window) are filled
# with 3 days expressed in seconds.
last_evt_18_20 = last_evt_18_20.merge( last_event_rh_ins_18_20, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )
last_evt_21_23 = last_evt_21_23.merge( last_event_rh_ins_21_23, how = "left", on = "ref_hash" ).fillna( 3 * 24 * 3600 )

last_evt_18_20.to_csv( loc_ftr + "\\last_evt_trn.csv", index = False )
last_evt_21_23.to_csv( loc_ftr + "\\last_evt_tst.csv", index = False )

## <span style="color:green"> **Hizo eventos entre 21 hs y 3 hs (Noche)** </span>

#### <span style="color:Orange"> **One-Hot Encoding** </span> 

In [18]:
# Feature: binary flag for night-time activity (hours 21-23 and 0-3) per device.
evt_18_20['date'] = pd.to_datetime(evt_18_20['date'])
evt_21_23['date'] = pd.to_datetime(evt_21_23['date'])

# Per-event mask, each built from its OWN window's timestamps. The 21-23 mask used to
# read evt_18_20['date'] for the "> 20" half, mixing rows of two unrelated frames.
hour_18_20 = evt_18_20['date'].dt.hour
hour_21_23 = evt_21_23['date'].dt.hour
is_night_18_20 = (hour_18_20 < 4) | (hour_18_20 > 20)
is_night_21_23 = (hour_21_23 < 4) | (hour_21_23 > 20)

evt_night_18_20 = rh_trn.copy()
evt_night_21_23 = rh_tst.copy()

# Number of night events per device (computed without adding a temporary column to the event frames).
hour_mode_18_20 = is_night_18_20.groupby( evt_18_20['ref_hash'] ).sum().rename( 'evt_21_3' ).reset_index()
hour_mode_21_23 = is_night_21_23.groupby( evt_21_23['ref_hash'] ).sum().rename( 'evt_21_3' ).reset_index()

evt_night_18_20 = evt_night_18_20.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
evt_night_21_23 = evt_night_21_23.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# NOTE(review): "> 1" flags devices with at least TWO night events; use "> 0" if a single
# event should count. Devices without events (NaN) compare False and end up as 0.
evt_night_18_20["evt_21_3"] =  (evt_night_18_20["evt_21_3"] > 1).astype('int8')
evt_night_21_23["evt_21_3"] =  (evt_night_21_23["evt_21_3"] > 1).astype('int8')

evt_night_18_20.to_csv( loc_ftr + "\\evt_nght_trn.csv", index = False )
evt_night_21_23.to_csv( loc_ftr + "\\evt_nght_tst.csv", index = False )

## <span style="color:green"> **Hizo eventos entre 4 hs y 10 hs (Mañana)** </span>

#### <span style="color:Orange"> **One-Hot Encoding** </span> 

In [19]:
# Feature: binary flag for morning activity (hours 4-10) per device.
evt_18_20['date'] = pd.to_datetime(evt_18_20['date'])
evt_21_23['date'] = pd.to_datetime(evt_21_23['date'])

# Per-event mask, each built from its OWN window's timestamps. The 21-23 mask used to
# read evt_18_20['date'] for the "> 3" half, mixing rows of two unrelated frames.
hour_18_20 = evt_18_20['date'].dt.hour
hour_21_23 = evt_21_23['date'].dt.hour
is_morning_18_20 = (hour_18_20 < 11) & (hour_18_20 > 3)
is_morning_21_23 = (hour_21_23 < 11) & (hour_21_23 > 3)

evt_morn_18_20 = rh_trn.copy()
evt_morn_21_23 = rh_tst.copy()

# Number of morning events per device (computed without adding a temporary column to the event frames).
hour_mode_18_20 = is_morning_18_20.groupby( evt_18_20['ref_hash'] ).sum().rename( 'evt_4_10' ).reset_index()
hour_mode_21_23 = is_morning_21_23.groupby( evt_21_23['ref_hash'] ).sum().rename( 'evt_4_10' ).reset_index()

evt_morn_18_20 = evt_morn_18_20.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
evt_morn_21_23 = evt_morn_21_23.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# NOTE(review): "> 1" flags devices with at least TWO morning events; use "> 0" if a single
# event should count. Devices without events (NaN) compare False and end up as 0.
evt_morn_18_20["evt_4_10"] =  (evt_morn_18_20["evt_4_10"] > 1).astype('int8')
evt_morn_21_23["evt_4_10"] =  (evt_morn_21_23["evt_4_10"] > 1).astype('int8')

evt_morn_18_20.to_csv( loc_ftr + "\\evt_morn_trn.csv", index = False )
evt_morn_21_23.to_csv( loc_ftr + "\\evt_morn_tst.csv", index = False )

## <span style="color:green"> **Hizo eventos entre 11 hs y 15 hs (Mediodia)** </span>

#### <span style="color:Orange"> **One-Hot Encoding** </span> 

In [20]:
# Feature: binary flag for midday activity (hours 11-15) per device.
evt_18_20['date'] = pd.to_datetime(evt_18_20['date'])
evt_21_23['date'] = pd.to_datetime(evt_21_23['date'])

# Per-event mask, each built from its OWN window's timestamps. The 21-23 mask used to
# read evt_18_20['date'] for the "> 10" half, mixing rows of two unrelated frames.
hour_18_20 = evt_18_20['date'].dt.hour
hour_21_23 = evt_21_23['date'].dt.hour
is_midday_18_20 = (hour_18_20 < 16) & (hour_18_20 > 10)
is_midday_21_23 = (hour_21_23 < 16) & (hour_21_23 > 10)

evt_midday_18_20 = rh_trn.copy()
evt_midday_21_23 = rh_tst.copy()

# Number of midday events per device (computed without adding a temporary column to the event frames).
hour_mode_18_20 = is_midday_18_20.groupby( evt_18_20['ref_hash'] ).sum().rename( 'evt_11_15' ).reset_index()
hour_mode_21_23 = is_midday_21_23.groupby( evt_21_23['ref_hash'] ).sum().rename( 'evt_11_15' ).reset_index()

evt_midday_18_20 = evt_midday_18_20.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
evt_midday_21_23 = evt_midday_21_23.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# NOTE(review): "> 1" flags devices with at least TWO midday events; use "> 0" if a single
# event should count. Devices without events (NaN) compare False and end up as 0.
evt_midday_18_20["evt_11_15"] =  (evt_midday_18_20["evt_11_15"] > 1).astype('int8')
evt_midday_21_23["evt_11_15"] =  (evt_midday_21_23["evt_11_15"] > 1).astype('int8')

# The 18-20 frame is built on rh_trn and the 21-23 frame on rh_tst, like every other
# feature; the two file names used to be swapped here.
evt_midday_18_20.to_csv( loc_ftr + "\\evt_mday_trn.csv", index = False )
evt_midday_21_23.to_csv( loc_ftr + "\\evt_mday_tst.csv", index = False )

## <span style="color:green"> **Hizo eventos entre 16 hs y 20 hs (Tarde)** </span>

#### <span style="color:Orange"> **One-Hot Encoding** </span> 

In [21]:
# Feature: binary flag for afternoon activity (hours 16-20) per device.
evt_18_20['date'] = pd.to_datetime(evt_18_20['date'])
evt_21_23['date'] = pd.to_datetime(evt_21_23['date'])

# Per-event mask, each built from its OWN window's timestamps. The 21-23 mask used to
# read evt_18_20['date'] for the "> 15" half, mixing rows of two unrelated frames.
hour_18_20 = evt_18_20['date'].dt.hour
hour_21_23 = evt_21_23['date'].dt.hour
is_afternoon_18_20 = (hour_18_20 < 21) & (hour_18_20 > 15)
is_afternoon_21_23 = (hour_21_23 < 21) & (hour_21_23 > 15)

evt_after_18_20 = rh_trn.copy()
evt_after_21_23 = rh_tst.copy()

# Number of afternoon events per device (computed without adding a temporary column to the event frames).
hour_mode_18_20 = is_afternoon_18_20.groupby( evt_18_20['ref_hash'] ).sum().rename( 'evt_16_20' ).reset_index()
hour_mode_21_23 = is_afternoon_21_23.groupby( evt_21_23['ref_hash'] ).sum().rename( 'evt_16_20' ).reset_index()

evt_after_18_20 = evt_after_18_20.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
evt_after_21_23 = evt_after_21_23.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# NOTE(review): "> 1" flags devices with at least TWO afternoon events; use "> 0" if a single
# event should count. Devices without events (NaN) compare False and end up as 0.
evt_after_18_20["evt_16_20"] =  (evt_after_18_20["evt_16_20"] > 1).astype('int8')
evt_after_21_23["evt_16_20"] =  (evt_after_21_23["evt_16_20"] > 1).astype('int8')

evt_after_18_20.to_csv( loc_ftr + "\\evt_aftr_trn.csv", index = False )
evt_after_21_23.to_csv( loc_ftr + "\\evt_aftr_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta que el usuario realizó algún evento del TOP 3** </span>

In [9]:
# Feature: seconds from the start of the window until the device's first event whose
# kind is one of the three most frequent kinds of that window.
# NOTE: the output column keeps the name "date" even though it holds elapsed seconds.
tt_3_evt_18_20 = rh_trn.copy()
tt_3_evt_21_23 = rh_tst.copy()

# value_counts() sorts by frequency, so the first three index entries are the top-3 kinds.
top3_kinds_18_20 = evt_18_20['kind'].value_counts().index[:3]
top3_kinds_21_23 = evt_21_23['kind'].value_counts().index[:3]

# Earliest top-3 event per device.
k1 = evt_18_20.loc[ evt_18_20['kind'].isin( top3_kinds_18_20 ), ["ref_hash", "date"] ].sort_values( by = "date", ascending = True ).drop_duplicates( subset = ["ref_hash"], keep = "first" )
k2 = evt_21_23.loc[ evt_21_23['kind'].isin( top3_kinds_21_23 ), ["ref_hash", "date"] ].sort_values( by = "date", ascending = True ).drop_duplicates( subset = ["ref_hash"], keep = "first" )

tt_3_evt_18_20 = tt_3_evt_18_20.merge( k1, how = "left", on = "ref_hash" )
tt_3_evt_21_23 = tt_3_evt_21_23.merge( k2, how = "left", on = "ref_hash" )

tt_3_evt_18_20["date"] = ( pd.to_datetime( tt_3_evt_18_20["date"] ) - dt.datetime( 2019, 4, 18 ) ).dt.total_seconds()
tt_3_evt_21_23["date"] = ( pd.to_datetime( tt_3_evt_21_23["date"] ) - dt.datetime( 2019, 4, 21 ) ).dt.total_seconds()

# NaN means the device made no top-3 event in the window: fill with 3 days in seconds.
tt_3_evt_18_20 = tt_3_evt_18_20.fillna( 3*24*3600 )
tt_3_evt_21_23 = tt_3_evt_21_23.fillna( 3*24*3600 )

# The training file used to be written from the 21-23 (test) frame, which made both
# CSVs identical and discarded the training feature.
tt_3_evt_18_20.to_csv( loc_ftr + "\\tt_3_evt_trn.csv", index = False )
tt_3_evt_21_23.to_csv( loc_ftr + "\\tt_3_evt_tst.csv", index = False )

## <span style="color:green"> **Realizo algun evento del TOP 3** </span>
Se corroboro inicialmente que los top3 tipos de eventos coinciden en las 3 ventanas

#### <span style="color:orange"> **OneHotEncoding** </span>

In [22]:
# Feature: three binary flags per device, one for each of the three most frequent event
# kinds of the window: 1 if the device made at least one event of that kind, else 0.
# Columns evt1..evt3 follow the sorted order of the kind values (not the frequency rank).
top3_evt_18_20 = rh_trn.copy()
top3_evt_21_23 = rh_tst.copy()

flag_cols = ["evt1", "evt2", "evt3"]

# value_counts() sorts by frequency, so the first three index entries are the top-3 kinds.
top3_kinds_18_20 = evt_18_20['kind'].value_counts().index[:3]
top3_kinds_21_23 = evt_21_23['kind'].value_counts().index[:3]

k1 = evt_18_20.loc[ evt_18_20['kind'].isin( top3_kinds_18_20 ), ["ref_hash", "kind"] ]
k2 = evt_21_23.loc[ evt_21_23['kind'].isin( top3_kinds_21_23 ), ["ref_hash", "kind"] ]

# Device x kind count table with flat column names. This replaces the previous
# agg( value_counts ).unstack() result, whose two-level columns cannot be merged
# reliably with the single-level ref_hash frame.
k1 = pd.crosstab( k1["ref_hash"], k1["kind"] ); k1.columns = flag_cols; k1 = k1.reset_index()
k2 = pd.crosstab( k2["ref_hash"], k2["kind"] ); k2.columns = flag_cols; k2 = k2.reset_index()

# Devices without any top-3 event get NaN from the left merge -> 0.
top3_evt_18_20 = top3_evt_18_20.merge( k1, how = "left", on = "ref_hash" ).fillna( 0 )
top3_evt_21_23 = top3_evt_21_23.merge( k2, how = "left", on = "ref_hash" ).fillna( 0 )

# Counts -> 0/1 flags, every flag derived from its own column (evt2 of the 21-23 frame
# used to be computed from evt1).
top3_evt_18_20[flag_cols] = ( top3_evt_18_20[flag_cols] > 0 ).astype( int )
top3_evt_21_23[flag_cols] = ( top3_evt_21_23[flag_cols] > 0 ).astype( int )

top3_evt_18_20.to_csv( loc_ftr + "\\top3_evt_trn.csv", index = False )
top3_evt_21_23.to_csv( loc_ftr + "\\top3_evt_tst.csv", index = False )

## <span style="color:green"> **Realizo un evento de la aplicacion: 210** </span>
Esta aplicacion es la mas popular respecto a eventos, con mas del 25% del total

In [23]:
# Feature: 1 if the device made at least one event with application_id == 210 in its
# window, else 0 (devices without events also get 0).
evt_a210_18_20 = rh_trn.copy()
evt_a210_21_23 = rh_tst.copy()

# Collapse the per-event indicator to ONE row per device before merging. Merging the raw
# per-event rows produced one output row per event, so the file did not line up
# row-for-row with the other feature files.
do_evt_app_210_18_20 = evt_18_20["application_id"].eq( 210 ).astype( int ).groupby( evt_18_20["ref_hash"] ).max().rename( "do_it_210" ).reset_index()
do_evt_app_210_21_23 = evt_21_23["application_id"].eq( 210 ).astype( int ).groupby( evt_21_23["ref_hash"] ).max().rename( "do_it_210" ).reset_index()

evt_a210_18_20 = evt_a210_18_20.merge( do_evt_app_210_18_20, how = "left", on = "ref_hash" ).fillna( 0 ).astype( {"do_it_210": int} )
evt_a210_21_23 = evt_a210_21_23.merge( do_evt_app_210_21_23, how = "left", on = "ref_hash" ).fillna( 0 ).astype( {"do_it_210": int} )

evt_a210_18_20.to_csv( loc_ftr + "\\evt_a210_trn.csv", index = False )
evt_a210_21_23.to_csv( loc_ftr + "\\evt_a210_tst.csv", index = False )

## <span style="color:green"> **Realizo un evento de ID: 1** </span>
Esta id de evento es la mas popular en todas las ventanas

In [24]:
# Feature: 1 if the device made at least one event with event_id == 1 in its window,
# else 0 (devices without events also get 0).
evt_id01_18_20 = rh_trn.copy()
evt_id01_21_23 = rh_tst.copy()

# Collapse the per-event indicator to ONE row per device before merging. Merging the raw
# per-event rows produced one output row per event.
do_evt_id_1_18_20 = evt_18_20["event_id"].eq( 1 ).astype( int ).groupby( evt_18_20["ref_hash"] ).max().rename( "do_id_1" ).reset_index()
do_evt_id_1_21_23 = evt_21_23["event_id"].eq( 1 ).astype( int ).groupby( evt_21_23["ref_hash"] ).max().rename( "do_id_1" ).reset_index()

evt_id01_18_20 = evt_id01_18_20.merge( do_evt_id_1_18_20, how = "left", on = "ref_hash" ).fillna( 0 ).astype( {"do_id_1": int} )
evt_id01_21_23 = evt_id01_21_23.merge( do_evt_id_1_21_23, how = "left", on = "ref_hash" ).fillna( 0 ).astype( {"do_id_1": int} )

# Save the event-id frames; the application-210 frames used to be written here by
# mistake, so these files never contained this feature.
evt_id01_18_20.to_csv( loc_ftr + "\\evt_id01_trn.csv", index = False )
evt_id01_21_23.to_csv( loc_ftr + "\\evt_id01_tst.csv", index = False )