In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings as wn
import sklearn.preprocessing as skpre
import category_encoders as ce

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by the cells below.
wn.simplefilter( "ignore" )

In [2]:
# Output directory: every feature table built in this notebook is written here as CSV.
# NOTE(review): absolute, machine-specific Windows path; must be edited to run elsewhere.
loc_ftr = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Features\FeaturesSC"

## <span style="color:yellow"> **Preparamos el set de entrenamiento para sacar features** </span> 

In [3]:
# Input directory holding the per-window install CSVs (ins_18_20.csv, ins_21_23.csv).
# NOTE(review): absolute, machine-specific Windows path; must be edited to run elsewhere.
loc_ts = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Training Sets"

In [4]:
# Load the installs of both windows.
ins_18_20 = pd.read_csv( loc_ts + "\\ins_18_20.csv" )
ins_21_23 = pd.read_csv( loc_ts + "\\ins_21_23.csv" )

# "_sc": seconds elapsed between the reference date of the window
# (2019-04-18 and 2019-04-21 respectively) and the install timestamp.
ins_18_20["_sc"] = ( pd.to_datetime( ins_18_20["date"] ) - dt.datetime( 2019, 4, 18 ) ).dt.total_seconds()
ins_21_23["_sc"] = ( pd.to_datetime( ins_21_23["date"] ) - dt.datetime( 2019, 4, 21 ) ).dt.total_seconds()

In [5]:
# Keep, for every device (ref_hash), only its earliest install of the window:
# order by device and elapsed seconds, then retain the first row of each device.
ins_18_20_frt = ins_18_20.sort_values( ["ref_hash", "_sc"] ).drop_duplicates( "ref_hash", keep = "first" )
ins_21_23_frt = ins_21_23.sort_values( ["ref_hash", "_sc"] ).drop_duplicates( "ref_hash", keep = "first" )

## <span style="color:yellow"> **Preparamos el ref_hash de cada ventana** </span> 

In [6]:
# Input directory holding the label CSVs (label_ins_21_23.csv, label_ins_24_26.csv).
# NOTE(review): absolute, machine-specific Windows path; must be edited to run elsewhere.
loc_lb = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning_v4\Labels"

In [7]:
# Label tables: one per target window (21-23 is used for training, 24-26 for testing).
lb_ins_21_23 = pd.read_csv( loc_lb + "\\label_ins_21_23.csv" )
lb_ins_24_26 = pd.read_csv( loc_lb + "\\label_ins_24_26.csv" )

In [8]:
# Device identifiers of each label window; every feature table below starts from these frames.
rh_trn = lb_ins_21_23.loc[ :, ["ref_hash"] ]
rh_tst = lb_ins_24_26.loc[ :, ["ref_hash"] ]

# Target column of each label window, kept as single-column DataFrames.
tg_trn = lb_ins_21_23.loc[ :, ["21_23_sc"] ]
tg_tst = lb_ins_24_26.loc[ :, ["24_26_sc"] ]

## <span style="color:yellow"> **=======================================================================================================** </span> 

## <span style="color:green"> **Cantidad de instalaciones por dispositivo en la ventana previa a la conversion** </span> 

In [9]:
# Number of installs per device in the window preceding the label window.
#
# The counts are built with groupby().size().reset_index(name=...) instead of
# value_counts().to_frame().reset_index().rename(...): the latter relies on the
# column names that value_counts produced before pandas 2.0 ("index"/"ref_hash").
# With pandas >= 2.0 the rename would turn the key column into "cant_ins" and the
# merge on "ref_hash" would fail.
ins_count_18_20 = ins_18_20.groupby( "ref_hash" ).size().reset_index( name = "cant_ins" )
ins_count_21_23 = ins_21_23.groupby( "ref_hash" ).size().reset_index( name = "cant_ins" )

# Left merge keeps one row per label device, in label order; devices without
# installs in the previous window get 0.
cant_ins_trn = rh_trn.merge( ins_count_18_20, how = "left", on = "ref_hash" ).fillna( 0 )
cant_ins_tst = rh_tst.merge( ins_count_21_23, how = "left", on = "ref_hash" ).fillna( 0 )

cant_ins_trn.to_csv( loc_ftr + "\\cant_ins_trn.csv", index = False )
cant_ins_tst.to_csv( loc_ftr + "\\cant_ins_tst.csv", index = False )

## <span style="color:green"> **Aplicacion mas instalada por el usuario** </span>

#### <span style="color:orange"> **Mean Encoding** </span> (Usamos el promedio de la cantidad de veces que es la app principal de algun dispositivo)

In [10]:
# Main application per device: the application_id the device installed most often
# in the previous window, encoded as the share of label devices that have the
# same main application.

# Installs per (device, application).
app_mas_ins_trn = ins_18_20.groupby( ["ref_hash", "application_id"] ).size().reset_index( name = "cant_ins" )
app_mas_ins_tst = ins_21_23.groupby( ["ref_hash", "application_id"] ).size().reset_index( name = "cant_ins" )

# Keep, for each device, the application with the highest install count
# (ascending sort + keep="last"), then drop the helper count column.
app_mas_ins_trn = app_mas_ins_trn.sort_values( ["ref_hash", "cant_ins"] ).drop_duplicates( "ref_hash", keep = "last" ).drop( columns = "cant_ins" )
app_mas_ins_tst = app_mas_ins_tst.sort_values( ["ref_hash", "cant_ins"] ).drop_duplicates( "ref_hash", keep = "last" ).drop( columns = "cant_ins" )

main_app_trn = rh_trn.merge( app_mas_ins_trn, how = "left", on = "ref_hash" )
main_app_tst = rh_tst.merge( app_mas_ins_tst, how = "left", on = "ref_hash" )

# Frequency of each main application among the label devices. Devices with no
# install in the previous window have a missing application_id and stay NaN here.
freq_trn = main_app_trn["application_id"].map( main_app_trn["application_id"].value_counts() ) / len( main_app_trn )
freq_tst = main_app_tst["application_id"].map( main_app_tst["application_id"].value_counts() ) / len( main_app_tst )

# Missing values are replaced by the share of devices with a missing main app.
# The original cell called fillna() without assigning its result, so the NaNs
# were written to the CSV; the filled series is now stored back.
main_app_trn["application_id"] = freq_trn.fillna( freq_trn.isna().mean() )
main_app_tst["application_id"] = freq_tst.fillna( freq_tst.isna().mean() )

main_app_trn.to_csv( loc_ftr + "\\main_app_trn.csv", index = False )
main_app_tst.to_csv( loc_ftr + "\\main_app_tst.csv", index = False )

## <span style="color:green"> **Cantidad de instalaciones implicitas por dispositivo** </span> 

In [11]:
# Number of implicit installs per device in the previous window.
#
# The original cell assigned ins_*["implicit"] directly onto the label frame.
# pandas aligns that assignment on the row index, so label row i received the flag
# of install row i, which belongs to an unrelated device. The flags are now summed
# per ref_hash on the installs table and then left-merged onto the label devices.
# As with the other features, the output has one row per label device, in label
# order, and 0 for devices without installs.
imp_18_20 = ins_18_20[["ref_hash"]].assign( cant_imp = ins_18_20["implicit"].astype( bool ).astype( int ) )
imp_21_23 = ins_21_23[["ref_hash"]].assign( cant_imp = ins_21_23["implicit"].astype( bool ).astype( int ) )

imp_18_20 = imp_18_20.groupby( "ref_hash", as_index = False )["cant_imp"].sum()
imp_21_23 = imp_21_23.groupby( "ref_hash", as_index = False )["cant_imp"].sum()

cins_imp_trn = rh_trn.merge( imp_18_20, how = "left", on = "ref_hash" ).fillna( 0 )
cins_imp_tst = rh_tst.merge( imp_21_23, how = "left", on = "ref_hash" ).fillna( 0 )

cins_imp_trn.to_csv( loc_ftr + "\\cins_imp_trn.csv", index = False )
cins_imp_tst.to_csv( loc_ftr + "\\cins_imp_tst.csv", index = False )

## <span style="color:green"> **Cantidad de instalaciones atribuidas por dispositivo** </span> 

In [12]:
# Number of attributed installs per device in the previous window.
#
# The original cell assigned ins_*["attributed"] directly onto the label frame.
# pandas aligns that assignment on the row index, so label row i received the flag
# of install row i, which belongs to an unrelated device. The flags are now summed
# per ref_hash on the installs table and then left-merged onto the label devices.
# As with the other features, the output has one row per label device, in label
# order, and 0 for devices without installs.
atr_18_20 = ins_18_20[["ref_hash"]].assign( cant_atr = ins_18_20["attributed"].astype( bool ).astype( int ) )
atr_21_23 = ins_21_23[["ref_hash"]].assign( cant_atr = ins_21_23["attributed"].astype( bool ).astype( int ) )

atr_18_20 = atr_18_20.groupby( "ref_hash", as_index = False )["cant_atr"].sum()
atr_21_23 = atr_21_23.groupby( "ref_hash", as_index = False )["cant_atr"].sum()

cins_atr_trn = rh_trn.merge( atr_18_20, how = "left", on = "ref_hash" ).fillna( 0 )
cins_atr_tst = rh_tst.merge( atr_21_23, how = "left", on = "ref_hash" ).fillna( 0 )

cins_atr_trn.to_csv( loc_ftr + "\\cins_atr_trn.csv", index = False )
cins_atr_tst.to_csv( loc_ftr + "\\cins_atr_tst.csv", index = False )

## <span style="color:green"> **Tiempo hasta la primera instalacion en la ventana anterior** </span>
Se define con el tiempo maximo si la persona no realizo una instalacion en la ventana anterior

In [13]:
# Start each feature table from an independent copy of the label devices.
frst_ins_trn = rh_trn.copy( deep = True )
frst_ins_tst = rh_tst.copy( deep = True )

In [14]:
# Device id plus the elapsed seconds ("_sc") of its earliest install in each window.
time_to_first_ind_18_20 = ins_18_20_frt.loc[ :, ["ref_hash", "_sc"] ]
time_to_first_ind_21_23 = ins_21_23_frt.loc[ :, ["ref_hash", "_sc"] ]

In [15]:
# Devices with no install in the previous window receive the full window length:
# 3 days expressed in seconds.
window_length_sc = 3*3600*24

frst_ins_trn = frst_ins_trn.merge( time_to_first_ind_18_20, how = "left", on = "ref_hash" )
frst_ins_trn = frst_ins_trn.fillna( window_length_sc )

frst_ins_tst = frst_ins_tst.merge( time_to_first_ind_21_23, how = "left", on = "ref_hash" )
frst_ins_tst = frst_ins_tst.fillna( window_length_sc )

In [17]:
# Persist the train/test "time to first install" tables as CSV (row index omitted)
# inside the features directory.
frst_ins_trn.to_csv( loc_ftr + "\\frst_ins_trn.csv", index = False )
frst_ins_tst.to_csv( loc_ftr + "\\frst_ins_tst.csv", index = False )

## <span style="color:green"> **Instalo entre 4 hs y 10 hs (Mañana)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [19]:
# Morning flag per device: 1 when the device has more than one install whose hour
# falls in 04..10 (inclusive) during the previous window, 0 otherwise.
#
# The install hour is parsed once per window and the indicator lives in a small
# temporary frame, so the shared ins_18_20 / ins_21_23 frames are never modified
# (the original added and later deleted a helper column on them).
#
# NOTE(review): the section title says "installed between 4 and 10", but `> 1`
# requires at least two installs in the range; confirm whether `> 0` was intended.
hour_18_20 = pd.to_datetime( ins_18_20["date"] ).dt.hour
hour_21_23 = pd.to_datetime( ins_21_23["date"] ).dt.hour

# Installs per device that fall inside the hour range.
hour_mode_18_20 = ins_18_20[["ref_hash"]].assign( ins_4_10 = hour_18_20.between( 4, 10 ) ).groupby( "ref_hash" ).agg( {"ins_4_10": "sum"} ).reset_index()
hour_mode_21_23 = ins_21_23[["ref_hash"]].assign( ins_4_10 = hour_21_23.between( 4, 10 ) ).groupby( "ref_hash" ).agg( {"ins_4_10": "sum"} ).reset_index()

ins_morn_18_20 = rh_trn.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
ins_morn_21_23 = rh_tst.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# Devices without installs have NaN after the merge; NaN > 1 is False, so they get 0.
ins_morn_18_20["ins_4_10"] = ( ins_morn_18_20["ins_4_10"] > 1 ).astype( "int8" )
ins_morn_21_23["ins_4_10"] = ( ins_morn_21_23["ins_4_10"] > 1 ).astype( "int8" )

ins_morn_18_20.to_csv( loc_ftr + "\\ins_morn_trn.csv", index = False )
ins_morn_21_23.to_csv( loc_ftr + "\\ins_morn_tst.csv", index = False )

## <span style="color:green"> **Instalo entre 11 hs y 15 hs (Medio dia)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [21]:
# Midday flag per device: 1 when the device has more than one install whose hour
# falls in 11..15 (inclusive) during the previous window, 0 otherwise.
#
# The install hour is parsed once per window and the indicator lives in a small
# temporary frame, so the shared ins_18_20 / ins_21_23 frames are never modified
# (the original added and later deleted a helper column on them).
#
# NOTE(review): the section title says "installed between 11 and 15", but `> 1`
# requires at least two installs in the range; confirm whether `> 0` was intended.
hour_18_20 = pd.to_datetime( ins_18_20["date"] ).dt.hour
hour_21_23 = pd.to_datetime( ins_21_23["date"] ).dt.hour

# Installs per device that fall inside the hour range.
hour_mode_18_20 = ins_18_20[["ref_hash"]].assign( ins_11_15 = hour_18_20.between( 11, 15 ) ).groupby( "ref_hash" ).agg( {"ins_11_15": "sum"} ).reset_index()
hour_mode_21_23 = ins_21_23[["ref_hash"]].assign( ins_11_15 = hour_21_23.between( 11, 15 ) ).groupby( "ref_hash" ).agg( {"ins_11_15": "sum"} ).reset_index()

ins_midday_18_20 = rh_trn.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
ins_midday_21_23 = rh_tst.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# Devices without installs have NaN after the merge; NaN > 1 is False, so they get 0.
ins_midday_18_20["ins_11_15"] = ( ins_midday_18_20["ins_11_15"] > 1 ).astype( "int8" )
ins_midday_21_23["ins_11_15"] = ( ins_midday_21_23["ins_11_15"] > 1 ).astype( "int8" )

ins_midday_18_20.to_csv( loc_ftr + "\\ins_mday_trn.csv", index = False )
ins_midday_21_23.to_csv( loc_ftr + "\\ins_mday_tst.csv", index = False )

## <span style="color:green"> **Instalo entre 16 hs y 20 hs (Tarde)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [23]:
# Afternoon flag per device: 1 when the device has more than one install whose hour
# falls in 16..20 (inclusive) during the previous window, 0 otherwise.
#
# The install hour is parsed once per window and the indicator lives in a small
# temporary frame, so the shared ins_18_20 / ins_21_23 frames are never modified
# (the original added and later deleted a helper column on them).
#
# NOTE(review): the section title says "installed between 16 and 20", but `> 1`
# requires at least two installs in the range; confirm whether `> 0` was intended.
hour_18_20 = pd.to_datetime( ins_18_20["date"] ).dt.hour
hour_21_23 = pd.to_datetime( ins_21_23["date"] ).dt.hour

# Installs per device that fall inside the hour range.
hour_mode_18_20 = ins_18_20[["ref_hash"]].assign( ins_16_20 = hour_18_20.between( 16, 20 ) ).groupby( "ref_hash" ).agg( {"ins_16_20": "sum"} ).reset_index()
hour_mode_21_23 = ins_21_23[["ref_hash"]].assign( ins_16_20 = hour_21_23.between( 16, 20 ) ).groupby( "ref_hash" ).agg( {"ins_16_20": "sum"} ).reset_index()

ins_after_18_20 = rh_trn.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
ins_after_21_23 = rh_tst.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# Devices without installs have NaN after the merge; NaN > 1 is False, so they get 0.
ins_after_18_20["ins_16_20"] = ( ins_after_18_20["ins_16_20"] > 1 ).astype( "int8" )
ins_after_21_23["ins_16_20"] = ( ins_after_21_23["ins_16_20"] > 1 ).astype( "int8" )

ins_after_18_20.to_csv( loc_ftr + "\\ins_aftr_trn.csv", index = False )
ins_after_21_23.to_csv( loc_ftr + "\\ins_aftr_tst.csv", index = False )

## <span style="color:green"> **Instalo entre 21 hs y 3 hs (Noche)** </span> 

#### <span style="color:Orange"> **One-hot Encoding** </span> 

In [24]:
# Night flag per device: 1 when the device has more than one install whose hour is
# 21..23 or 0..3 during the previous window, 0 otherwise.
#
# The install hour is parsed once per window and the indicator lives in a small
# temporary frame, so the shared ins_18_20 / ins_21_23 frames are never modified
# (the original added and later deleted a helper column on them).
#
# NOTE(review): the section title says "installed between 21 and 3", but `> 1`
# requires at least two installs in the range; confirm whether `> 0` was intended.
hour_18_20 = pd.to_datetime( ins_18_20["date"] ).dt.hour
hour_21_23 = pd.to_datetime( ins_21_23["date"] ).dt.hour

# The range wraps around midnight, hence the OR of the two ends.
night_18_20 = ( hour_18_20 >= 21 ) | ( hour_18_20 <= 3 )
night_21_23 = ( hour_21_23 >= 21 ) | ( hour_21_23 <= 3 )

# Installs per device that fall inside the hour range.
hour_mode_18_20 = ins_18_20[["ref_hash"]].assign( ins_21_3 = night_18_20 ).groupby( "ref_hash" ).agg( {"ins_21_3": "sum"} ).reset_index()
hour_mode_21_23 = ins_21_23[["ref_hash"]].assign( ins_21_3 = night_21_23 ).groupby( "ref_hash" ).agg( {"ins_21_3": "sum"} ).reset_index()

ins_night_18_20 = rh_trn.merge( hour_mode_18_20, how = "left", on = "ref_hash" )
ins_night_21_23 = rh_tst.merge( hour_mode_21_23, how = "left", on = "ref_hash" )

# Devices without installs have NaN after the merge; NaN > 1 is False, so they get 0.
ins_night_18_20["ins_21_3"] = ( ins_night_18_20["ins_21_3"] > 1 ).astype( "int8" )
ins_night_21_23["ins_21_3"] = ( ins_night_21_23["ins_21_3"] > 1 ).astype( "int8" )

ins_night_18_20.to_csv( loc_ftr + "\\ins_nght_trn.csv", index = False )
ins_night_21_23.to_csv( loc_ftr + "\\ins_nght_tst.csv", index = False )

## <span style="color:green"> **Realizo alguna instalacion del TOP 3 de apps** </span>
Se corroboro inicialmente que las top3 aplicaciones mas instaladas coinciden en las 3 ventanas

#### <span style="color:orange"> **OneHotEncoding** </span>

In [28]:
# One indicator column per top-3 application: 1 when the device installed that
# application at least once in the previous window, 0 otherwise.
#
# Changes with respect to the original cell:
#   * the test-window "app2" column was computed from "app1" (copy-paste slip);
#     every column is now derived from its own counts;
#   * the top-3 ids are computed once per window instead of re-running
#     value_counts() for every condition;
#   * the per-device counts come from pd.crosstab, which yields single-level
#     columns; the previous agg + unstack result had two-level columns, and
#     recent pandas versions refuse to merge those with single-level columns.
app_cols = ["app1", "app2", "app3"]

top3_ids_18_20 = ins_18_20["application_id"].value_counts().index[:3]
top3_ids_21_23 = ins_21_23["application_id"].value_counts().index[:3]

# Installs restricted to the three most installed applications of each window.
m1 = ins_18_20.loc[ ins_18_20["application_id"].isin( top3_ids_18_20 ), ["ref_hash", "application_id"] ]
m2 = ins_21_23.loc[ ins_21_23["application_id"].isin( top3_ids_21_23 ), ["ref_hash", "application_id"] ]

# Device x application install counts. crosstab sorts its columns, so app1..app3
# follow ascending application_id order (not popularity rank), as before.
m1 = pd.crosstab( m1["ref_hash"], m1["application_id"] ); m1.columns = app_cols
m2 = pd.crosstab( m2["ref_hash"], m2["application_id"] ); m2.columns = app_cols

top3_app_18_20 = rh_trn.merge( m1.reset_index(), how = "left", on = "ref_hash" )
top3_app_21_23 = rh_tst.merge( m2.reset_index(), how = "left", on = "ref_hash" )

# Counts -> 0/1 flags; devices missing from the crosstab are NaN and become 0.
top3_app_18_20[app_cols] = ( top3_app_18_20[app_cols] > 0 ).astype( int )
top3_app_21_23[app_cols] = ( top3_app_21_23[app_cols] > 0 ).astype( int )

top3_app_18_20.to_csv( loc_ftr + "\\top3_app_trn.csv", index = False )
top3_app_21_23.to_csv( loc_ftr + "\\top3_app_tst.csv", index = False )