In [1]:
import pandas as pd
import numpy as np
import warnings as wn
import datetime as dt

# Ignore every warning raised for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas warnings (e.g. chained-assignment
# or deprecation notices) — confirm this is intended.
wn.simplefilter("ignore")

In [2]:
# Explicit per-column dtypes, one mapping per input table.

# Auctions.
dtypes_auc = {
    "ref_type_id": np.int8,
    "source_id": np.int16,
    "device_id": np.int64,
}

# Installs: one integer id, every other listed column stored as a categorical.
dtypes_ins = {
    "application_id": np.int16,
    **dict.fromkeys(
        [
            "ref_type",
            "click_hash",
            "device_model",
            "device_countrycode",
            "device_brand",
            "session_user_agent",
            "kind",
            "wifi",
            "device_language",
        ],
        "category",
    ),
}

# Clicks.
# NOTE(review): no read_csv call in the visible cells receives this mapping — confirm
# whether it is still needed.
dtypes_clk = {
    "advertiser_id": np.int8,
    "action_id": np.float16,
    "source_id": np.int8,
    "country_code": "category",
    "latitude": np.float16,
    "longitude": np.float16,
    "wifi_connection": "category",
    "carrier_id": np.float16,
    "trans_id": "object",
    "os_major": np.float32,
    "brand": np.float16,
    "touchX": np.float16,
    "touchY": np.float16,
}

# Events.
dtypes_evt = {
    "event_id": np.int32,
    "application_id": np.int32,
    "attributed": "category",
    "device_countrycode": "category",
    "device_os_version": "category",
    "device_brand": np.float32,
    "device_model": "category",
    "device_city": "category",
    "session_user_agent": "category",
    "user_agent": np.float32,
    "carrier": "category",
    "kind": "category",
    "wifi": "category",
    "connection_type": "category",
    "device_language": "category",
    "trans_id": "object",
}

# Leo los CSV

In [3]:
#loc_to_read = r"D:\FacundoTorraca\Documents\TP2_Machine_Learning\CSV"

In [5]:
# Load the gzip-compressed auctions file and parse "date" as datetime;
# values that cannot be parsed become NaT (errors="coerce").
auc = (
    pd.read_csv("auctions.csv.gzip", compression="gzip", dtype=dtypes_auc)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
)

In [6]:
# Load the gzip-compressed installs file, parse "created" as datetime
# (unparseable values become NaT) and expose it under the name "date".
ins = (
    pd.read_csv("installs.csv.gzip", compression="gzip", dtype=dtypes_ins)
    .assign(created=lambda frame: pd.to_datetime(frame["created"], errors="coerce"))
    .rename(columns={"created": "date"})
)

In [7]:
# Load the gzip-compressed events file and parse "date" as datetime;
# values that cannot be parsed become NaT (errors="coerce").
evt = (
    pd.read_csv("events.csv.gzip", compression="gzip", dtype=dtypes_evt)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
)

In [8]:
# Load the gzip-compressed clicks file with pandas' inferred dtypes
# (dtypes_clk is not applied here), then in one chain:
#   * touchX / touchY: missing values -> 0, then converted to the smallest float dtype;
#     values that still cannot be parsed become NaN (errors="coerce").
#   * created: parsed as datetime (unparseable -> NaT) and renamed to "date".
clk = (
    pd.read_csv("clicks.csv.gzip", compression="gzip")
    .assign(
        touchX=lambda frame: pd.to_numeric(frame["touchX"].fillna(0), downcast="float", errors="coerce"),
        touchY=lambda frame: pd.to_numeric(frame["touchY"].fillna(0), downcast="float", errors="coerce"),
        created=lambda frame: pd.to_datetime(frame["created"], errors="coerce"),
    )
    .rename(columns={"created": "date"})
)

# Los separo en ventanas de 3 dias

In [9]:
# Split installs into three 3-day windows and tag each row with its window label.
# `.assign` returns a new, independent frame, so the "ventana" column is never written
# onto a slice of `ins` (the pattern that triggers SettingWithCopyWarning).
# NOTE(review): the filter looks only at the day of month, not month/year — this assumes
# the data covers a single month; confirm.
ins_18_20 = ins.loc[ins["date"].dt.day.between(18, 20)].assign(ventana="18_20")
ins_21_23 = ins.loc[ins["date"].dt.day.between(21, 23)].assign(ventana="21_23")
ins_24_26 = ins.loc[ins["date"].dt.day.between(24, 26)].assign(ventana="24_26")

In [10]:
# Split clicks into three 3-day windows and tag each row with its window label.
# `.assign` returns a new, independent frame, so the "ventana" column is never written
# onto a slice of `clk` (the pattern that triggers SettingWithCopyWarning).
# NOTE(review): the filter looks only at the day of month, not month/year — this assumes
# the data covers a single month; confirm.
clk_18_20 = clk.loc[clk["date"].dt.day.between(18, 20)].assign(ventana="18_20")
clk_21_23 = clk.loc[clk["date"].dt.day.between(21, 23)].assign(ventana="21_23")
clk_24_26 = clk.loc[clk["date"].dt.day.between(24, 26)].assign(ventana="24_26")

In [11]:
# Split events into three 3-day windows and tag each row with its window label.
# `.assign` returns a new, independent frame, so the "ventana" column is never written
# onto a slice of `evt` (the pattern that triggers SettingWithCopyWarning).
# NOTE(review): the filter looks only at the day of month, not month/year — this assumes
# the data covers a single month; confirm.
evt_18_20 = evt.loc[evt["date"].dt.day.between(18, 20)].assign(ventana="18_20")
evt_21_23 = evt.loc[evt["date"].dt.day.between(21, 23)].assign(ventana="21_23")
evt_24_26 = evt.loc[evt["date"].dt.day.between(24, 26)].assign(ventana="24_26")

In [12]:
# Split auctions into three 3-day windows and tag each row with its window label.
# `.assign` returns a new, independent frame, so the "ventana" column is never written
# onto a slice of `auc` (the pattern that triggers SettingWithCopyWarning).
# NOTE(review): the filter looks only at the day of month, not month/year — this assumes
# the data covers a single month; confirm.
auc_18_20 = auc.loc[auc["date"].dt.day.between(18, 20)].assign(ventana="18_20")
auc_21_23 = auc.loc[auc["date"].dt.day.between(21, 23)].assign(ventana="21_23")
auc_24_26 = auc.loc[auc["date"].dt.day.between(24, 26)].assign(ventana="24_26")

# Calculamos el "ST" y "SC" para cada ventana

In [13]:
# "_sc": seconds elapsed from midnight of the window's first day (April 2019)
# to each install timestamp.
ins_18_20["_sc"] = (ins_18_20["date"] - dt.datetime(2019, 4, 18)).dt.total_seconds()
ins_21_23["_sc"] = (ins_21_23["date"] - dt.datetime(2019, 4, 21)).dt.total_seconds()
ins_24_26["_sc"] = (ins_24_26["date"] - dt.datetime(2019, 4, 24)).dt.total_seconds()

In [14]:
# "_st": seconds elapsed from midnight of the window's first day (April 2019)
# to each auction timestamp.
auc_18_20["_st"] = (auc_18_20["date"] - dt.datetime(2019, 4, 18)).dt.total_seconds()
auc_21_23["_st"] = (auc_21_23["date"] - dt.datetime(2019, 4, 21)).dt.total_seconds()
auc_24_26["_st"] = (auc_24_26["date"] - dt.datetime(2019, 4, 24)).dt.total_seconds()

# Guardamos los DataFrames 

In [15]:
# Directory that receives the per-window CSV files written by the cells below.
# NOTE(review): hard-coded absolute Windows path — adjust before running on another machine.
loc_to_save = r"D:\TP2_Machine_Learning\Training Sets"

In [16]:
# Write the installs and clicks frames of every window to loc_to_save,
# without the index, in the same order as before (ins then clk, window by window).
for _frame, _file in (
    (ins_18_20, "\\ins_18_20.csv"),
    (clk_18_20, "\\clk_18_20.csv"),
    (ins_21_23, "\\ins_21_23.csv"),
    (clk_21_23, "\\clk_21_23.csv"),
    (ins_24_26, "\\ins_24_26.csv"),
    (clk_24_26, "\\clk_24_26.csv"),
):
    _frame.to_csv(loc_to_save + _file, index=False)

In [17]:
# Write the auctions and events frames of every window to loc_to_save,
# without the index, in the same order as before (auc then evt, window by window).
for _frame, _file in (
    (auc_18_20, "\\auc_18_20.csv"),
    (evt_18_20, "\\evt_18_20.csv"),
    (auc_21_23, "\\auc_21_23.csv"),
    (evt_21_23, "\\evt_21_23.csv"),
    (auc_24_26, "\\auc_24_26.csv"),
    (evt_24_26, "\\evt_24_26.csv"),
):
    _frame.to_csv(loc_to_save + _file, index=False)

# Genero los labels

### Auc

In [18]:
# For every window: smallest "_st" per device_id, as a flat frame tagged with the window label.
auc_18_20_min_st = auc_18_20.groupby(["device_id"])[["_st"]].min().reset_index()
auc_18_20_min_st["ventana"] = "18_20"

auc_21_23_min_st = auc_21_23.groupby(["device_id"])[["_st"]].min().reset_index()
auc_21_23_min_st["ventana"] = "21_23"

auc_24_26_min_st = auc_24_26.groupby(["device_id"])[["_st"]].min().reset_index()
auc_24_26_min_st["ventana"] = "24_26"

In [21]:
# Stack the three per-window minima into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
auc_18_26_min_st = pd.concat( [auc_18_20_min_st, auc_21_23_min_st, auc_24_26_min_st] )
auc_18_26_min_st["ventana"] = auc_18_26_min_st["ventana"].astype( "category" )

# Pivot to one row per device_id and one "_st" column per window; windows in which a
# device does not appear end up as NaN after unstack.
# observed=True groups only the (device_id, ventana) pairs that actually occur; the
# missing pairs are re-created as NaN by unstack, so the resulting table is the same.
auc_lb = (
    auc_18_26_min_st[["device_id", "ventana", "_st"]]
    .groupby( by = ["device_id", "ventana"], observed = True )
    .agg( "min" )
    .unstack( level = 1 )
)

In [22]:
# Display the wide auctions label table (rows: device_id, columns: "_st" per window).
auc_lb

Unnamed: 0_level_0,_st,_st,_st
ventana,18_20,21_23,24_26
device_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
40621409780134,,226857.222979,
41863526108385,157228.465866,,
69039685746313,,,126258.597103
135153013040192,187854.009137,,
161514654074162,10366.357746,,
168103949904656,,173044.526198,
181891380775191,256765.420614,,
186034136943920,60166.331894,88.851558,
283297668933729,80684.520583,,
295841792051458,,432.827168,


In [24]:
# Flatten the two-level column index into one name per window.
auc_lb.columns = ["18_20_st", "21_23_st", "24_26_st"]
# Move the device_id index into a column and call it "ref_hash" so it matches the other frames.
auc_lb = auc_lb.reset_index().rename( columns = {"device_id": "ref_hash"} )

### Ins

In [25]:
# For every window: smallest "_sc" per ref_hash, as a flat frame tagged with the window label.
ins_18_20_min_st = ins_18_20.groupby(["ref_hash"])[["_sc"]].min().reset_index()
ins_18_20_min_st["ventana"] = "18_20"

ins_21_23_min_st = ins_21_23.groupby(["ref_hash"])[["_sc"]].min().reset_index()
ins_21_23_min_st["ventana"] = "21_23"

ins_24_26_min_st = ins_24_26.groupby(["ref_hash"])[["_sc"]].min().reset_index()
ins_24_26_min_st["ventana"] = "24_26"

In [28]:
# Stack the three per-window minima into one long frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
ins_18_26_min_st = pd.concat( [ins_18_20_min_st, ins_21_23_min_st, ins_24_26_min_st] )
ins_18_26_min_st["ventana"] = ins_18_26_min_st["ventana"].astype( "category" )

# Pivot to one row per ref_hash and one "_sc" column per window; windows in which a
# ref_hash does not appear end up as NaN after unstack.
# observed=True groups only the (ref_hash, ventana) pairs that actually occur; the
# missing pairs are re-created as NaN by unstack, so the resulting table is the same.
ins_lb = (
    ins_18_26_min_st[ ["ref_hash", "ventana", "_sc"] ]
    .groupby( by = ["ref_hash", "ventana"], observed = True )
    .agg( "min" )
    .unstack( level = 1 )
)

# Flatten the two-level column index, then turn the ref_hash index into a regular column.
ins_lb.columns = ["18_20_sc", "21_23_sc", "24_26_sc"]
ins_lb = ins_lb.reset_index()
ins_lb.columns = ["ref_hash", "18_20_sc", "21_23_sc", "24_26_sc"]

# Censuramos los Labels 

In [29]:
# Add one indicator column per time window: 1 if the event was observed in that
# window (the time value is present), 0 if it was not (the value is NaN).
# Vectorized notna() replaces the row-by-row apply/lambda; int64 matches the dtype
# the lambda's Python ints produced.
ins_lb["obs_18_20"] = ins_lb["18_20_sc"].notna().astype( "int64" )
ins_lb["obs_21_23"] = ins_lb["21_23_sc"].notna().astype( "int64" )
ins_lb["obs_24_26"] = ins_lb["24_26_sc"].notna().astype( "int64" )

auc_lb["obs_18_20"] = auc_lb["18_20_st"].notna().astype( "int64" )
auc_lb["obs_21_23"] = auc_lb["21_23_st"].notna().astype( "int64" )
auc_lb["obs_24_26"] = auc_lb["24_26_st"].notna().astype( "int64" )

In [30]:
# Replace the remaining NaNs with the maximum window length: 3 days = 259200 seconds.
seg_3_dias = 3 * 24 * 60 * 60
auc_lb = auc_lb.fillna( seg_3_dias )
ins_lb = ins_lb.fillna( seg_3_dias )

# Guardamos los Labels

In [31]:
# Directory that receives the label CSV files written by the next cell.
# NOTE(review): hard-coded absolute Windows path — adjust before running on another machine.
loc_save_lb = r"D:\TP2_Machine_Learning\Labels"

In [32]:
# Write both label tables to loc_save_lb without the index (installs first, then auctions).
for _labels, _file in (
    (ins_lb, "\\ins_lb.csv"),
    (auc_lb, "\\auc_lb.csv"),
):
    _labels.to_csv( loc_save_lb + _file, index = False )