# 03 – Ingeniería de Variables (Feature Engineering)
Con las ubicaciones asociadas a los pings, crear ingeniería de variables agrupando los datos por ubicación

In [0]:
%sql
-- pings_ext: every column of sv_pings_georreferenciados_partitioned plus the
-- derived time features that the per-location metric views rely on.
--   hour       : hour of day taken from datetime
--   date       : calendar date of datetime
--   dow        : day of week as returned by dayofweek (1 = Sunday ... 7 = Saturday)
--   time_block : 'mañana' (06-11), 'tarde' (12-17), 'noche' (18-23),
--                'madrugada' for everything else (00-05, or a NULL hour)
--   is_weekend : 1 when dow is 1 or 7, otherwise 0
CREATE OR REPLACE TEMP VIEW pings_ext AS
SELECT *,
       hour(datetime) AS hour,
       to_date(datetime) AS date,
       dayofweek(datetime) AS dow,
       -- The expression hour(datetime) is repeated on purpose instead of
       -- referencing the sibling alias `hour`: that keeps the view independent
       -- of lateral-column-alias support and mirrors how is_weekend below
       -- repeats dayofweek(datetime).
       CASE
           WHEN hour(datetime) BETWEEN 6 AND 11 THEN 'mañana'
           WHEN hour(datetime) BETWEEN 12 AND 17 THEN 'tarde'
           WHEN hour(datetime) BETWEEN 18 AND 23 THEN 'noche'
           ELSE 'madrugada'
       END AS time_block,
       CASE WHEN dayofweek(datetime) IN (1,7) THEN 1 ELSE 0 END AS is_weekend
FROM sv_pings_georreferenciados_partitioned

### Footfall: Número de Personas que entran a una tienda
Promedio por día de la semana

In [0]:
%sql
-- footfall_weekday_raw: number of distinct devices seen at each location on
-- each calendar day, tagged with that day's day-of-week.
-- Grain: one row per (osm_id, date). dow is functionally determined by date and
-- is only listed in GROUP BY so it can be selected.
-- The per-day grain is required for the averages below: with a single row per
-- (osm_id, dow) the AVG would see exactly one value and simply return the
-- period total, which also favours weekdays that occur more often in the period.
CREATE OR REPLACE TEMP VIEW footfall_weekday_raw AS
SELECT
  osm_id,
  date,
  dow,
  COUNT(DISTINCT device_id) AS visitas_unicas
FROM pings_ext
GROUP BY osm_id, date, dow;


-- footfall_weekday: one row per location with the average daily number of
-- distinct devices for each day of the week (dow 1 = Sunday ... 7 = Saturday).
-- Days on which a location has no pings produce no row upstream, so they are
-- not counted as zeros in the average. A weekday with no data at all yields NULL.
CREATE OR REPLACE TEMP VIEW footfall_weekday AS
SELECT
  osm_id,
  avg(CASE WHEN dow = 1 THEN visitas_unicas END) AS footfall_domingo,
  avg(CASE WHEN dow = 2 THEN visitas_unicas END) AS footfall_lunes,
  avg(CASE WHEN dow = 3 THEN visitas_unicas END) AS footfall_martes,
  avg(CASE WHEN dow = 4 THEN visitas_unicas END) AS footfall_miercoles,
  avg(CASE WHEN dow = 5 THEN visitas_unicas END) AS footfall_jueves,
  avg(CASE WHEN dow = 6 THEN visitas_unicas END) AS footfall_viernes,
  avg(CASE WHEN dow = 7 THEN visitas_unicas END) AS footfall_sabado
FROM footfall_weekday_raw
GROUP BY osm_id;

Promedio por franja horaria

In [0]:
%sql
-- footfall_block_raw: number of distinct devices seen at each location on each
-- calendar day within each time block.
-- Grain: one row per (osm_id, date, time_block). is_weekend is functionally
-- determined by date and is only grouped so it can be selected.
-- The per-day grain is required for the averages below: with a single row per
-- (osm_id, time_block, is_weekend) each AVG would see exactly one value and
-- return the period total rather than a daily average.
CREATE OR REPLACE TEMP VIEW footfall_block_raw AS
SELECT
  osm_id,
  date,
  time_block,
  is_weekend,
  COUNT(DISTINCT device_id) AS visitas_unicas
FROM pings_ext
GROUP BY osm_id, date, time_block, is_weekend;

-- footfall_block: one row per location with the average daily number of
-- distinct devices per time block, split into weekend and weekday days.
-- Day/block combinations without pings produce no row upstream, so they are
-- not counted as zeros. A combination with no data at all yields NULL.
-- footfall_total is the average of visitas_unicas over every (date, time_block)
-- row of the location.
CREATE OR REPLACE TEMP VIEW footfall_block AS
SELECT
  osm_id,
  avg(CASE WHEN is_weekend = 1 AND time_block = 'mañana' THEN visitas_unicas END) AS footfall_manana_weekend,
  avg(CASE WHEN is_weekend = 1 AND time_block = 'tarde' THEN visitas_unicas END) AS footfall_tarde_weekend,
  avg(CASE WHEN is_weekend = 1 AND time_block = 'noche' THEN visitas_unicas END) AS footfall_noche_weekend,
  avg(CASE WHEN is_weekend = 1 AND time_block = 'madrugada' THEN visitas_unicas END) AS footfall_madrugada_weekend,
  avg(CASE WHEN is_weekend = 0 AND time_block = 'mañana' THEN visitas_unicas END) AS footfall_manana_semana,
  avg(CASE WHEN is_weekend = 0 AND time_block = 'tarde' THEN visitas_unicas END) AS footfall_tarde_semana,
  avg(CASE WHEN is_weekend = 0 AND time_block = 'noche' THEN visitas_unicas END) AS footfall_noche_semana,
  avg(CASE WHEN is_weekend = 0 AND time_block = 'madrugada' THEN visitas_unicas END) AS footfall_madrugada_semana,
  avg(visitas_unicas) AS footfall_total
FROM footfall_block_raw
GROUP BY osm_id;

### Dwell: Tiempo de Permanencia promedio
Promedio por día de la semana

In [0]:
%sql
-- Worked example: all pings of a single device at a single location on a
-- single day, in chronological order, to inspect what one visit looks like.
SELECT *
FROM pings_ext
WHERE osm_id = '371844168'
  AND device_id = 'bbc86e3b-ae75-499e-a675-8308cad3158d'
  AND date = '2023-12-15'
ORDER BY datetime ASC
-- Alternative location filter: osm_id='1108623425' and name ilike '%Campero%'

In [0]:
%sql
-- Step 1: for every ping, look up the location of the same device's previous
-- ping so we can tell whether the device stayed in the same place.
-- osm_id is added to the ordering as a tie-breaker: ordering by datetime alone
-- leaves the row order (and therefore LAG) undefined when a device has several
-- rows with the same datetime.
CREATE OR REPLACE TEMP VIEW pings_seq AS
SELECT *,
       LAG(osm_id) OVER (PARTITION BY device_id ORDER BY datetime, osm_id) AS prev_osm
FROM pings_ext;

-- Step 2: flag the pings where the device changed location.
-- new_sequence = 1 when osm_id differs from the previous ping's osm_id.
-- For the first ping of a device prev_osm is NULL, the comparison is not true,
-- and the flag is 0.
CREATE OR REPLACE TEMP VIEW pings_flag AS
SELECT *,
       CASE WHEN osm_id != prev_osm THEN 1 ELSE 0 END AS new_sequence
FROM pings_seq;

-- Step 3: build a per-device sequence id as the running sum of the flag.
-- Consecutive pings at the same location keep the same id (id + 0); every
-- change of location moves on to the next id (id + 1).
-- The ordering repeats the tie-breaker used in step 1; new_sequence DESC
-- additionally places the flagged row first among rows that share both
-- datetime and osm_id, so the running sum is deterministic.
CREATE OR REPLACE TEMP VIEW pings_grouped AS
SELECT *,
       SUM(new_sequence) OVER (
         PARTITION BY device_id
         ORDER BY datetime, osm_id, new_sequence DESC
         ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
       ) AS seq_id
FROM pings_flag;

In [0]:
%sql
-- dwell_time_seq: one row per visit, where a visit is a (osm_id, device_id,
-- seq_id) group of pings_grouped.
--   inicio / fin  : first and last ping datetime of the visit
--   dwell_seconds : seconds elapsed between inicio and fin
--   num_pings     : number of pings in the visit
-- Visits made of a single ping are dropped.
CREATE OR REPLACE TEMP VIEW dwell_time_seq AS
WITH visit_bounds AS (
  SELECT
    osm_id,
    device_id,
    MIN(datetime) AS inicio,
    MAX(datetime) AS fin,
    COUNT(*) AS num_pings
  FROM pings_grouped
  GROUP BY osm_id, device_id, seq_id
)
SELECT
  osm_id,
  device_id,
  inicio,
  fin,
  UNIX_TIMESTAMP(fin) - UNIX_TIMESTAMP(inicio) AS dwell_seconds,
  num_pings
FROM visit_bounds
WHERE num_pings > 1;

In [0]:
%sql
-- dwell_stats_seq: per-location summary of the visits in dwell_time_seq:
-- average, longest and shortest dwell (in seconds) and the average number of
-- pings per visit.
CREATE OR REPLACE TEMP VIEW dwell_stats_seq AS
SELECT
  v.osm_id,
  avg(v.dwell_seconds) AS avg_dwell,
  max(v.dwell_seconds) AS max_dwell,
  min(v.dwell_seconds) AS min_dwell,
  avg(v.num_pings)     AS avg_pings_por_visita
FROM dwell_time_seq AS v
GROUP BY v.osm_id;

### Repeat Rate: Número de veces que un mismo dispositivo va a la tienda 


In [0]:
%sql
-- repeat_daily: for every (location, device) pair, the number of distinct
-- calendar dates on which the device has pings at the location.
CREATE OR REPLACE TEMP VIEW repeat_daily AS
SELECT
  p.osm_id,
  p.device_id,
  count(DISTINCT p.date) AS dias_diferentes
FROM pings_ext AS p
GROUP BY
  p.osm_id,
  p.device_id;


-- repeat_rate_daily: per location, the average of dias_diferentes over all
-- devices seen at that location.
CREATE OR REPLACE TEMP VIEW repeat_rate_daily AS
SELECT
  r.osm_id,
  avg(r.dias_diferentes) AS repeat_visit_rate_daily
FROM repeat_daily AS r
GROUP BY r.osm_id;

### Peak Hour: Hora Pico por tipo de día (semana o fin de semana)

In [0]:
%sql
-- peak_hour_base: distinct devices per location, hour of day and day type
-- (is_weekend = 0 weekday, 1 weekend).
CREATE OR REPLACE TEMP VIEW peak_hour_base AS
SELECT
  osm_id,
  hour,
  is_weekend,
  COUNT(DISTINCT device_id) AS visitas
FROM pings_ext
GROUP BY osm_id, hour, is_weekend;

-- peak_hour_ranked: ranks the hours of each (location, day type) from busiest
-- to quietest. Hours with the same number of visits are ranked by hour
-- ascending so the chosen peak hour is reproducible between runs; ordering by
-- visitas alone would pick an arbitrary hour among the tied ones.
CREATE OR REPLACE TEMP VIEW peak_hour_ranked AS
SELECT *,
       ROW_NUMBER() OVER (
         PARTITION BY osm_id, is_weekend
         ORDER BY visitas DESC, hour ASC
       ) AS rnk
FROM peak_hour_base;

-- peak_hours: one row per location with the busiest hour and its visit count,
-- separately for weekdays and weekends. A day type with no pings yields NULLs.
CREATE OR REPLACE TEMP VIEW peak_hours AS
SELECT
  osm_id,
  MAX(CASE WHEN is_weekend = 0 AND rnk = 1 THEN hour END) AS peak_hour_weekday,
  MAX(CASE WHEN is_weekend = 0 AND rnk = 1 THEN visitas END) AS peak_visits_weekday,
  MAX(CASE WHEN is_weekend = 1 AND rnk = 1 THEN hour END) AS peak_hour_weekend,
  MAX(CASE WHEN is_weekend = 1 AND rnk = 1 THEN visitas END) AS peak_visits_weekend
FROM peak_hour_ranked
GROUP BY osm_id;

### Dispositivos Únicos

In [0]:
%sql
-- total_devices: number of distinct devices with at least one ping at each
-- location over the whole of pings_ext.
CREATE OR REPLACE TEMP VIEW total_devices AS
SELECT
  p.osm_id,
  count(DISTINCT p.device_id) AS total_devices
FROM pings_ext AS p
GROUP BY p.osm_id;

### Entropía Horaria, aproximada como número de horas con tráfico
Una tienda con pocas horas con tráfico tendrá las visitas concentradas en esas pocas horas; por el contrario, una tienda con muchas horas con tráfico tendrá las visitas distribuidas de forma más uniforme a lo largo del día.

In [0]:
%sql
-- horas_activas: per location, how many distinct hours of the day have at
-- least one ping, separately for weekdays and weekends. A day type with no
-- pings yields NULL.
CREATE OR REPLACE TEMP VIEW horas_activas AS
WITH hours_by_day_type AS (
  -- Distinct active hours per location and day type.
  SELECT
    osm_id,
    is_weekend,
    COUNT(DISTINCT hour) AS horas
  FROM pings_ext
  GROUP BY osm_id, is_weekend
)
SELECT
  osm_id,
  MAX(CASE WHEN is_weekend = 0 THEN horas END) AS horas_con_movimiento_semana,
  MAX(CASE WHEN is_weekend = 1 THEN horas END) AS horas_con_movimiento_finsemana
FROM hours_by_day_type
GROUP BY osm_id;

### Unión final de todas las métricas calculadas en la tabla final

In [0]:
%sql
-- Quick look at a handful of rows to check the columns available in pings_ext.
SELECT *
FROM pings_ext
LIMIT 4

In [0]:
%sql
-- pings_ext_locations: the distinct (osm_id, fclass, name, source_layer)
-- combinations present in pings_ext, restricted to rows with a name.
-- This is the driving row set of the final metrics table.
CREATE OR REPLACE TEMP VIEW pings_ext_locations AS
SELECT
  loc.osm_id,
  loc.fclass,
  loc.name,
  loc.source_layer
FROM pings_ext AS loc
WHERE loc.name IS NOT NULL
GROUP BY
  loc.osm_id,
  loc.fclass,
  loc.name,
  loc.source_layer;

In [0]:
%sql
-- Final feature table: one row per named location in pings_ext_locations,
-- left-joined to every metric view on osm_id. Metrics missing for a location
-- are replaced by 0, except the peak hours, which use -1 as the "no data" value.
CREATE OR REPLACE TABLE sv_202312_metricas_locales AS
SELECT
  loc.osm_id,
  loc.fclass,
  loc.name,
  loc.source_layer,
  -- Footfall by day of week
  COALESCE(fw.footfall_domingo, 0)               AS footfall_domingo,
  COALESCE(fw.footfall_lunes, 0)                 AS footfall_lunes,
  COALESCE(fw.footfall_martes, 0)                AS footfall_martes,
  COALESCE(fw.footfall_miercoles, 0)             AS footfall_miercoles,
  COALESCE(fw.footfall_jueves, 0)                AS footfall_jueves,
  COALESCE(fw.footfall_viernes, 0)               AS footfall_viernes,
  COALESCE(fw.footfall_sabado, 0)                AS footfall_sabado,
  -- Footfall by time block, weekend then weekday
  COALESCE(fb.footfall_manana_weekend, 0)        AS footfall_manana_weekend,
  COALESCE(fb.footfall_tarde_weekend, 0)         AS footfall_tarde_weekend,
  COALESCE(fb.footfall_noche_weekend, 0)         AS footfall_noche_weekend,
  COALESCE(fb.footfall_madrugada_weekend, 0)     AS footfall_madrugada_weekend,
  COALESCE(fb.footfall_manana_semana, 0)         AS footfall_manana_semana,
  COALESCE(fb.footfall_tarde_semana, 0)          AS footfall_tarde_semana,
  COALESCE(fb.footfall_noche_semana, 0)          AS footfall_noche_semana,
  COALESCE(fb.footfall_madrugada_semana, 0)      AS footfall_madrugada_semana,
  -- Dwell time (seconds); the average is rounded to whole seconds
  COALESCE(ROUND(ds.avg_dwell, 0), 0)            AS avg_dwell,
  COALESCE(ds.max_dwell, 0)                      AS max_dwell,
  COALESCE(ds.min_dwell, 0)                      AS min_dwell,
  -- Repeat rate, rounded to two decimals
  COALESCE(ROUND(rr.repeat_visit_rate_daily, 2), 0) AS repeat_visit_rate_daily,
  -- Peak hours: -1 marks "no peak hour available"
  COALESCE(ph.peak_hour_weekday, -1)             AS peak_hour_weekday,
  COALESCE(ph.peak_visits_weekday, 0)            AS peak_visits_weekday,
  COALESCE(ph.peak_hour_weekend, -1)             AS peak_hour_weekend,
  COALESCE(ph.peak_visits_weekend, 0)            AS peak_visits_weekend,
  -- Unique devices and active hours
  COALESCE(td.total_devices, 0)                  AS total_devices,
  COALESCE(hh.horas_con_movimiento_semana, 0)    AS horas_con_movimiento_semana,
  COALESCE(hh.horas_con_movimiento_finsemana, 0) AS horas_con_movimiento_finsemana
FROM pings_ext_locations AS loc
LEFT JOIN footfall_weekday  AS fw ON fw.osm_id = loc.osm_id
LEFT JOIN footfall_block    AS fb ON fb.osm_id = loc.osm_id
LEFT JOIN dwell_stats_seq   AS ds ON ds.osm_id = loc.osm_id
LEFT JOIN repeat_rate_daily AS rr ON rr.osm_id = loc.osm_id
LEFT JOIN peak_hours        AS ph ON ph.osm_id = loc.osm_id
LEFT JOIN total_devices     AS td ON td.osm_id = loc.osm_id
LEFT JOIN horas_activas     AS hh ON hh.osm_id = loc.osm_id
;

In [0]:
%sql
-- Sanity check: number of rows written to the final metrics table.
SELECT COUNT(1)
FROM sv_202312_metricas_locales

In [0]:
# Load the final metrics table and collect it as a pandas DataFrame.
metrics_sdf = spark.table("sv_202312_metricas_locales")
df_pandas = metrics_sdf.toPandas()

In [0]:
import csv
# Export the metrics to a UTF-8 CSV next to the notebook: no index column,
# every field wrapped in quotes.
df_pandas.to_csv(
    "./sv_202312_metricas_locales.csv",
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)