# 2- Data preprocessing e ingeniería de características

Ingeniería de características. Estrategias y métodos a analizar en este notebook:

- Extracting and playing with dates

Referencias: https://github.com/dipanjanS

# Import necessary dependencies and settings

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import datetime
import numpy as np
import pandas as pd
from dateutil.parser import parse
import pytz

# Load and process sample temporal data

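Creamos un conjunto de fechas como cadenas de texto (con distintos desplazamientos UTC) y, en la celda siguiente, las convertimos a objetos de tipo 'Timestamp'.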

In [2]:
# Raw sample instants as ISO-like strings; each one carries its own UTC offset.
time_stamps = [
    '2017-03-08 10:30:00.360000+00:00',
    '2019-07-13 15:45:05.755000-07:00',
    '2015-01-20 22:30:00.254000+05:30',
    '2018-12-25 00:30:00.000000+10:00',
]
# Single-column frame holding the unparsed strings under 'Time'.
df = pd.DataFrame({'Time': time_stamps})
df

Unnamed: 0,Time
0,2017-03-08 10:30:00.360000+00:00
1,2019-07-13 15:45:05.755000-07:00
2,2015-01-20 22:30:00.254000+05:30
3,2018-12-25 00:30:00.000000+10:00


In [3]:
# Parse every raw string into a pandas Timestamp; the offset embedded in each
# string is kept on the resulting object.
parsed_stamps = [pd.Timestamp(raw) for raw in df['Time']]
ts_objs = np.array(parsed_stamps)
# Store the parsed objects next to the raw strings for the feature cells below.
df['TS_obj'] = ts_objs
ts_objs

array([Timestamp('2017-03-08 10:30:00.360000+0000', tz='UTC'),
       Timestamp('2019-07-13 15:45:05.755000-0700', tz='pytz.FixedOffset(-420)'),
       Timestamp('2015-01-20 22:30:00.254000+0530', tz='pytz.FixedOffset(330)'),
       Timestamp('2018-12-25 00:30:00+1000', tz='pytz.FixedOffset(600)')],
      dtype=object)

# Date based features

Extraemos nuevas características de tipo fecha a partir de la variable de tipo 'Timestamp': año, mes, día, etc.

In [4]:
# Derive calendar features from each Timestamp in 'TS_obj'.
# The column was built from an object-dtype array of Timestamps with differing
# offsets, so every feature is read element by element with `.apply`.
df['Year'] = df['TS_obj'].apply(lambda d: d.year)
df['Month'] = df['TS_obj'].apply(lambda d: d.month)
df['Day'] = df['TS_obj'].apply(lambda d: d.day)
df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)
# `Timestamp.weekday_name` was removed in pandas 1.0; `day_name()` is the
# supported replacement and returns the same English day name.
df['DayName'] = df['TS_obj'].apply(lambda d: d.day_name())
df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)
# `Timestamp.weekofyear` was removed in pandas 2.0; element 1 of
# `isocalendar()` is the ISO week number, which is what `weekofyear` returned.
df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.isocalendar()[1])
df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)

df[['Time', 'Year', 'Month', 'Day', 'Quarter', 
    'DayOfWeek', 'DayName', 'DayOfYear', 'WeekOfYear']]

Unnamed: 0,Time,Year,Month,Day,Quarter,DayOfWeek,DayName,DayOfYear,WeekOfYear
0,2017-03-08 10:30:00.360000+00:00,2017,3,8,1,2,Wednesday,67,10
1,2019-07-13 15:45:05.755000-07:00,2019,7,13,3,5,Saturday,194,28
2,2015-01-20 22:30:00.254000+05:30,2015,1,20,1,1,Tuesday,20,4
3,2018-12-25 00:30:00.000000+10:00,2018,12,25,4,1,Tuesday,359,52


# Time based features

Extraemos nuevas características de tipo tiempo a partir de la variable de tipo 'Timestamp': hora, minutos, segundos, etc.

In [5]:
# Clock components: output column name -> Timestamp attribute that supplies it.
clock_fields = {
    'Hour': 'hour',
    'Minute': 'minute',
    'Second': 'second',
    'MUsecond': 'microsecond',
}
for column, attribute in clock_fields.items():
    # Bind `attribute` as a default so each lambda reads its own field.
    df[column] = df['TS_obj'].apply(
        lambda ts, attribute=attribute: getattr(ts, attribute))

# Offset from UTC carried by each Timestamp (a timedelta per row).
df['UTC_offset'] = df['TS_obj'].apply(lambda ts: ts.utcoffset())

df[['Time', 'Hour', 'Minute', 'Second', 'MUsecond', 'UTC_offset']]

Unnamed: 0,Time,Hour,Minute,Second,MUsecond,UTC_offset
0,2017-03-08 10:30:00.360000+00:00,10,30,0,360000,00:00:00
1,2019-07-13 15:45:05.755000-07:00,15,45,5,755000,-1 days +17:00:00
2,2015-01-20 22:30:00.254000+05:30,22,30,0,254000,05:30:00
3,2018-12-25 00:30:00.000000+10:00,0,30,0,0,10:00:00


In [6]:
# Label -> last hour (inclusive) that belongs to that part of the day.
last_hour_by_label = {
    'Late Night': 5,
    'Morning': 11,
    'Afternoon': 16,
    'Evening': 21,
    'Night': 23,
}
bin_names = list(last_hour_by_label)
# Leading -1 makes the first right-closed interval (-1, 5] include hour 0.
hour_bins = [-1] + list(last_hour_by_label.values())

df['TimeOfDayBin'] = pd.cut(df['Hour'], bins=hour_bins, labels=bin_names)
df[['Time', 'Hour', 'TimeOfDayBin']]

Unnamed: 0,Time,Hour,TimeOfDayBin
0,2017-03-08 10:30:00.360000+00:00,10,Morning
1,2019-07-13 15:45:05.755000-07:00,15,Afternoon
2,2015-01-20 22:30:00.254000+05:30,22,Night
3,2018-12-25 00:30:00.000000+10:00,0,Late Night


In [7]:
# Build the pytz zone objects once instead of once per row; the names are
# sorted so the iteration order is the same on every run.
_ALL_PYTZ_ZONES = [pytz.timezone(name) for name in sorted(pytz.all_timezones_set)]


def _matching_tz_names(ts):
    """Return the timezone abbreviations compatible with `ts`.

    A zone is compatible when `ts`, converted into that zone, has the same
    UTC offset as `ts` itself.

    Parameters
    ----------
    ts : tz-aware pandas Timestamp

    Returns
    -------
    list of str
        Unique abbreviations (as reported by `tzname()`), sorted so the
        result is reproducible across kernel restarts.
    """
    target_offset = ts.utcoffset()
    names = set()
    for zone in _ALL_PYTZ_ZONES:
        # Convert once and reuse the result for both the offset and the name.
        localized = ts.astimezone(zone)
        if localized.utcoffset() == target_offset:
            names.add(localized.tzname())
    return sorted(names)


df['TZ_info'] = df['TS_obj'].apply(lambda d: d.tzinfo)
df['TimeZones'] = df['TS_obj'].apply(_matching_tz_names)

df[['Time', 'UTC_offset', 'TZ_info', 'TimeZones']]

Unnamed: 0,Time,UTC_offset,TZ_info,TimeZones
0,2017-03-08 10:30:00.360000+00:00,00:00:00,UTC,"[+00, GMT, WET, UTC]"
1,2019-07-13 15:45:05.755000-07:00,-1 days +17:00:00,pytz.FixedOffset(-420),"[PDT, -07, MST]"
2,2015-01-20 22:30:00.254000+05:30,05:30:00,pytz.FixedOffset(330),"[+0530, IST]"
3,2018-12-25 00:30:00.000000+10:00,10:00:00,pytz.FixedOffset(600),"[AEST, ChST, +10]"


In [8]:
# Express every instant in UTC so the numeric encodings below are comparable.
df['TimeUTC'] = df['TS_obj'].apply(lambda ts: ts.tz_convert(pytz.utc))

# Numeric encodings of the UTC instant: output column -> derivation.
utc_encodings = {
    'Epoch': lambda ts: ts.timestamp(),        # value of Timestamp.timestamp()
    'GregOrdinal': lambda ts: ts.toordinal(),  # value of Timestamp.toordinal()
}
for column, encode in utc_encodings.items():
    df[column] = df['TimeUTC'].apply(encode)

df[['Time', 'TimeUTC', 'Epoch', 'GregOrdinal']]

Unnamed: 0,Time,TimeUTC,Epoch,GregOrdinal
0,2017-03-08 10:30:00.360000+00:00,2017-03-08 10:30:00.360000+00:00,1488969000.0,736396
1,2019-07-13 15:45:05.755000-07:00,2019-07-13 22:45:05.755000+00:00,1563058000.0,737253
2,2015-01-20 22:30:00.254000+05:30,2015-01-20 17:00:00.254000+00:00,1421773000.0,735618
3,2018-12-25 00:30:00.000000+10:00,2018-12-24 14:30:00+00:00,1545662000.0,737052


In [9]:
SECONDS_PER_DAY = 3600*24

# Reference instant: the moment this cell runs, in UTC. The values below
# therefore change on every execution.
curr_ts = datetime.datetime.now(pytz.utc)
now_epoch = curr_ts.timestamp()
now_ordinal = curr_ts.toordinal()

# Days elapsed from each timestamp until now: fractional from the epoch
# values, whole calendar days from the ordinals.
df['DaysElapsedEpoch'] = (now_epoch - df['Epoch']) / SECONDS_PER_DAY
df['DaysElapsedOrdinal'] = now_ordinal - df['GregOrdinal']

df[['Time', 'TimeUTC', 'DaysElapsedEpoch', 'DaysElapsedOrdinal']]

Unnamed: 0,Time,TimeUTC,DaysElapsedEpoch,DaysElapsedOrdinal
0,2017-03-08 10:30:00.360000+00:00,2017-03-08 10:30:00.360000+00:00,1149.973269,1150
1,2019-07-13 15:45:05.755000-07:00,2019-07-13 22:45:05.755000+00:00,292.46279,293
2,2015-01-20 22:30:00.254000+05:30,2015-01-20 17:00:00.254000+00:00,1927.702437,1928
3,2018-12-25 00:30:00.000000+10:00,2018-12-24 14:30:00+00:00,493.806607,494
