In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product

### Read in the data

In [2]:
device_activations = pd.read_csv('data/device_activations.csv')

In [3]:
device_activations.head()

Unnamed: 0,time,device,device_activated
0,2016-07-01 04:23:32,device_6,1
1,2016-07-01 06:52:57,device_2,1
2,2016-07-01 06:53:00,device_2,1
3,2016-07-01 06:56:41,device_2,1
4,2016-07-01 07:00:01,device_6,1


### Inspect & Clean the data

In [4]:
device_activations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9045 entries, 0 to 9044
Data columns (total 3 columns):
time                9045 non-null object
device              9045 non-null object
device_activated    9045 non-null int64
dtypes: int64(1), object(2)
memory usage: 212.1+ KB


In [5]:
device_activations.time = pd.to_datetime(device_activations.time)

In [6]:
device_activations.device_activated.value_counts()

1    9045
Name: device_activated, dtype: int64

In [7]:
device_activations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9045 entries, 0 to 9044
Data columns (total 3 columns):
time                9045 non-null datetime64[ns]
device              9045 non-null object
device_activated    9045 non-null int64
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 212.1+ KB


* Create a new column corresponding to the hour since start of data

In [9]:
earliest_date = min(device_activations.time).date()

device_activations['date'] = device_activations['time'].dt.date
device_activations['hour_of_day'] = device_activations['time'].dt.hour
device_activations['hour_of_week'] = device_activations['time'].dt.dayofweek * 24 + device_activations['hour_of_day']
device_activations['hour_slot'] = (device_activations['date'] - earliest_date).dt.days * 24 + device_activations['hour_of_day']

In [10]:
device_activations.head()

Unnamed: 0,time,device,device_activated,date,hour_of_day,hour_of_week,hour_slot
0,2016-07-01 04:23:32,device_6,1,2016-07-01,4,100,4
1,2016-07-01 06:52:57,device_2,1,2016-07-01,6,102,6
2,2016-07-01 06:53:00,device_2,1,2016-07-01,6,102,6
3,2016-07-01 06:56:41,device_2,1,2016-07-01,6,102,6
4,2016-07-01 07:00:01,device_6,1,2016-07-01,7,103,7


In [11]:
# Create a blank dataset for every hour and device
all_devices = list(device_activations.device.unique())
n_hours = ((max(device_activations.time) - min(device_activations.time)).days+1)*24
hour_slot = list(range(n_hours))

blank_df = pd.DataFrame(list(product(all_devices, hour_slot)), columns=['device', 'hour_slot'])
blank_df['hour_of_day'] = blank_df.hour_slot % 24
blank_df['hour_of_week'] = blank_df.hour_slot % (7*24)
blank_df['day_num'] = np.floor(blank_df.hour_slot/24).astype(int)
blank_df['week_num'] = np.floor(blank_df.hour_slot/(24*7)).astype(int)
blank_df['day_of_week'] = np.floor(blank_df.hour_of_week/7).astype(int)
blank_df.head()

Unnamed: 0,device,hour_slot,hour_of_day,hour_of_week,day_num,week_num,day_of_week
0,device_6,0,0,0,0,0,0
1,device_6,1,1,1,0,0,0
2,device_6,2,2,2,0,0,0
3,device_6,3,3,3,0,0,0
4,device_6,4,4,4,0,0,0


In [12]:
#Cleanup the activation data in order to join with the blank data
agg_dict = {
    'time': {
        'earliest_activation': min,
        'latest_activation': max
        },
    'device_activated': sum
}

grouped_df = device_activations.groupby(['device', 'hour_slot']).agg(agg_dict).reset_index()
grouped_df.columns = grouped_df.columns.droplevel(0)
grouped_df.columns = [
    'device',
    'hour_slot',
    'earliest_activation',
    'latest_activation',
    'total_activations'
]
grouped_df.head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,device,hour_slot,earliest_activation,latest_activation,total_activations
0,device_1,592,2016-07-25 16:09:37,2016-07-25 16:57:39,6
1,device_1,607,2016-07-26 07:31:08,2016-07-26 07:52:41,2
2,device_1,608,2016-07-26 08:02:51,2016-07-26 08:56:42,6
3,device_1,609,2016-07-26 09:15:25,2016-07-26 09:49:18,4
4,device_1,610,2016-07-26 10:01:39,2016-07-26 10:59:32,10


In [13]:
grouped_df.shape

(1967, 5)

In [14]:
#Join with the activation data and cleanup
df = pd.merge(blank_df, grouped_df, how='left')
df['is_active'] = (df.total_activations.isna() == False).astype(int)
df.head()

Unnamed: 0,device,hour_slot,hour_of_day,hour_of_week,day_num,week_num,day_of_week,earliest_activation,latest_activation,total_activations,is_active
0,device_6,0,0,0,0,0,0,NaT,NaT,,0
1,device_6,1,1,1,0,0,0,NaT,NaT,,0
2,device_6,2,2,2,0,0,0,NaT,NaT,,0
3,device_6,3,3,3,0,0,0,NaT,NaT,,0
4,device_6,4,4,4,0,0,0,2016-07-01 04:23:32,2016-07-01 04:23:32,1.0,1


In [17]:
n_devices = len(all_devices)
fig, axs = plt.subplots(nrows = n_devices, figsize = (12, 12*n_devices))

for i, device in zip(range(n_devices), all_devices):
    sns.heatmap(
        df[df.device == device].pivot("hour_of_day", "day_num", "is_active"), 
        ax=axs[i]
    )

In [16]:
n_devices = len(all_devices)
fig, axs = plt.subplots(nrows = n_devices, figsize = (12, 12*n_devices))

for i, device in zip(range(n_devices), all_devices):
    sns.heatmap(
        df[df.device == device].pivot("hour_of_week", "week_num", "is_active"), 
        ax=axs[i]
    )

In [18]:
df.head()

Unnamed: 0,device,hour_slot,hour_of_day,hour_of_week,day_num,week_num,day_of_week,earliest_activation,latest_activation,total_activations,is_active
0,device_6,0,0,0,0,0,0,NaT,NaT,,0
1,device_6,1,1,1,0,0,0,NaT,NaT,,0
2,device_6,2,2,2,0,0,0,NaT,NaT,,0
3,device_6,3,3,3,0,0,0,NaT,NaT,,0
4,device_6,4,4,4,0,0,0,2016-07-01 04:23:32,2016-07-01 04:23:32,1.0,1


# Create the training data

Was the device active this time last week?

In [35]:
#Select only the required columns
active_last_week_df = df[['device', 'hour_slot', 'hour_of_week', 'week_num']]

#Create a lag_week_num variable to join on
active_last_week_df['lag_week_num'] = active_last_week_df['week_num'] - 1

#Join with last weeks data
active_last_week_df = pd.merge(
    active_last_week_df,
    df[['device', 'hour_of_week', 'week_num', 'is_active']],
    how = 'left',
    left_on = ['device', 'hour_of_week', 'lag_week_num'],
    right_on = ['device', 'hour_of_week', 'week_num']
)

#Clean up
active_last_week_df = active_last_week_df[['device', 'hour_slot', 'is_active']]
active_last_week_df.columns = ['device', 'hour_slot', 'is_active_last_week']
active_last_week_df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,device,hour_slot,is_active_last_week
9895,device_1,967,1.0
4144,device_4,1168,0.0
5420,device_3,956,1.0
600,device_6,600,0.0
6073,device_5,121,


Was the device active this time yesterday?

In [37]:
#Select only the required columns
active_yesterday_df = df[['device', 'hour_slot', 'hour_of_day', 'day_num']]

#Create a lag_day_num variable to join on
active_yesterday_df['lag_day_num'] = active_yesterday_df['day_num'] - 1

#Join with last weeks data
active_yesterday_df = pd.merge(
    active_yesterday_df,
    df[['device', 'hour_of_day', 'day_num', 'is_active']],
    how = 'left',
    left_on = ['device', 'hour_of_day', 'lag_day_num'],
    right_on = ['device', 'hour_of_day', 'day_num']
)

#Clean up
active_yesterday_df = active_yesterday_df[['device', 'hour_slot', 'is_active']]
active_yesterday_df.columns = ['device', 'hour_slot', 'is_active_yesterday']
active_yesterday_df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,device,hour_slot,is_active_yesterday
10195,device_1,1267,0.0
2246,device_2,758,0.0
7731,device_7,291,0.0
1880,device_2,392,0.0
9084,device_1,156,0.0


* What's the average activation rate for this room at this time of the day

(Potentially could try normalize within week or something like that?)

In [76]:
# Sort values by hour_slot
df.sort_values(by='hour_slot', inplace=True, ascending=True)

#Select only the required columns
hour_of_day_activation_rate_df = df[['device', 'hour_slot', 'hour_of_day', 'day_num', 'is_active']]

#Create a custom aggregation function to calculate the mean before this value
def mean_pre_now(x):
    return np.mean(x[:-1])

agg_dict = {
    'is_active': mean_pre_now,
    'hour_slot': max
}

# Group by device, hour_of_day
hour_of_day_activation_rate_df = hour_of_day_activation_rate_df.\
                                groupby(['device', 'hour_of_day']).\
                                expanding().\
                                agg(agg_dict)[['is_active', 'hour_slot']].\
                                reset_index(drop=False)

hour_of_day_activation_rate_df = hour_of_day_activation_rate_df[['device', 'hour_slot', 'is_active']]
hour_of_day_activation_rate_df.columns = ['device', 'hour_slot', 'daily_activation_rate']
hour_of_day_activation_rate_df.sample(10)

  out=out, **kwargs)


Unnamed: 0,device,hour_slot,daily_activation_rate
8396,device_6,639.0,0.653846
8204,device_6,492.0,0.65
9288,device_7,1205.0,0.0
4768,device_4,1348.0,0.0
3053,device_3,361.0,0.0
10280,device_7,1221.0,0.0
8373,device_6,87.0,0.0
4052,device_3,545.0,0.136364
9079,device_7,650.0,0.0
4327,device_3,1197.0,0.020408


* What's the average activation rate for this room at this time of the week


In [75]:
# Sort values by hour_slot
df.sort_values(by='hour_slot', inplace=True, ascending=True)

#Select only the required columns
weekly_activation_rate_df = df[['device', 'hour_slot', 'hour_of_week', 'week_num', 'is_active']]

#Create a custom aggregation function to calculate the mean before this value
def mean_pre_now(x):
    return np.mean(x[:-1])

agg_dict = {
    'is_active': mean_pre_now,
    'hour_slot': max
}

# Group by device, hour_of_day
weekly_activation_rate_df = weekly_activation_rate_df.\
                                groupby(['device', 'hour_of_week']).\
                                expanding().\
                                agg(agg_dict)[['is_active', 'hour_slot']].\
                                reset_index(drop=False)

weekly_activation_rate_df = weekly_activation_rate_df[['device', 'hour_slot', 'is_active']]
weekly_activation_rate_df.columns = ['device', 'hour_slot', 'weekly_activation_rate']
weekly_activation_rate_df.sample(10)

  out=out, **kwargs)


Unnamed: 0,device,hour_slot,weekly_activation_rate
1857,device_2,41.0,
1031,device_1,954.0,0.0
7161,device_5,638.0,0.666667
9879,device_7,1113.0,0.0
8519,device_6,1463.0,0.0
4951,device_4,222.0,0.0
7296,device_5,150.0,
3636,device_3,577.0,0.0
5287,device_4,763.0,0.0
2926,device_2,1169.0,0.5


* What's the average activation rate for this room for the last week

In [88]:
#Get average daily activations for each device
weekly_device_activation_rate_df = df.groupby(['device', 'day_num'])\
                                .agg(np.mean)['is_active']\
                                .reset_index(drop=False)
weekly_device_activation_rate_df.columns = ['device', 'day_num', 'activation_rate']

weekly_device_activation_rate_df.sort_values('day_num', ascending = True)
weekly_device_activation_rate_df['weeks_activation_rate'] = weekly_device_activation_rate_df\
                                                        .groupby('device')['activation_rate']\
                                                        .rolling(7).mean()\
                                                        .reset_index(drop=True)

#Add 1 to the day for joining with original df
weekly_device_activation_rate_df['lead_day_num'] = weekly_device_activation_rate_df['day_num'] + 1
            
weekly_device_activation_rate_df = pd.merge(
    df[['device', 'hour_slot', 'day_num']],
    weekly_device_activation_rate_df,
    how = 'left',
    left_on = ['device', 'day_num'],
    right_on = ['device', 'lead_day_num']
)

#Clean up
keep_cols = [
    'device',
    'hour_slot',
    'activation_rate',
    'weeks_activation_rate'
    ]
weekly_device_activation_rate_df = weekly_device_activation_rate_df[keep_cols]
weekly_device_activation_rate_df.columns = [
    'device',
    'hour_slot',
    'yesterdays_device_activation_rate',
    'last_weeks_device_activation_rate'
]

weekly_device_activation_rate_df.sample(10)

Unnamed: 0,device,hour_slot,yesterdays_device_activation_rate,last_weeks_device_activation_rate
3403,device_2,486,0.208333,0.22619
5453,device_3,779,0.041667,0.166667
1612,device_7,230,0.0,0.142857
7296,device_1,1042,0.333333,0.214286
1045,device_3,149,0.25,
10196,device_3,1456,0.375,0.208333
3904,device_6,557,0.0,0.369048
7064,device_1,1009,0.375,0.166667
7370,device_3,1052,0.25,0.107143
2460,device_1,351,0.0,0.0


* What's the average activation rate for all rooms for the last week

In [95]:
#Get average daily activations for each device
weekly_all_device_activation_rate_df = df.groupby('day_num')\
                                .agg(np.mean)['is_active']\
                                .reset_index(drop=False)
weekly_all_device_activation_rate_df.columns = ['day_num', 'activation_rate']

weekly_all_device_activation_rate_df.sort_values('day_num', ascending = True)
weekly_all_device_activation_rate_df['weeks_activation_rate'] = weekly_all_device_activation_rate_df['activation_rate']\
                                                        .rolling(7).mean()\
                                                        .reset_index(drop=True)

#Add 1 to the day for joining with original df
weekly_all_device_activation_rate_df['lead_day_num'] = weekly_all_device_activation_rate_df['day_num'] + 1
            
weekly_all_device_activation_rate_df = pd.merge(
    df[['device', 'hour_slot', 'day_num']],
    weekly_all_device_activation_rate_df,
    how = 'left',
    left_on = 'day_num',
    right_on = 'lead_day_num'
)

#Clean up
keep_cols = [
    'device',
    'hour_slot',
    'activation_rate',
    'weeks_activation_rate'
    ]
weekly_all_device_activation_rate_df = weekly_all_device_activation_rate_df[keep_cols]
weekly_all_device_activation_rate_df.columns = [
    'device',
    'hour_slot',
    'yesterdays_all_device_activation_rate',
    'last_weeks_all_device_activation_rate'
]

weekly_all_device_activation_rate_df.sample(10)


Unnamed: 0,device,hour_slot,yesterdays_all_device_activation_rate,last_weeks_all_device_activation_rate
4973,device_5,710,0.136905,0.181122
4964,device_2,709,0.136905,0.181122
8997,device_2,1285,0.208333,0.173469
268,device_4,38,0.14881,
7574,device_4,1082,0.005952,0.192177
8573,device_7,1224,0.0,0.144558
5080,device_1,725,0.0,0.181122
1184,device_5,169,0.375,0.204932
3547,device_3,506,0.267857,0.178571
8531,device_3,1218,0.267857,0.144558


### Join all of the dataframes together into our full modeling dataset

In [104]:
#Add in whether the device was active this time last week
modeling_df = pd.merge(
    df,
    active_last_week_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)

#Add in whether the device was active this time yesterday
modeling_df = pd.merge(
    df,
    active_yesterday_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)

#Add in average previous activation rate for this time of day & day of week
modeling_df = pd.merge(
    df,
    weekly_activation_rate_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)

#Add in average previous activation rate for this time of day & day of week
modeling_df = pd.merge(
    df,
    weekly_activation_rate_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)

#Add in average previous activation rate for this device for yesterday & last week
modeling_df = pd.merge(
    df,
    weekly_device_activation_rate_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)

#Add in average previous activation rate for all device for yesterday & last week
modeling_df = pd.merge(
    df,
    weekly_all_device_activation_rate_df,
    how = 'left',
    left_on = ['device', 'hour_slot'],
    right_on = ['device', 'hour_slot']  
)





modeling_df.sample(10)


Unnamed: 0,device,hour_slot,hour_of_day,hour_of_week,day_num,week_num,day_of_week,earliest_activation,latest_activation,total_activations,is_active,yesterdays_all_device_activation_rate,last_weeks_all_device_activation_rate
8635,device_3,1233,9,57,51,7,8,NaT,NaT,,0,0.0,0.144558
4151,device_2,593,17,89,24,3,12,2016-07-25 17:19:02,2016-07-25 17:31:38,5.0,1,0.0,0.181122
10,device_4,1,1,1,0,0,0,NaT,NaT,,0,,
2083,device_5,297,9,129,12,1,18,2016-07-13 09:10:59,2016-07-13 09:53:39,6.0,1,0.27381,0.217687
149,device_6,21,21,21,0,0,3,NaT,NaT,,0,,
1729,device_6,247,7,79,10,1,11,2016-07-11 07:04:20,2016-07-11 07:53:27,4.0,1,0.0,0.217687
10213,device_3,1459,19,115,60,8,16,NaT,NaT,,0,0.369048,0.191327
3205,device_2,457,1,121,19,2,17,NaT,NaT,,0,0.232143,0.188776
1782,device_1,254,14,86,10,1,12,NaT,NaT,,0,0.0,0.217687
1433,device_2,204,12,36,8,1,5,NaT,NaT,,0,0.244048,0.218537


### Create a class to build model(s) from a given config

In [113]:
from sklearn.ensemble import RandomForestClassifier

first_config = {
    'data': {
        'general': {
            'start_hour': 168,
            'end_hour': 1320,
            'n_cv_folds': 3,
            'test_perc': 0.2,
            'devices': 'all'
        },  
        'xy':{
            'y_col': 'is_active',
            'X_cols': [
                'device',
                'hour_of_day',
                'hour_of_week',
                'day_num',
                'week_num',
                'day_of_week',
                'yesterdays_all_device_activation_rate',
                'last_weeks_all_device_activation_rate'
            ]
        }
    },
    'model': {
        'model_class': RandomForestClassifier,
        'paramaters': {
            'n_estimators': 10,
            'max_depth': None,
            'min_samples_split': 2,
            'max_features': 'auto'
        }
    }
}

In [114]:
from sklearn.model_selection import train_test_split

class Model():
    '''
    This is the central class that will train my models
    '''
    def __init__(self, data_df, config):
        '''
        TODO: comment this whole class
        '''
        self.base_data = data_df
        self.config = config
                
        self._generate_data_splits(config['data'])
        self._train_model(config['model'])
        
        
        
    def _generate_data_splits(self, config):
        
        general_config = config['general']
        xy_config = config['xy']
        
        data = self.base_data
        rows = (data['hour_slot'] >= general_config['start_hour']) & (data['hour_slot'] <= general_config['end_hour'])
        data = data[rows]
        
        if general_config['devices'] != 'all':
            rows = data['device'] in general_config['devices']
            data = data[rows]
            
            
        y = data[xy_config['y_col']]
        X = data[xy_config['X_cols']]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=general_config['test_perc'])
        import ipdb; ipdb.set_trace()
        
        
            
Model(data_df = modeling_df, config=first_config)

--Return--
None
> [0;32m<ipython-input-114-d318f46a9247>[0m(37)[0;36m_generate_data_splits[0;34m()[0m
[0;32m     36 [0;31m        [0mX_train[0m[0;34m,[0m [0mX_test[0m[0;34m,[0m [0my_train[0m[0;34m,[0m [0my_test[0m [0;34m=[0m [0mtrain_test_split[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0mtest_size[0m[0;34m=[0m[0mgeneral_config[0m[0;34m[[0m[0;34m'test_perc'[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m[0;34m[0m[0m
[0m
ipdb> X_train.shape
(6456, 8)
ipdb> X_test.shape
(1615, 8)
ipdb> y_train.shape
(6456,)
ipdb> y_test.shape
(1615,)
ipdb> quit


BdbQuit: 