In [1]:
from dataPiping import *

import numpy as np
import pandas as pd

In [42]:
# Observation window as ISO date strings: [start, end].
# createAggrDataset keeps sessions with start <= startUserTime < end.
trainPeriod = ["2015-02-01", "2016-02-01"]
# The same two bounds converted with makeunixtime
# (presumably provided by the dataPiping star import - TODO confirm).
unixTrainPeriod = [makeunixtime(day) for day in trainPeriod]

In [12]:
''' 
    Calculate missing startUserTime values where possible, by adding the offset between another of the same customer's startUserTime and startTime values to the session's startTime
'''
def _discardNanStartUserTime(df):
    nanStartUserTimeCust = df[df.startUserTime.isnull()].customerId.unique()
    return df[~df.customerId.isin(nanStartUserTimeCust)]

def _calcStartUserTimeNan(g):
    isNan = g.startUserTime.isnull()
    
    if isNan.all():
        return g
    
    if isNan.any():
        timeOffset = g[~isNan].iloc[0].startUserTime - g[~isNan].iloc[0].startTime
        for i, row in g[isNan].iterrows():
            g.set_value(i, 'startUserTime', row.startTime + timeOffset)
        
    return g

def replaceNanStartUserTime(df):
    """Fill missing startUserTime values per customer, then drop what is left.

    Each customerId group is passed through _calcStartUserTimeNan; customers
    that still have a null startUserTime afterwards are removed entirely by
    _discardNanStartUserTime.

    Parameters
    ----------
    df : DataFrame with `customerId`, `startTime` and `startUserTime` columns.

    Returns
    -------
    DataFrame carrying the original row index, without null startUserTime.
    """
    # group_keys=False keeps the original row index. With the pandas >= 2.0
    # default (group_keys=True) customerId would be added as an index level
    # while also remaining a column, which makes a later
    # groupby('customerId') on the result ambiguous.
    filled = df.groupby('customerId', group_keys=False).apply(_calcStartUserTimeNan)
    return _discardNanStartUserTime(filled)

In [49]:
'''
    Aggregate each customer's sessions into one row of features
'''

def _aggrUser(user):
    nSessions = len(user)
    period = unixTrainPeriod[1] - user.startTime.values[0]
    frequency = 0
    if period > 0:
        frequency = nSessions/period
    recency = unixTrainPeriod[1] - user.endTime.values[-1]
    avgViewOnly = user.viewonly.sum()/nSessions
    avgChangeThumbnail = user.changeThumbnail.sum()/nSessions
    avgImageZoom = user.imageZoom.sum()/nSessions
    avgWatchVideo = user.watchVideo.sum()/nSessions
    avgView360 = user.view360.sum()/nSessions
    mobile = (user.device == 'mobile').sum()/nSessions
    desktop = (user.device == 'desktop').sum()/nSessions
    android = (user.device == 'android').sum()/nSessions
    ios = (user.device == 'ios').sum()/nSessions
    returnTime = user.returnTime.values[-1]
    
    return pd.DataFrame({
        'nSessions': nSessions, 
        'period': period,
        'frequency': frequency,
        'recency': recency,
        'avgViewOnly': avgViewOnly,
        'avgChangeThumbnail': avgChangeThumbnail,
        'avgImageZoom': avgImageZoom,
        'avgWatchVideo': avgWatchVideo,
        'avgView360': avgView360,
        'device[mobile]': mobile,
        'device[desktop]': desktop,
        'device[android]': android,
        'device[ios]': ios,
        'returnTime': returnTime
    },index=[0])

def aggrUsers(df):
    """Build the per-customer feature table.

    Groups `df` by customerId and applies _aggrUser to each group, returning
    whatever groupby.apply assembles from the per-customer one-row frames.
    """
    return df.groupby('customerId').apply(_aggrUser)

In [71]:
def createAggrDataset():
    """Build the aggregated per-customer dataset and pickle it as X and y.

    Steps:
      1. load the merged session frame from '../../data/mergedSessionDF.pkl';
      2. fill or discard missing startUserTime values (replaceNanStartUserTime);
      3. keep sessions whose startUserTime lies in
         [unixTrainPeriod[0], unixTrainPeriod[1]);
      4. aggregate to one feature row per customer (aggrUsers);
      5. drop customers whose returnTime is null;
      6. write the features to '../../data/aggregate/aggrFebNoNanX.pkl' and
         the returnTime target to '../../data/aggregate/aggrFebNoNanY.pkl'.

    Returns nothing; the result is the two pickle files.
    """
    sessions = pd.read_pickle('../../data/mergedSessionDF.pkl')
    sessions = replaceNanStartUserTime(sessions)

    # Use the same precomputed bounds that _aggrUser measures period/recency
    # against, instead of converting trainPeriod a second time.
    periodStart = unixTrainPeriod[0]
    periodEnd = unixTrainPeriod[1]
    inPeriod = (sessions.startUserTime >= periodStart) & (sessions.startUserTime < periodEnd)

    # The (customerId, 0) index produced by aggrUsers is not a feature:
    # drop it directly rather than materialising and deleting helper columns.
    features = aggrUsers(sessions[inPeriod]).reset_index(drop=True)

    # Non in-place filtering + reindexing avoids mutating a slice of `features`.
    labelled = features[features.returnTime.notnull()].reset_index(drop=True)

    y = labelled.returnTime
    X = labelled.drop('returnTime', axis=1)

    X.to_pickle('../../data/aggregate/aggrFebNoNanX.pkl')
    y.to_pickle('../../data/aggregate/aggrFebNoNanY.pkl')

In [None]:
def readAggrData():
    