In [1]:
from dataPiping import *

import numpy as np
import pandas as pd

In [42]:
# Training/observation window given as [start, end] date strings.
trainPeriod = ["2015-02-01", "2016-02-01"]
# The same two bounds converted element-wise with makeunixtime
# (presumably provided by the dataPiping star import -- confirm).
unixTrainPeriod = [makeunixtime(bound) for bound in trainPeriod]

In [12]:
''' 
    Fill in missing startUserTime values where possible, by applying the offset between one of the same user's known startUserTime values and its startTime
'''
def _discardNanStartUserTime(df):
    nanStartUserTimeCust = df[df.startUserTime.isnull()].customerId.unique()
    return df[~df.customerId.isin(nanStartUserTimeCust)]

def _calcStartUserTimeNan(g):
    isNan = g.startUserTime.isnull()
    
    if isNan.all():
        return g
    
    if isNan.any():
        timeOffset = g[~isNan].iloc[0].startUserTime - g[~isNan].iloc[0].startTime
        for i, row in g[isNan].iterrows():
            g.set_value(i, 'startUserTime', row.startTime + timeOffset)
        
    return g

def replaceNanStartUserTime(df):
    '''
        Fill missing startUserTime values per customer, then drop customers
        that still have a null startUserTime.

        df: session frame with 'customerId', 'startTime' and 'startUserTime'
            columns.
        Returns a frame with the same columns, containing only customers
        whose startUserTime is known for every row.
    '''
    # group_keys=False keeps the result indexed like the input. Without it,
    # newer pandas versions prepend 'customerId' as an index level, which
    # makes a later groupby('customerId') ambiguous (index level vs column).
    filled = df.groupby('customerId', group_keys=False).apply(_calcStartUserTimeNan)
    return _discardNanStartUserTime(filled)

In [49]:
'''
    Aggregate users
'''

def _aggrUser(user):
    '''
        Collapse one customer's session rows into a single-row feature frame.

        user: that customer's rows; the first/last rows are used as the
              earliest/latest session (assumes rows are in chronological
              order -- TODO confirm against the input data).

        Features (all relative to the module-level unixTrainPeriod[1]):
          nSessions  - number of rows
          period     - unixTrainPeriod[1] minus the first row's startTime
          frequency  - nSessions / period, or 0 when period is not positive
          recency    - unixTrainPeriod[1] minus the last row's endTime
          avg*       - column sum divided by nSessions
          device[*]  - fraction of rows whose device equals that label
          returnTime - the last row's returnTime
        Returns a one-row DataFrame with index [0].
    '''
    periodEnd = unixTrainPeriod[1]
    sessionCount = len(user)

    def deviceShare(label):
        # Fraction of this customer's sessions made from the given device.
        return (user.device == label).sum()/sessionCount

    span = periodEnd - user.startTime.values[0]
    # Guard against a zero or negative span before dividing.
    rate = sessionCount/span if span > 0 else 0

    features = {
        'nSessions': sessionCount, 
        'period': span,
        'frequency': rate,
        'recency': periodEnd - user.endTime.values[-1],
        'avgViewOnly': user.viewonly.sum()/sessionCount,
        'avgChangeThumbnail': user.changeThumbnail.sum()/sessionCount,
        'avgImageZoom': user.imageZoom.sum()/sessionCount,
        'avgWatchVideo': user.watchVideo.sum()/sessionCount,
        'avgView360': user.view360.sum()/sessionCount,
        'device[mobile]': deviceShare('mobile'),
        'device[desktop]': deviceShare('desktop'),
        'device[android]': deviceShare('android'),
        'device[ios]': deviceShare('ios'),
        'returnTime': user.returnTime.values[-1]
    }
    return pd.DataFrame(features, index=[0])

def aggrUsers(df):
    '''
        Build one aggregate feature row per customer.

        df: session frame with a 'customerId' column plus the columns read
            by _aggrUser.
        Returns the result of applying _aggrUser to each customerId group.
    '''
    perCustomer = df.groupby('customerId')
    return perCustomer.apply(_aggrUser)

In [71]:
def createAggrDataset():
    '''
        Build the per-customer aggregate dataset and pickle it to disk.

        Steps:
          1. read the merged session frame from '../../data/mergedSessionDF.pkl'
          2. fill / discard null startUserTime values
          3. keep sessions whose startUserTime lies in
             [trainPeriod[0], trainPeriod[1])
          4. aggregate to one row per customer
          5. drop customers whose returnTime is null
          6. write the features (X) and the returnTime target (y) to
             '../../data/aggregate/'
        Returns nothing; the outputs are the two pickle files.
    '''
    sessions = replaceNanStartUserTime(pd.read_pickle('../../data/mergedSessionDF.pkl'))

    windowStart = makeunixtime(trainPeriod[0])
    windowEnd = makeunixtime(trainPeriod[1])
    inWindow = (sessions.startUserTime >= windowStart) & (sessions.startUserTime < windowEnd)

    # The groupby/apply result is indexed by (customerId, 0); flatten it and
    # discard both former index levels.
    perUser = aggrUsers(sessions[inWindow]).reset_index()
    perUser = perUser.drop(['customerId', 'level_1'], axis=1)

    # Only customers with a known returnTime can be used as labelled rows.
    labelled = perUser[perUser.returnTime.notnull()].reset_index(drop=True)

    y = labelled.returnTime
    X = labelled.drop('returnTime', axis=1)

    X.to_pickle('../../data/aggregate/aggrFebNoNanX.pkl')
    y.to_pickle('../../data/aggregate/aggrFebNoNanY.pkl')

In [73]:
# NOTE(review): no visible cell binds X at notebook level -- createAggrDataset
# keeps its X local and returns nothing. Presumably X comes from
# `X, y = readAggrData()` run in a cell not shown here; confirm, otherwise this
# cell fails on a fresh kernel.
X

Unnamed: 0,avgChangeThumbnail,avgImageZoom,avgView360,avgViewOnly,avgWatchVideo,device[android],device[desktop],device[ios],device[mobile],frequency,nSessions,period,recency
0,16.687500,2.062500,0.000000,11.812500,0.250000,0.0,0.500000,0.250000,0.250000,5.229271e-07,16,30596997,5.451250e+06
1,2.133333,0.000000,0.000000,2.733333,0.000000,0.0,0.400000,0.200000,0.400000,4.794448e-07,15,31286186,5.342800e+04
2,7.333333,0.000000,0.000000,6.933333,0.000000,0.0,0.200000,0.400000,0.400000,6.762549e-07,15,22180984,1.215075e+06
3,3.699717,0.473088,0.008499,3.317280,0.237960,0.0,0.756374,0.203966,0.039660,1.122138e-05,353,31457816,5.110000e+03
4,4.500000,0.000000,0.000000,3.666667,0.000000,0.0,0.333333,0.166667,0.500000,3.211868e-07,6,18680717,8.998177e+06
5,1.285714,0.000000,0.000000,1.285714,0.000000,0.0,0.000000,0.000000,1.000000,6.522671e-07,7,10731800,2.611417e+06
6,5.550898,0.173653,0.000000,6.041916,0.155689,0.0,0.077844,0.113772,0.802395,5.303685e-06,167,31487543,8.663264e+05
7,1.807692,0.250000,0.000000,3.480769,0.000000,0.0,0.865385,0.000000,0.134615,1.796376e-06,52,28947168,2.609470e+05
8,0.791667,0.008333,0.000000,8.883333,0.000000,0.0,0.891667,0.075000,0.033333,3.811596e-06,120,31482878,1.508826e+06
9,0.686275,0.176471,0.000000,23.686275,0.029412,0.0,0.019608,0.980392,0.000000,3.259937e-06,102,31288947,1.448500e+05


In [72]:
def readAggrData():
    '''
        Load the pickled aggregate dataset.

        Returns a (features, target) tuple read from
        '../../data/aggregate/aggrFebNoNanX.pkl' and
        '../../data/aggregate/aggrFebNoNanY.pkl' respectively.
    '''
    features = pd.read_pickle('../../data/aggregate/aggrFebNoNanX.pkl')
    target = pd.read_pickle('../../data/aggregate/aggrFebNoNanY.pkl')
    return (features, target)