In [7]:
from dataPiping import *

import numpy as np
import pandas as pd

In [8]:
# Observation (training) window as date strings; the dataset builders below
# treat it as half-open: startUserTime >= trainPeriod[0] and < trainPeriod[1].
trainPeriod = ["2015-02-01", "2016-02-01"]
# Same two bounds passed through makeunixtime (imported from dataPiping;
# presumably yields unix timestamps -- confirm against that module).
unixTrainPeriod = [makeunixtime(day) for day in trainPeriod]

In [9]:
''' 
    Calculate missing startUserTime values where possible (by applying the offset between startUserTime and startTime taken from one of the same user's other sessions)
'''
def _discardNanStartUserTime(df):
    """Drop every row of any customer that still has a missing startUserTime.

    A customer is removed entirely (all of their rows) as soon as one of their
    rows has a null startUserTime. The remaining rows keep their original
    index labels.
    """
    missing = df.startUserTime.isnull()
    affectedCustomers = df[missing].customerId.unique()
    keep = ~df.customerId.isin(affectedCustomers)
    return df[keep]

def _calcStartUserTimeNan(g):
    """Fill missing startUserTime values for one customer's sessions.

    The offset (startUserTime - startTime) is taken from the customer's first
    row that has a startUserTime, and every row with a missing startUserTime
    is set to its own startTime plus that offset.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single customer; must contain the columns 'startTime' and
        'startUserTime'.

    Returns
    -------
    pandas.DataFrame
        `g` itself when nothing is missing or when every startUserTime is
        missing (no reference row to derive an offset from); otherwise a
        filled copy with the same index. The input frame is never modified.
    """
    isNan = g.startUserTime.isnull()

    # Nothing to fill, or nothing to derive the offset from.
    if not isNan.any() or isNan.all():
        return g

    reference = g[~isNan].iloc[0]
    timeOffset = reference.startUserTime - reference.startTime

    # DataFrame.set_value no longer exists in pandas >= 1.0; a single masked
    # .loc assignment does the same job. Work on a copy so the group object
    # handed in by groupby.apply is not mutated, and use positional
    # (ndarray) masks/values so duplicate index labels cannot misalign.
    mask = isNan.values
    g = g.copy()
    g.loc[mask, 'startUserTime'] = g.loc[mask, 'startTime'].values + timeOffset

    return g

def replaceNanStartUserTime(df):
    """Fill missing startUserTime values per customer, then drop customers that cannot be fixed.

    Each customer's rows are passed to _calcStartUserTimeNan, which fills
    missing startUserTime values from that customer's other rows. Customers
    that still have a missing startUserTime afterwards are removed entirely
    by _discardNanStartUserTime.

    Parameters
    ----------
    df : pandas.DataFrame
        Session rows with at least 'customerId', 'startTime' and
        'startUserTime' columns.

    Returns
    -------
    pandas.DataFrame
        The filled and filtered rows, indexed like the input.
    """
    # group_keys=False keeps the original row index. With the default
    # (group_keys=True) pandas >= 2.0 prepends customerId as an index level,
    # which makes 'customerId' both an index level and a column and breaks
    # any later groupby('customerId'). Older pandas versions already returned
    # the original index here, so the result is unchanged for them.
    df = df.groupby('customerId', group_keys=False).apply(_calcStartUserTimeNan)
    return _discardNanStartUserTime(df)

In [10]:
'''
    Aggregate users
'''

def _aggrUser(user):
    """Collapse one customer's session rows into a single-row feature frame.

    Relies on row order: the first row is used as the earliest session and
    the last row as the most recent one. Time features are measured against
    the end of the training period (unixTrainPeriod[1]).

    Returns a one-row DataFrame (index [0]) with the session count, period,
    frequency (sessions per unit of period, 0 when the period is not
    positive), recency, per-session averages of the interaction columns,
    the share of sessions per device value, and the last row's returnTime.
    """
    trainEnd = unixTrainPeriod[1]
    nSessions = len(user)

    period = trainEnd - user.startTime.values[0]
    frequency = nSessions/period if period > 0 else 0

    def perSession(column):
        # Column total divided by the number of sessions.
        return column.sum()/nSessions

    def deviceShare(name):
        # Fraction of this customer's sessions whose device equals `name`.
        return (user.device == name).sum()/nSessions

    return pd.DataFrame({
        'nSessions': nSessions,
        'period': period,
        'frequency': frequency,
        'recency': trainEnd - user.endTime.values[-1],
        'avgViewOnly': perSession(user.viewonly),
        'avgChangeThumbnail': perSession(user.changeThumbnail),
        'avgImageZoom': perSession(user.imageZoom),
        'avgWatchVideo': perSession(user.watchVideo),
        'avgView360': perSession(user.view360),
        'device[mobile]': deviceShare('mobile'),
        'device[desktop]': deviceShare('desktop'),
        'device[android]': deviceShare('android'),
        'device[ios]': deviceShare('ios'),
        'returnTime': user.returnTime.values[-1]
    }, index=[0])

def aggrUsers(df):
    """Aggregate session rows to one feature row per customer via _aggrUser."""
    return df.groupby('customerId').apply(_aggrUser)

In [11]:
def createAggrDataset():
    """Build the per-customer aggregate dataset and pickle it as X / y.

    Steps: load the merged session frame, repair or discard missing
    startUserTime values, keep sessions whose startUserTime falls inside
    trainPeriod (start inclusive, end exclusive), aggregate per customer,
    drop customers without a returnTime, and write the features (X) and the
    returnTime target (y) to ../../data/aggregate/.
    """
    sessions = replaceNanStartUserTime(pd.read_pickle('../../data/mergedSessionDF.pkl'))

    windowStart = makeunixtime(trainPeriod[0])
    windowEnd = makeunixtime(trainPeriod[1])
    inWindow = (sessions.startUserTime >= windowStart) & (sessions.startUserTime < windowEnd)

    # groupby.apply leaves (customerId, <unnamed level>) as the index; turn
    # both levels into columns and discard them.
    perUser = aggrUsers(sessions[inWindow]).reset_index()
    perUser = perUser.drop(['customerId', 'level_1'], axis=1)

    # Only customers with a known returnTime; renumber rows from 0.
    withReturn = perUser[perUser.returnTime.notnull()].reset_index(drop=True)

    y = withReturn.returnTime
    X = withReturn.drop(['returnTime'], axis=1)

    X.to_pickle('../../data/aggregate/aggrFebNoNanX.pkl')
    y.to_pickle('../../data/aggregate/aggrFebNoNanY.pkl')

In [12]:
def readAggrData():
    """Load the pickled aggregate dataset and return it as (X, y)."""
    features = pd.read_pickle('../../data/aggregate/aggrFebNoNanX.pkl')
    target = pd.read_pickle('../../data/aggregate/aggrFebNoNanY.pkl')
    return features, target

In [13]:
def createAggrCoxPhDataset():
    """Build and pickle the per-customer CoxPH datasets (uncensored and censored).

    Pipeline:
      1. Load the merged session frame and repair/discard missing
         startUserTime values.
      2. Add 'previousGap': the returnTime of the same customer's preceding
         row, or -1 when there is none.
      3. Keep sessions whose startUserTime lies in trainPeriod (start
         inclusive, end exclusive) and aggregate per customer with
         aggrUsersCoxPh.
      4. Write two datasets to ../../data/aggregate/:
           * "NoNan": only customers with a known returnTime.
           * "Cens":  all customers, with a boolean 'observed' column; for
             unobserved customers returnTime is set to
             makeunixtime('2016-07-01') - endTime.
         In both, 'endTime' is removed from the features and returnTime is
         stored separately as y.
    """
    df = pd.read_pickle('../../data/mergedSessionDF.pkl')

    df = replaceNanStartUserTime(df)

    # Shift within each customer: a plain df.returnTime.shift(1) would hand a
    # customer's first session the returnTime of whichever other customer's
    # row happens to precede it in the frame.
    df['previousGap'] = df.groupby('customerId')['returnTime'].shift(1)
    # -1 is the sentinel for "no previous gap available".
    df.loc[df.previousGap.isnull(), 'previousGap'] = -1

    windowStart = makeunixtime(trainPeriod[0])
    windowEnd = makeunixtime(trainPeriod[1])
    obs_df = df[(df.startUserTime >= windowStart) & (df.startUserTime < windowEnd)]

    # groupby.apply leaves (customerId, <unnamed level>) as the index; turn
    # both levels into columns and discard them.
    obs_df_aggr = aggrUsersCoxPh(obs_df).reset_index()
    obs_df_aggr = obs_df_aggr.drop(['customerId', 'level_1'], axis=1)

    observed = ~obs_df_aggr.returnTime.isnull()

    # Uncensored set: new frame via drop() rather than deleting a column on a
    # filtered slice of obs_df_aggr.
    obs_df_aggr_nonan = obs_df_aggr[observed].drop(['endTime'], axis=1)

    # Censored set: unobserved customers get the time from their last endTime
    # up to the cut-off date as returnTime.
    # NOTE(review): '2016-07-01' is presumably the end of data collection -- confirm.
    censorTime = makeunixtime('2016-07-01')
    obs_df_aggr_cens = obs_df_aggr.copy()
    obs_df_aggr_cens['observed'] = observed
    obs_df_aggr_cens.loc[~observed, 'returnTime'] = censorTime - obs_df_aggr_cens.loc[~observed, 'endTime']
    obs_df_aggr_cens = obs_df_aggr_cens.drop(['endTime'], axis=1)

    y_nonan = obs_df_aggr_nonan.returnTime
    X_nonan = obs_df_aggr_nonan.drop(['returnTime'], axis=1)

    y_cens = obs_df_aggr_cens.returnTime
    X_cens = obs_df_aggr_cens.drop(['returnTime'], axis=1)

    X_nonan.to_pickle('../../data/aggregate/aggrFebCoxPhNoNanX.pkl')
    y_nonan.to_pickle('../../data/aggregate/aggrFebCoxPhNoNanY.pkl')
    X_cens.to_pickle('../../data/aggregate/aggrFebCoxPhCensX.pkl')
    y_cens.to_pickle('../../data/aggregate/aggrFebCoxPhCensY.pkl')

In [10]:
# Regenerate the aggrFebCoxPh* pickles under ../../data/aggregate/.
# NOTE(review): createAggrCoxPhDataset calls aggrUsersCoxPh, which is defined in a
# later cell of this notebook; on a fresh kernel that cell has to run first.
createAggrCoxPhDataset()

In [14]:
def readAggrCoxPhData(include_cens=False):
    """Load a pickled CoxPH dataset and return it as (X, y).

    include_cens=True reads the censored ("Cens") files, otherwise the
    uncensored ("NoNan") files are read.
    """
    if include_cens:
        xPath = '../../data/aggregate/aggrFebCoxPhCensX.pkl'
        yPath = '../../data/aggregate/aggrFebCoxPhCensY.pkl'
    else:
        xPath = '../../data/aggregate/aggrFebCoxPhNoNanX.pkl'
        yPath = '../../data/aggregate/aggrFebCoxPhNoNanY.pkl'

    X = pd.read_pickle(xPath)
    y = pd.read_pickle(yPath)
    return X, y

In [15]:
'''
    Aggregate users (freq, noVisits, prevGap, twrt)
'''

def _aggrUserCoxPh(user):
    """Collapse one customer's session rows into a single-row CoxPH feature frame.

    Relies on row order: the first row is used as the earliest session and
    the last row as the most recent one. The "session*" features describe
    the last row only. Times are measured against the end of the training
    period (unixTrainPeriod[1]).

    twrt is the average of the returnTime values of all rows but the last,
    weighted by 1 / (training end - startUserTime) of those rows; it is -1
    for customers with a single row. A NaN avgPrice on the last row is
    reported as 0.

    Returns a one-row DataFrame (index [0]).
    """
    trainEnd = unixTrainPeriod[1]
    nSessions = len(user)

    period = trainEnd - user.startTime.values[0]
    frequency = nSessions/period if period > 0 else 0

    startUserTimes = user.startUserTime.values
    if len(startUserTimes) > 1:
        # Sessions closer to the end of the training period weigh more.
        timeSince = 1/(trainEnd - startUserTimes[:-1])
        twrt = np.average(user.returnTime.values[:-1], weights=timeSince)
    else:
        twrt = -1

    lastAvgPrice = user.avgPrice.values[-1]
    sessionAvgPrice = 0 if np.isnan(lastAvgPrice) else lastAvgPrice

    lastDevice = user.device.values[-1]

    return pd.DataFrame({
        'sessionNDivisions': user.numberdivisions.values[-1],
        'sessionAvgPrice': sessionAvgPrice,
        'sessionViewonly': user.viewonly.values[-1],
        'sessionChangeThumbnail': user.changeThumbnail.values[-1],
        'sessionImageZoom': user.imageZoom.values[-1],
        'sessionWatchVideo': user.watchVideo.values[-1],
        'sessionView360': user.view360.values[-1],
        'sessionDevice[mobile]': int(lastDevice == 'mobile'),
        'sessionDevice[desktop]': int(lastDevice == 'desktop'),
        'sessionDevice[android]': int(lastDevice == 'android'),
        'sessionDevice[ios]': int(lastDevice == 'ios'),
        'frequency': frequency,
        'noSessions': nSessions,
        'prevGap': user.previousGap.values[-1],
        'twrt': twrt,
        'returnTime': user.returnTime.values[-1],
        'endTime': user.endTime.values[-1]
    }, index=[0])

def aggrUsersCoxPh(df):
    """Aggregate session rows to one CoxPH feature row per customer via _aggrUserCoxPh."""
    return df.groupby('customerId').apply(_aggrUserCoxPh)

In [18]:
def createAggrCoxPhDataset2():
    """Build and pickle the "2" variant of the per-customer CoxPH datasets.

    Pipeline:
      1. Load the merged session frame and repair/discard missing
         startUserTime values.
      2. Add 'previousGap': the returnTime of the same customer's preceding
         row, or -1 when there is none.
      3. Keep sessions whose startUserTime lies in trainPeriod (start
         inclusive, end exclusive) and aggregate per customer with
         aggrUsersCoxPh.
      4. Write two datasets to ../../data/aggregate/ (file names end in "2"):
           * "NoNan": only customers with a known returnTime, returnTime
             left as aggregated.
           * "Cens":  all customers, with a boolean 'observed' column and
             returnTime re-based to start at the end of the training period:
               - unobserved: makeunixtime('2016-07-01') - unixTrainPeriod[1]
               - observed:   returnTime - (unixTrainPeriod[1] - endTime)
         In both, 'endTime' is removed from the features and returnTime is
         stored separately as y.
    """
    df = pd.read_pickle('../../data/mergedSessionDF.pkl')

    df = replaceNanStartUserTime(df)

    # Shift within each customer: a plain df.returnTime.shift(1) would hand a
    # customer's first session the returnTime of whichever other customer's
    # row happens to precede it in the frame.
    df['previousGap'] = df.groupby('customerId')['returnTime'].shift(1)
    # -1 is the sentinel for "no previous gap available".
    df.loc[df.previousGap.isnull(), 'previousGap'] = -1

    windowStart = makeunixtime(trainPeriod[0])
    windowEnd = makeunixtime(trainPeriod[1])
    obs_df = df[(df.startUserTime >= windowStart) & (df.startUserTime < windowEnd)]

    # groupby.apply leaves (customerId, <unnamed level>) as the index; turn
    # both levels into columns and discard them.
    obs_df_aggr = aggrUsersCoxPh(obs_df).reset_index()
    obs_df_aggr = obs_df_aggr.drop(['customerId', 'level_1'], axis=1)

    observed = ~obs_df_aggr.returnTime.isnull()

    # Uncensored set: new frame via drop() rather than deleting a column on a
    # filtered slice of obs_df_aggr.
    # NOTE(review): returnTime is NOT re-based here, unlike the censored set
    # below -- confirm that this asymmetry is intended.
    obs_df_aggr_nonan = obs_df_aggr[observed].drop(['endTime'], axis=1)

    # Censored set, with the clock starting at the end of the training period.
    # NOTE(review): '2016-07-01' is presumably the end of data collection -- confirm.
    trainEnd = unixTrainPeriod[1]
    censorTime = makeunixtime('2016-07-01')
    obs_df_aggr_cens = obs_df_aggr.copy()
    obs_df_aggr_cens['observed'] = observed
    obs_df_aggr_cens.loc[~observed, 'returnTime'] = censorTime - trainEnd
    obs_df_aggr_cens.loc[observed, 'returnTime'] = (
        obs_df_aggr_cens.loc[observed, 'returnTime']
        - (trainEnd - obs_df_aggr_cens.loc[observed, 'endTime'])
    )
    obs_df_aggr_cens = obs_df_aggr_cens.drop(['endTime'], axis=1)

    y_nonan = obs_df_aggr_nonan.returnTime
    X_nonan = obs_df_aggr_nonan.drop(['returnTime'], axis=1)

    y_cens = obs_df_aggr_cens.returnTime
    X_cens = obs_df_aggr_cens.drop(['returnTime'], axis=1)

    X_nonan.to_pickle('../../data/aggregate/aggrFebCoxPhNoNanX2.pkl')
    y_nonan.to_pickle('../../data/aggregate/aggrFebCoxPhNoNanY2.pkl')
    X_cens.to_pickle('../../data/aggregate/aggrFebCoxPhCensX2.pkl')
    y_cens.to_pickle('../../data/aggregate/aggrFebCoxPhCensY2.pkl')

In [19]:
# Regenerate the aggrFebCoxPh*2 pickles under ../../data/aggregate/.
createAggrCoxPhDataset2()

In [5]:
'''
    Aggregate users (freq, noVisits, prevGap, twrt)
'''

def _aggrUserCoxPh2(user):
    """Collapse one customer's session rows into a single-row CoxPH feature frame.

    Relies on row order: the first row is used as the earliest session and
    the last row as the most recent one. The "session*" features describe
    the last row only. Times are measured against the end of the training
    period (unixTrainPeriod[1]).

    twrt is the average of the returnTime values of all rows but the last,
    weighted by 1 / (training end - startUserTime) of those rows; it is -1
    for customers with a single row. A NaN avgPrice on the last row is
    reported as 0.

    Returns a one-row DataFrame (index [0]).
    """
    trainEnd = unixTrainPeriod[1]
    nSessions = len(user)

    period = trainEnd - user.startTime.values[0]
    frequency = nSessions/period if period > 0 else 0

    startUserTimes = user.startUserTime.values
    if len(startUserTimes) > 1:
        # Sessions closer to the end of the training period weigh more.
        timeSince = 1/(trainEnd - startUserTimes[:-1])
        twrt = np.average(user.returnTime.values[:-1], weights=timeSince)
    else:
        twrt = -1

    lastAvgPrice = user.avgPrice.values[-1]
    sessionAvgPrice = 0 if np.isnan(lastAvgPrice) else lastAvgPrice

    lastDevice = user.device.values[-1]

    return pd.DataFrame({
        'sessionNDivisions': user.numberdivisions.values[-1],
        'sessionAvgPrice': sessionAvgPrice,
        'sessionViewonly': user.viewonly.values[-1],
        'sessionChangeThumbnail': user.changeThumbnail.values[-1],
        'sessionImageZoom': user.imageZoom.values[-1],
        'sessionWatchVideo': user.watchVideo.values[-1],
        'sessionView360': user.view360.values[-1],
        'sessionDevice[mobile]': int(lastDevice == 'mobile'),
        'sessionDevice[desktop]': int(lastDevice == 'desktop'),
        'sessionDevice[android]': int(lastDevice == 'android'),
        'sessionDevice[ios]': int(lastDevice == 'ios'),
        'frequency': frequency,
        'noSessions': nSessions,
        'prevGap': user.previousGap.values[-1],
        'twrt': twrt,
        'returnTime': user.returnTime.values[-1],
        'endTime': user.endTime.values[-1]
    }, index=[0])

def aggrUsersCoxPh2(df):
    """Aggregate session rows to one CoxPH feature row per customer via _aggrUserCoxPh2.

    Parameters
    ----------
    df : pandas.DataFrame
        Session rows containing a 'customerId' column plus the columns read
        by _aggrUserCoxPh2.

    Returns
    -------
    pandas.DataFrame
        One row per customer, indexed by (customerId, 0).
    """
    # Use the matching "2" per-user aggregator so that changes made to
    # _aggrUserCoxPh2 actually take effect here (not _aggrUserCoxPh).
    aggr = df.groupby('customerId').apply(_aggrUserCoxPh2)
    return aggr