In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
def score(res, label, pre):
    """
    Score predictions against actual labels.

    The score is 1 minus the average over all rows of |predicted - actual| / 7.

    res: A dataframe containing both predicted and actual labels.
         Side effect: a column 'diff' holding the per-row scaled error is
         added to (or overwritten in) this dataframe.
    label: The column name for the actual label in the dataframe.
    pre: The column name for the predicted label in the dataframe.
    """
    # Per-row absolute error, scaled by 7, stored on the caller's frame.
    scaled_error = ((res[pre] - res[label]) / 7).abs()
    res['diff'] = scaled_error

    # Average scaled error over the rows, subtracted from 1.
    mean_error = sum(res['diff']) / len(res)
    return 1 - mean_error


In [3]:
def get_label(row):
    """
    Compute the 7-day retention label for one (user, date) row.

    The label is the number of distinct login dates that fall in the 7 days
    after row['date'], i.e. in (date, date + 7]. Rows for which a label should
    not be used are marked with the sentinel -999 so the caller can drop them.

    Reads the module-level dict `user_enddate` (user_id -> date to be predicted),
    which must be defined before this function is called.

    row: A row in df_ with 'date_max', 'date', 'date_list' and 'user_id'.
    Returns the count of logins in the following 7 days, or -999.
    """
    max_date = row['date_max']    # Latest date of login record
    label_date = row['date']      # The date that needs to be labeled
    date_list = row['date_list']  # Specific list of login dates

    def _count_next_week():
        # Distinct login dates strictly after label_date and at most 7 days later.
        return sum(1 for x in set(date_list) if label_date < x < label_date + 8)

    # Most common case: the login history extends at least 7 days past label_date
    # (e.g. last login 130, label_date 120), so the full window is observed.
    if label_date + 7 <= max_date:
        return _count_next_week()

    # From here on the history ends less than 7 days after label_date.
    if label_date > 153:
        # After 153 a short history may be due to missing records rather than no login.
        # Users without a date to be predicted are dropped.
        if row['user_id'] not in user_enddate:
            return -999
        # If the label window would reach past the date to be predicted for this user
        # (e.g. predict 165, label_date 160: 165 < 160 + 7), the label would use the
        # future to predict the past, so the row is dropped.
        if user_enddate[row['user_id']] < label_date + 7:
            return -999
        # Otherwise count on whatever records exist, complete or not.
        return _count_next_week()

    # Before 130 a missing window is treated as "no record" and is far from the
    # period to be predicted, so the row is dropped.
    if label_date < 130:
        return -999

    # label_date in 130..153: count on the existing records.
    return _count_next_week()

In [4]:
# Fixed window of label dates: 131 up to and including 153.
interval = list(range(131, 154))

In [5]:
def extend_list(row):
    """
    Build the candidate label dates for one user.

    The result is the UNION (not the intersection) of
      * every day from the user's earliest login up to 7 days before the date
        to be predicted for that user, and
      * the fixed window 131..153 (inclusive),
    returned as a sorted list without duplicates so the output order is
    reproducible.

    Before running this function, you must first set the module-level dict
    `user_enddate` (user_id -> date to be predicted).

    row: A row of df_group; reads 'date_min' and 'user_id'.
    """
    first_login = row['date_min']

    # Fixed label-date window shared by all users.
    fixed_window = range(131, 154)

    # All login records used must be earlier than 7 days before this test date.
    end_date = user_enddate[row['user_id']]
    personal_window = range(first_login, end_date - 6)

    return sorted(set(personal_window).union(fixed_window))

In [6]:
# Raw inputs read from the local data/ directory: the app launch log and the two test files.
df = pd.read_csv('data/app_launch_logs.csv')
test_a = pd.read_csv('data/test_A.csv')
test = pd.read_csv('data/test_B.csv')

In [7]:
# Rename the test columns to the key names used by the merges below.
# NOTE(review): assumes test_B.csv has exactly two columns, in (user, date) order — confirm.
test.columns = ['user_id','date']

In [8]:
# Keep one row per (user_id, date), ordered by user and date, with a fresh index.
df = (
    df.sort_values(['user_id', 'date'])
    .reset_index(drop=True)
    [['user_id', 'date']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Keep only users that appear in test_a (the index is not reset after this filter).
known_user = df['user_id'].isin(test_a['user_id'])
df = df[known_user]

In [9]:
# One row per user; 'date' becomes the list of that user's login dates.
df_group = df.groupby('user_id').agg(list).reset_index()
# Latest and earliest login date per user.
df_group['date_max'] = [max(days) for days in df_group['date']]
df_group['date_min'] = [min(days) for days in df_group['date']]
# Lookup user_id -> end_date taken from test_a.
user_enddate = dict(zip(test_a['user_id'], test_a['end_date']))

In [10]:
# Candidate label dates per user (one list per row), built row-wise by extend_list.
# extend_list reads user_enddate, so that dict must already be defined.
df_group['date_all'] = df_group.apply(extend_list, axis=1)

In [11]:
# One row per (user, candidate date): explode the per-user 'date_all' lists.
df_ = df_group.explode('date_all')
# After the rename, 'date_list' holds the user's login dates and 'date' the candidate date.
df_.rename(columns = {'date':'date_list','date_all':'date'},inplace=True)

In [12]:
# Label every (user, date) row with get_label.
df_['label'] = df_.apply(get_label, axis=1)
# Drop the rows get_label marked with the -999 sentinel; df_ itself keeps them.
train = df_[df_['label']!=-999]

In [13]:
# Construct training and test sets
# Only the key columns and the label are written out.
train[['user_id','date','label']].to_csv('data/online_trainb.csv',index=False)
# The test rows get a constant placeholder label of -1.
test['label'] = -1
test[['user_id','date','label']].to_csv('data/online_testb.csv',index=False)

### 1. Duration and number of videos watched features

In [14]:
# Per-user, per-day playback totals: summed playtime and number of playback rows.
user_pb = pd.read_csv('data/user_playback_data.csv')
# Constant 1 per row so that summing 'count' yields the number of rows per group.
user_pb['count'] = 1
# Use the string alias 'sum': passing the builtin `sum` to agg() is a deprecated
# alias in recent pandas and emits a FutureWarning.
user_pb_group = user_pb[['user_id','date','playtime','count']].groupby(['user_id','date'],as_index=False).agg('sum')
# Keep only users present in the test frame.
# NOTE(review): the launch data is filtered by test_a users while this uses `test`
# (test_B) — confirm both files cover the same users.
user_pb_group = user_pb_group[user_pb_group['user_id'].isin(test['user_id'])]

In [15]:
# Same-day playback totals (lag 0). pd.merge without `on=` joins on the column
# names both frames share; rows without playback get NaN (how='left').
train = pd.merge(train, user_pb_group, how='left')
# Suffix the merged columns with the lag so later merges can add more of them.
train.rename(columns = {'playtime':'playtime_last'+str(0),'count':'video_count_last'+str(0)},inplace=True)

# Same join and rename for the test frame.
test = pd.merge(test, user_pb_group, how='left')
test.rename(columns = {'playtime':'playtime_last'+str(0),'count':'video_count_last'+str(0)},inplace=True)

In [16]:
# Playback totals from 1..7 days before each row's date.
for i in range(7):
    lag = i + 1
    # Shift a COPY of the playback dates forward by `lag` days so that playback on
    # day d joins onto rows dated d + lag. user_pb_group itself is left untouched,
    # which keeps this cell idempotent (the previous in-place `+ 1` accumulated
    # across iterations and re-runs).
    pb_shifted = user_pb_group.assign(date=user_pb_group['date'] + lag)
    lag_cols = {'playtime': 'playtime_last' + str(lag), 'count': 'video_count_last' + str(lag)}

    train = pd.merge(train, pb_shifted, how='left').rename(columns=lag_cols)
    test = pd.merge(test, pb_shifted, how='left').rename(columns=lag_cols)

In [17]:
# Replace every missing value with 0. This applies to all columns of both frames,
# not only to the playback features merged above.
train = train.fillna(0)
test = test.fillna(0)

In [18]:
# Names of the playback feature columns for lags 0..7, as (playtime, video_count) pairs.
pb_feats = []
for i in range(8):
    pb_feats.extend(['playtime_last{}'.format(i), 'video_count_last{}'.format(i)])

In [19]:
# Persist the key columns plus the playback features for both sets.
train[['user_id','date']+pb_feats].to_csv('features/online_train_pb.csv',index=False)
test[['user_id','date']+pb_feats].to_csv('features/online_test_pb.csv',index=False)

### 2. Personal information features

In [None]:
# User portrait table read from the local data/ directory.
# NOTE(review): the cells of this section carry no execution count (In [None]) —
# confirm they run cleanly from a fresh kernel.
user_trait = pd.read_csv('data/user_portrait_data.csv')

In [None]:
def deal_ram_rom(x):
    """
    Reduce a parsed RAM/ROM field to a single number.

    x: Either a float (the NaN placeholder for a missing value) or a list of
       numeric strings.
    Returns np.nan for a float input, int(x[0]) for a single-element list, and
    the mean of the values otherwise.
    Raises ValueError if an entry is not a plain number.
    """
    # isinstance also accepts numpy float scalars, which subclass float.
    if isinstance(x, float):
        return np.nan
    if len(x) == 1:
        return int(x[0])
    # SECURITY: the entries come from a CSV file, so they are parsed with float()
    # rather than eval(); eval() would execute arbitrary expressions found in the data.
    return np.mean([float(i) for i in x])

In [None]:
# For each of the two device columns:
for i in ['device_ram','device_rom']:
    # 'ls_<col>': the string split on ';' into a list; non-string values become NaN.
    user_trait['ls_'+i] = user_trait[i].apply(lambda x: x.split(';') if type(x)==str else np.nan)
    # '<col>_new': that list reduced to one number by deal_ram_rom.
    user_trait[i+'_new'] = user_trait['ls_'+i].apply(lambda x: deal_ram_rom(x))

In [None]:
# Portrait columns exported as features; the two '*_new' columns are the numeric RAM/ROM values.
trait_feats = ['device_type','sex','age','education','occupation_status','device_ram_new','device_rom_new']

In [None]:
# Persist user_id plus the selected portrait features.
user_trait[['user_id']+trait_feats].to_csv('features/user_trait_feature.csv',index=False)

### 3. Time interval since the last login

In [20]:
def get_last_diff(row):
    """
    Days between row['date'] and the most recent login on or before it.

    row: A row with 'date_list' (iterable of login dates) and 'date'.
    Returns np.nan when no login falls on or before row['date'].
    """
    logins = row['date_list']
    today = row['date']

    # Track the latest login that is not after `today`.
    latest = None
    for day in logins:
        if day <= today and (latest is None or day > latest):
            latest = day

    if latest is None:
        return np.nan
    return today - latest

In [21]:
# df_group's 'date' column holds each user's list of login dates; rename it to
# 'date_list', the name the row-wise helpers below read.
df_group.rename(columns={'date':'date_list'},inplace=True)

In [22]:
# Attach each test user's login-date list.
# NOTE(review): with how='left', test users absent from df_group get NaN in
# 'date_list', which the helpers that iterate over it cannot handle — confirm
# every test user has login records.
test = pd.merge(test,df_group[['user_id','date_list']],how='left')

In [23]:
# 'diff_near': days since the most recent login on or before the row's date (NaN if none).
train['diff_near'] = train.apply(get_last_diff, axis=1)
test['diff_near'] = test.apply(get_last_diff, axis=1)

### 4. Whether logged in on the day

In [25]:
def is_launch(row):
    """Return 1 when row['date'] is contained in row['date_list'], otherwise 0."""
    return int(row['date'] in row['date_list'])

In [26]:
# 'is_launch': 1 if the row's date is one of the user's login dates, else 0.
train['is_launch'] = train.apply(is_launch, axis=1)
test['is_launch'] = test.apply(is_launch, axis=1)

### 5. Login type

In [27]:
# NOTE(review): `stats` is not referenced anywhere in the visible notebook.
from scipy import stats
# Re-read the raw launch log with all of its columns and keep only test_a users.
launch = pd.read_csv('data/app_launch_logs.csv')
launch = launch[launch['user_id'].isin(test_a['user_id'])]

In [28]:
# One row per (user_id, date); every remaining column is collected into a list.
launch_type = launch.groupby(['user_id','date'],as_index=False).agg(list)

In [29]:
# Number of launch records in each (user, date) group.
launch_type['len'] = launch_type['launch_type'].apply(lambda x: len(x))

In [30]:
def encode_launch_type(row):
    """
    Collapse the launch types recorded for one (user, date) group into one code.

    row: A row with 'launch_type' (list of recorded types) and 'len' (its length).
    Returns 2 when the group has exactly two records, otherwise the first
    recorded type.
    """
    n_records = row['len']
    recorded = row['launch_type']

    if n_records != 2:
        return recorded[0]
    return 2

In [31]:
# Single launch-type code per (user, date) group, computed by encode_launch_type.
launch_type['launch_type_new'] = launch_type.apply(encode_launch_type, axis=1)

In [32]:
# Shift every code up by one — presumably to reserve 0 for "no launch"; confirm.
launch_type['launch_type_new'] = launch_type['launch_type_new']+1
# Fill any missing code in launch_type with 0.
launch_type['launch_type_new'] = launch_type['launch_type_new'].fillna(0)

In [33]:
# Attach the launch-type code to each (user_id, date) row.
# NOTE(review): how='left' leaves NaN for rows with no launch record on that date,
# and the fillna(0) on launch_type ran before this merge, so those NaNs are not
# replaced by 0 — confirm this is intended.
train = pd.merge(train, launch_type[['user_id','date','launch_type_new']], how='left')
test = pd.merge(test, launch_type[['user_id','date','launch_type_new']], how='left')

### 6. Total historical number of logins

In [34]:
def GetLaunchNum(row):
    """Count the entries of row['date_list'] that fall on or before row['date']."""
    cutoff = row['date']
    logins = row['date_list']

    return len([day for day in logins if day <= cutoff])

In [35]:
# 'launchNum': number of login dates on or before the row's date.
train['launchNum'] = train.apply(GetLaunchNum, axis=1)
test['launchNum'] = test.apply(GetLaunchNum, axis=1)

### 7. Number of logins in the past week

In [36]:
def GetNumLastWeek(row):
    """
    Count the entries of row['date_list'] within the 7 days ending on row['date'],
    i.e. dates d with date - 7 < d <= date.
    """
    cutoff = row['date']
    logins = row['date_list']

    return len([day for day in logins if cutoff - 7 < day <= cutoff])

In [37]:
# 'NumLastWeek': number of login dates in the 7 days ending on the row's date.
train['NumLastWeek'] = train.apply(GetNumLastWeek, axis=1)
test['NumLastWeek'] = test.apply(GetNumLastWeek, axis=1)

### 8. Median label of the previous month and mean label of the previous four weeks

In [38]:
# Sort by user and date so the per-user lists built below are in date order.
train = train.sort_values(['user_id','date']).reset_index(drop=True)
# Per user: the list of labelled dates and the list of their labels, in the same order.
train_sta = train[['user_id','date','label']].groupby('user_id',as_index=False).agg(list)
train_sta.columns = ['user_id','date_all_list','label_list']
# Attach both lists to every row of that user.
train = pd.merge(train,train_sta,how='left')

In [39]:
# Same per-user date/label lists for the test rows, built from df_.
test = test.sort_values(['user_id','date']).reset_index(drop=True)
df_ = df_.sort_values(['user_id','date']).reset_index(drop=True)
# NOTE(review): df_ was never filtered, so rows labelled with the -999 sentinel are
# still included in these lists (train was built from the filtered copy) — confirm
# whether sentinel values should enter the test history features.
df_sta = df_[['user_id','date','label']].groupby('user_id',as_index=False).agg(list)
df_sta.columns = ['user_id','date_all_list','label_list']
test = pd.merge(test,df_sta,how='left')

In [40]:
def get_his_label(row):
    """
    Return the leading labels of row['label_list'] whose dates lie at least
    7 days before row['date'].

    The number of dates d in row['date_all_list'] with d + 7 <= row['date'] is
    counted, and that many leading entries of row['label_list'] are returned.
    """
    cutoff = row['date']
    labelled_dates = row['date_all_list']
    labels = row['label_list']

    n_known = sum(1 for day in labelled_dates if day + 7 <= cutoff)
    return labels[:n_known]

In [41]:
# Labels of this user's dates that lie at least 7 days before the row's date.
train['label_his_list'] = train.apply(get_his_label, axis=1)
# Median of the last 30 history labels (fewer if the history is shorter).
train['preds_median_30'] = train['label_his_list'].apply(lambda x: np.median(x[-30:]))
# Mean of the history labels at positions -1, -8, -15 and -22, using only the
# positions that exist for this history length.
train['preds_mean_4'] = train['label_his_list'].apply(lambda x: np.mean([x[i] for i in range(-1*len(x),0) if i==-1 or i==-8 or i==-15 or i==-22]))

In [42]:
# Labels of this user's dates that lie at least 7 days before the row's date.
test['label_his_list'] = test.apply(get_his_label, axis=1)
# Median of the last 30 history labels (fewer if the history is shorter).
test['preds_median_30'] = test['label_his_list'].apply(lambda x: np.median(x[-30:]))
# Mean of the history labels at positions -1, -8, -15 and -22, using only the
# positions that exist for this history length.
test['preds_mean_4'] = test['label_his_list'].apply(lambda x: np.mean([x[i] for i in range(-1*len(x),0) if i==-1 or i==-8 or i==-15 or i==-22]))

In [43]:
def Get_mean_4_weighted(row):
    """
    Weighted sum of the history labels at positions -1, -8, -15 and -22 of
    row['label_his_list'], with weights 0.4, 0.3, 0.2 and 0.1.

    Positions the history is too short to contain are left out; the weights
    are not rescaled in that case. An empty history yields 0.
    """
    history = row['label_his_list']
    if len(history) < 1:
        return 0

    # (position counted from the end, weight), most recent first.
    lagged_weights = ((1, 0.4), (8, 0.3), (15, 0.2), (22, 0.1))
    terms = [history[-lag] * w for lag, w in lagged_weights if len(history) >= lag]

    # Add the terms left to right, starting from the most recent one.
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    return total

In [44]:
# Weighted sum of the history labels at positions -1/-8/-15/-22 (see Get_mean_4_weighted).
train['preds_mean_4_weighted'] = train.apply(Get_mean_4_weighted,axis=1)

In [45]:
# Weighted sum of the history labels at positions -1/-8/-15/-22 (see Get_mean_4_weighted).
test['preds_mean_4_weighted'] = test.apply(Get_mean_4_weighted,axis=1)

### 9. Weighted median

In [46]:
import sys
sys.path.append('wquantiles-0.6/')
import wquantiles

In [47]:
def GetWeightedMedian(row):
    """
    Weighted median of the last 30 entries of row['label_his_list'].

    The weights rise linearly from 1/30 for the oldest of the 30 positions to
    30/30 for the most recent one. Histories shorter than 30 entries are
    padded at the front with zeros.
    """
    window = 30
    recent = row['label_his_list'][-window:]
    weight = np.array([(pos + 1) / window for pos in range(window)])

    # Front-pad short histories with zeros so values and weights line up.
    if len(recent) < window:
        recent = [0] * (window - len(recent)) + recent

    return wquantiles.median(np.array(recent), weight)

In [48]:
# 'weighted_median': weighted median of the last 30 history labels (see GetWeightedMedian).
train['weighted_median'] = train.apply(GetWeightedMedian, axis=1)
test['weighted_median'] = test.apply(GetWeightedMedian, axis=1)

In [49]:
# Login-derived feature columns (sections 3-9 of this notebook) to export.
launch_feats = ['diff_near','is_launch','launch_type_new','launchNum','NumLastWeek','preds_median_30',
                'preds_mean_4','preds_mean_4_weighted','weighted_median']

In [50]:
# Persist the key columns plus the login-derived features for both sets.
train[['user_id','date']+launch_feats].to_csv('features/launch_online_train.csv',index=False)
test[['user_id','date']+launch_feats].to_csv('features/launch_online_test.csv',index=False)