In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import catboost as cat
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
def score(res,label,pre):
    """Return 1 minus the mean absolute error between two columns, scaled by 7.

    Side effect: the per-row scaled absolute error is written to ``res['diff']``.

    Args:
        res: DataFrame that holds both columns.
        label: name of the ground-truth column.
        pre: name of the prediction column.
    """
    scaled_error = (res[pre] - res[label]) / 7
    res['diff'] = scaled_error.abs()
    total_error = sum(res['diff'])
    return 1 - total_error / len(res)

def get_label(row):
    """Count the distinct launch days in the 7 days after ``row['date']``.

    Returns -999 as a "no label" sentinel when the window is not usable:

    * ``date + 7 <= date_max``: the count is returned directly.
    * otherwise, ``date > 153``: the count is returned only when the
      module-level ``user_enddate`` has an entry for the user that is
      ``>= date + 7``; else -999.
    * otherwise, ``date < 130``: -999.
    * otherwise (130 <= date <= 153): the count is returned.
    """
    day = row['date']
    launch_days = row['date_list']

    def launches_in_next_week():
        # Distinct days x with day < x < day + 8.
        return sum([1 for x in set(launch_days) if day < x < day+8])

    if day + 7 <= row['date_max']:
        return launches_in_next_week()

    if day > 153:
        uid = row['user_id']
        if uid not in user_enddate or user_enddate[uid] < day + 7:
            return -999
        return launches_in_next_week()

    if day < 130:
        return -999
    return launches_in_next_week()
        
def extend_list(row):
    """Build the list of candidate reference dates for one user.

    The candidates run from ``date_min - 1`` up to (but excluding)
    ``last - 6``, where ``last`` is the user's entry in the module-level
    ``user_enddate`` when there is one and ``row['date_max']`` otherwise,
    plus every day in the module-level ``interval``. Duplicates are removed
    through a set, so the order of the result is the set's iteration order.
    """
    uid = row['user_id']
    first_day = row['date_min']
    last_day = user_enddate[uid] if uid in user_enddate else row['date_max']
    candidate_days = list(range(first_day - 1, last_day - 6)) + interval
    return list(set(candidate_days))
    
    
def get_date_list(row):
    """Return the entries of ``row['date_list']`` that are <= ``row['date']``, in order."""
    cutoff = row['date']
    return list(filter(lambda day: day <= cutoff, row['date_list']))

In [3]:
# Days 131..153 inclusive; extend_list adds these to every user's candidate dates.
interval = list(range(131, 154))

### Handle B dataset

In [4]:
# App launch log.
df = pd.read_csv('data/app_launch_logs.csv')
# Test set B, read once; its `user_id` / `end_date` columns are used below.
test_a = pd.read_csv('data/test_B数据集.csv')

# `test` is the frame that features and predictions are attached to. It is
# derived from `test_a` instead of re-reading the same file, and `end_date` is
# renamed by name so a change in the CSV column order cannot mislabel columns.
test = test_a.rename(columns={'end_date': 'date'})

In [5]:
# Reduce the launch log to sorted, de-duplicated (user_id, date) pairs ...
df = (
    df.sort_values(['user_id','date'])
      .reset_index(drop=True)[['user_id','date']]
      .drop_duplicates()
      .reset_index(drop=True)
)
# ... and keep only the users that appear in the test set.
is_test_user = df['user_id'].isin(test_a['user_id'])
df = df[is_test_user]

In [6]:
# One row per user; `date` becomes the list of that user's launch days.
df_group = df.groupby('user_id').agg(list).reset_index()
df_group['date_max'] = df_group['date'].map(max)
df_group['date_min'] = df_group['date'].map(min)
# user_id -> end_date lookup built from the test set.
user_enddate = dict(zip(test_a['user_id'], test_a['end_date']))

In [7]:
# Row-wise: the candidate reference dates for each user (see extend_list).
candidate_dates = df_group.apply(extend_list, axis=1)
df_group['date_all'] = candidate_dates

In [8]:
# One row per (user, candidate date). After the rename, `date_list` holds the
# user's launch days and `date` holds the exploded candidate date.
df_ = (
    df_group
    .explode('date_all')
    .rename(columns={'date':'date_list','date_all':'date'})
)

In [9]:
# Label every (user, date) row; get_label returns -999 where no label applies.
labels = df_.apply(get_label, axis=1)
df_['label'] = labels

In [10]:
# Drop the rows carrying the -999 sentinel and renumber the index.
train = df_.loc[df_['label'] != -999].reset_index(drop=True)

In [11]:
# Attach the precomputed `pb` feature tables. No `on=` is given, so pandas
# joins on every column name the two frames have in common.
train_pb = pd.read_csv('features/online_train_pb.csv')
train = train.merge(train_pb, how='left')

test_pb = pd.read_csv('features/online_test_pb.csv')
test = test.merge(test_pb, how='left')

In [12]:
# The same user-trait table is left-joined onto both frames (join keys are the
# shared column names, since no `on=` is given).
user_trait = pd.read_csv('features/user_trait_feature.csv')

train = train.merge(user_trait, how='left')
test = test.merge(user_trait, how='left')

In [13]:
# Attach the precomputed launch feature tables (left join on shared columns).
launch_train = pd.read_csv('features/launch_online_train.csv')
train = train.merge(launch_train, how='left')

launch_test = pd.read_csv('features/launch_online_test.csv')
test = test.merge(launch_test, how='left')

In [14]:
def _week_slot(day):
    """Map a day number to its position (1..7) in a 7-day cycle anchored at day 130."""
    return (day - 130) % 7 + 1


train['week'] = train['date'].map(_week_slot)
test['week'] = test['date'].map(_week_slot)

In [29]:
# Playback log restricted to test users; `user_list` is the set of test users
# that have at least one playback row.
playback = pd.read_csv('data/user_playback_data.csv')
in_test = playback['user_id'].isin(test['user_id'])
playback = playback.loc[in_test]
user_list = set(playback['user_id'])

In [31]:
# Model input columns, in the order they are fed to the models.
feats = (
    # playtime_last0, video_count_last0, ..., playtime_last7, video_count_last7
    [f'{metric}_last{k}' for k in range(8) for metric in ('playtime', 'video_count')]
    + ['device_type', 'sex', 'age', 'education',
       'occupation_status', 'device_ram_new', 'device_rom_new',
       'diff_near', 'is_launch', 'launch_type_new',
       'launchNum', 'NumLastWeek', 'preds_median_30',
       'preds_mean_4', 'preds_mean_4_weighted', 'weighted_median', 'week']
)

In [32]:
# Sanity check: number of model input columns.
len(feats)

33

In [33]:
# Columns treated as categorical: filled/cast to int, passed to LightGBM and
# CatBoost as categorical features, and target-encoded for XGBoost.
cat_list = [
    'device_type',
    'sex',
    'age',
    'education',
    'occupation_status',
    'week',
    'is_launch',
    'launch_type_new',
]

In [34]:
# Keep every row with a positive label and a reproducible 70% sample of the
# zero-label rows.
is_positive = train['label'] > 0
is_zero = train['label'] == 0
train_pos = train[is_positive]
train_neg = train[is_zero]
train_neg_new = (
    train_neg
    .sample(frac=0.7, random_state=2)
    .reset_index(drop=True)
)

In [35]:
# Recombine, shuffle with a fixed seed, and renumber the index.
train = (
    pd.concat([train_pos, train_neg_new])
      .sample(frac=1, random_state=2)
      .reset_index(drop=True)
)

In [36]:
# Categorical columns: missing values become 0, then the column is cast to int,
# in both the training and the test frame.
for each_feat in cat_list:
    for frame in (train, test):
        frame[each_feat] = frame[each_feat].fillna(0).astype(int)

In [37]:
# Target-encode each categorical column with the median training label of its
# category, stored in a new `<column>_label` column.
global_label_median = train['label'].median()

for each_feat in cat_list:
    df_tmp = train.groupby(each_feat,as_index=False)['label'].median()
    dict_tmp = dict(zip(df_tmp[each_feat],df_tmp['label']))

    train[each_feat+'_label'] = train[each_feat].map(dict_tmp)
    # A category present in `test` but absent from `train` has no median. A
    # plain `dict_tmp[x]` lookup raises KeyError in that case, so map (which
    # yields NaN for unknown keys) and fall back to the overall training median.
    test[each_feat+'_label'] = test[each_feat].map(dict_tmp).fillna(global_label_median)

In [38]:
# Names of the target-encoded companions of the categorical columns.
cat_list_new = [f'{name}_label' for name in cat_list]

### LightGBM

In [39]:
# NOTE(review): LightGBM documents that bagging (`subsample`) only takes effect
# when `subsample_freq` is non-zero; it is not set here, so `subsample=0.8`
# looks like a no-op. Confirm before changing it - enabling bagging alters the model.
lgb_params = dict(
    objective='regression',
    max_depth=5,
    num_leaves=32,
    learning_rate=0.01,
    n_estimators=2500,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=2021,
    subsample=0.8,
    min_child_samples=500,
)
clf = lgbm.LGBMRegressor(**lgb_params)
# The raw categorical columns are declared to LightGBM as categorical features.
clf.fit(train[feats], train['label'], categorical_feature=cat_list)
test['pre_lgb'] = clf.predict(test[feats])

In [40]:
# Clamp the LightGBM predictions to [0, 7].
test['pre_lgb'] = test['pre_lgb'].clip(lower=0, upper=7)

### CatBoost

In [41]:
cbt_params = dict(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    verbose=False,
    random_seed=2021,
)
cbt = cat.CatBoostRegressor(**cbt_params)

# Same inputs as LightGBM; the raw categorical columns are passed as cat_features.
cbt.fit(train[feats], train['label'], cat_features=cat_list)
test['pre_cat'] = cbt.predict(test[feats])

In [42]:
# Clamp the CatBoost predictions to [0, 7].
test['pre_cat'] = test['pre_cat'].clip(lower=0, upper=7)

### XGBoost

In [43]:
# XGBoost inputs: every feature plus the `_label` encodings, with the raw
# categorical columns themselves left out.
raw_categoricals = set(cat_list)
new_feats = [col for col in feats + cat_list_new if col not in raw_categoricals]

In [44]:
xgb_params = dict(
    max_depth=5,
    n_estimators=200,
    learning_rate=0.15,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.2,
    base_score=0,
    min_child_weight=5,
)
clf_xgb = xgb.XGBRegressor(**xgb_params)

In [45]:
# Train on the target-encoded feature set and score the test frame.
clf_xgb.fit(train[new_feats], train['label'])
xgb_pred = clf_xgb.predict(test[new_feats])
test['pre_xgb'] = xgb_pred

In [46]:
def _clamp_0_7(value):
    """Clamp a single prediction to [0, 7]; values inside the range pass through."""
    if value < 0:
        return 0
    return 7 if value > 7 else value


# Kept as an element-wise pass (rather than Series.clip) so the resulting
# column is produced exactly as before.
test['pre_xgb'] = test['pre_xgb'].apply(_clamp_0_7)

In [47]:
# Put each test user's last launch day (`date_max`) next to the test `date`;
# `diff_date` is the difference between the two.
last_launch = df_group[['user_id','date_max']]
res = test.merge(last_launch, how='left')
res['diff_date'] = res['date'].sub(res['date_max'])

### TensorFlow Decision Forests

In [48]:
# Predictions loaded from file; the two columns are renamed positionally to
# (user_id, pre_gbdt) and left-joined onto `res`.
tfdf = pd.read_csv('res/tfdf_gbt_online.csv')
tfdf.columns = ['user_id','pre_gbdt']
res = res.merge(tfdf, how='left')
# Clamp to [0, 7], like the other models' predictions.
res['pre_gbdt'] = res['pre_gbdt'].clip(lower=0, upper=7)

### Post-processing (set the prediction to 0 if the user has had no launch activity for 30 or more days, and also if the predicted value is below 0.5 and the user has no playback records)

In [49]:
def revise_pre(row, col='pre_gbdt'):
    """Zero out a weak prediction for a user that has no playback record.

    This single definition replaces four copies that differed only in the
    column name and shadowed one another.

    Args:
        row: one row of ``res``; must provide ``user_id`` and ``col``.
        col: prediction column to revise. Defaults to ``'pre_gbdt'``, the
            column the last of the former per-column definitions operated on,
            so ``revise_pre(row)`` keeps its previous meaning.

    Returns:
        0 when the user is not in the module-level ``user_list`` and the
        prediction is below 0.5; otherwise the prediction unchanged.
    """
    if row['user_id'] not in user_list and row[col] < 0.5:
        return 0
    return row[col]


# The same two rules are applied to every model's prediction column.
for pred_col in ('pre_lgb', 'pre_cat', 'pre_xgb', 'pre_gbdt'):
    # Rule 1: 30 or more days between the last launch and the test date.
    res.loc[res['diff_date']>=30, pred_col] = 0
    # Rule 2: no playback record and a prediction under 0.5.
    res[pred_col] = res.apply(revise_pre, axis=1, args=(pred_col,))

In [53]:
# Harmonic mean of the four model predictions. `eps` is added to each
# prediction so that a prediction of exactly 0 does not divide by zero.
eps = 0.00001
model_cols = ['pre_xgb', 'pre_lgb', 'pre_cat', 'pre_gbdt']
reciprocal_sum = sum(1/(res[col]+eps) for col in model_cols)
res['pre_avg'] = len(model_cols) / reciprocal_sum

In [54]:
# Snap the extremes of the blend: above 6.5 becomes 7, below 0.4 becomes 0.
res.loc[res['pre_avg'] > 6.5, 'pre_avg'] = 7
res.loc[res['pre_avg'] < 0.4, 'pre_avg'] = 0

### Correct label 0 using the classification results of CNN

In [55]:
# Scores loaded from file; columns are renamed positionally to (user_id, cnn_zero).
zero = pd.read_csv('res/binary_zero.csv')
zero.columns = ['user_id','cnn_zero']
test = test.merge(zero, how='left')
# Rank 1 is the highest score. Users on rows ranked 4500 or better get a
# blended prediction of 0.
test['rank0'] = test['cnn_zero'].rank(ascending=False)
res_0 = test.loc[test['rank0'] <= 4500]
userid_0 = res_0['user_id']
is_cnn_zero = res['user_id'].isin(userid_0)
res.loc[is_cnn_zero, 'pre_avg'] = 0

### Correct label 7 using the classification results of CNN

In [56]:
# Scores loaded from file; columns are renamed positionally to (user_id, cnn_binary).
cnn = pd.read_csv('res/binary_seven.csv')
cnn.columns = ['user_id','cnn_binary']
test = test.merge(cnn, how='left')
# Rank 1 is the highest score. Users on rows ranked 1200 or better get a
# blended prediction of 7 (this runs after, and so overrides, the 0 correction).
test['rank'] = test['cnn_binary'].rank(ascending=False)
res_7 = test[test['rank'] <= 1200]
userid_7 = res_7['user_id']
is_cnn_seven = res['user_id'].isin(userid_7)
res.loc[is_cnn_seven, 'pre_avg'] = 7

In [57]:
# Round each blended prediction to two decimals with Python's built-in round.
res['pre_avg'] = res['pre_avg'].map(lambda value: round(value, 2))

In [58]:
# Anything below 0.5 becomes 0.
res.loc[res['pre_avg'] < 0.5, 'pre_avg'] = 0

# Predictions strictly inside an open band (low, high) are snapped to `target`;
# values that fall between bands are left as they are.
snap_bands = [
    (0.6, 1.4, 1),
    (1.55, 2.4, 2),
    (2.55, 3.4, 3),
    (3.55, 4.4, 4),
    (4.55, 5.2, 5),
    (5.55, 6.2, 6),
]
for low, high, target in snap_bands:
    in_band = (res['pre_avg'] > low) & (res['pre_avg'] < high)
    res.loc[in_band, 'pre_avg'] = target

In [59]:
# Write (user_id, pre_avg) without header or index; floats use two decimals.
submission = res[['user_id','pre_avg']]
submission.to_csv(
    'res/submission.csv',
    index=False,
    header=False,
    float_format="%.2f",
)