In [1]:
import pandas as pd
import pyarrow.parquet as pq
def load_data(file_path):
    """Read the Parquet file at ``file_path`` and return its contents as a pandas DataFrame."""
    return pq.read_table(file_path).to_pandas()

import numpy as np
from collections import defaultdict
from sklearn.metrics import roc_auc_score


def gauc(labels, preds, uids):
    """Calculate group AUC: the per-group AUC averaged with group-size weights.

    Samples are bucketed by ``uids``. A group whose labels are all identical is
    skipped (``label_with_xor`` rejects it); every other group contributes its
    ``roc_auc_score`` multiplied by the number of samples in the group.

    :param labels: list of ground-truth labels
    :param preds: list of predicted scores, aligned with ``labels``
    :param uids: list of group ids, aligned with ``labels``
    :return: float rounded to 6 decimals; ``0.0`` when no group has mixed labels
    >>> gauc([1,1,0,0,1], [0, 0,1,0,1], ['a', 'a','a', 'b', 'b'])
    0.4
    >>> gauc([1,1,0,0,1], [1,1,0,0,1], ['a', 'a','a', 'b', 'b'])
    1.0
    >>> gauc([1,1,1,0,0], [1,1,0,0,1], ['a', 'a','a', 'b', 'b'])
    0.0
    >>> gauc([1,1,1,0,1], [1,1,0,0,1], ['a', 'a','a', 'b', 'b'])
    1.0
    """
    assert len(uids) == len(labels)
    assert len(uids) == len(preds)
    group_score = defaultdict(list)
    group_truth = defaultdict(list)
    for idx, truth in enumerate(labels):
        uid = uids[idx]
        group_score[uid].append(preds[idx])
        group_truth[uid].append(truth)

    total_auc = 0.0
    impression_total = 0
    for user_id, truths in group_truth.items():
        # Single-class groups are skipped rather than scored.
        if not label_with_xor(truths):
            continue
        group_size = len(truths)
        group_auc = roc_auc_score(np.asarray(truths),
                                  np.asarray(group_score[user_id]))
        total_auc += group_auc * group_size
        impression_total += group_size

    # Always return a float: the previous int 0 fallback contradicted the
    # doctest above and broke callers that split str(result) on '.'.
    if not impression_total:
        return 0.0
    return round(float(total_auc) / impression_total, 6)


def label_with_xor(lists):
    """Tell whether ``lists`` contains at least two different values.

    >>> label_with_xor([1,1,1])
    False
    >>> label_with_xor([0,0,0])
    False
    >>> label_with_xor([0,])
    False
    >>> label_with_xor([1,])
    False
    >>> label_with_xor([0,1])
    True
    """
    if not lists:
        return False
    head = lists[0]
    # True as soon as any later element differs from the first one.
    return any(lists[pos] != head for pos in range(1, len(lists)))


def auc(y_true, y_score):
    '''Thin wrapper that forwards its arguments to ``roc_auc_score``.

    :param y_true: shape = [n_samples] or [n_samples, n_classes]
    :param y_score: shape = [n_samples] or [n_samples, n_classes]
    :return: whatever ``roc_auc_score(y_true, y_score)`` returns
    '''
    return roc_auc_score(y_true, y_score)

import gc
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns (signed or unsigned) are cast to the smallest signed
    integer type whose range contains the column's min and max; float columns
    are cast to the smallest of float16/float32/float64 whose range contains
    them. A column is only ever made smaller, never widened. Columns of any
    other dtype (bool, object/str, datetime, category, pandas extension
    dtypes) are left untouched, as are columns whose min/max is NaN.

    NOTE: float16 keeps only about three significant decimal digits; the
    float16 target is kept because the original implementation used it.

    :param df: DataFrame to shrink; it is modified in place
    :return: the same ``df`` object
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32, np.float64)
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy integer/float dtypes are safe to range-check and cast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            continue
        if col_type.kind in 'iu':
            # Python ints compare exactly, even for uint64 values.
            c_min, c_max = int(c_min), int(c_max)
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo
        for target in targets:
            limits = limits_of(target)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if c_min >= limits.min and c_max <= limits.max:
                if np.dtype(target).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

In [2]:
# Count (frequency) features for categorical variables
def get_count_feat(data,cols,feature):
    """Add one ``count_*`` column per entry of ``cols`` and record its name.

    A plain entry produces ``count_<col>``: the per-group count of that column
    itself. A list entry produces ``count_<a>_<b>...``: the per-group count of
    the ``'index'`` column within each key combination, so ``data`` must
    contain an ``'index'`` column for list entries.

    :param data: DataFrame to extend (modified in place)
    :param cols: iterable of column names and/or lists of column names
    :param feature: list that receives the new column names
    :return: ``(data, feature)``
    """
    for keys in cols:
        if isinstance(keys, list):
            new_name = 'count_{}'.format('_'.join(keys))
            counted = 'index'
        else:
            new_name = 'count_{}'.format(keys)
            counted = keys
        data[new_name] = data.groupby(keys)[counted].transform('count')
        feature.append(new_name)
    return data, feature

In [3]:
def reshape_user_info(data):
    """Explode bracketed, comma-separated sequences into one row per pair.

    For every row, column 0 is an id string and column 1 is a string such as
    ``"[t1,v1,t2,v2]"``. After stripping the brackets and splitting on commas,
    tokens at even positions go to ``timestamp`` and tokens at odd positions
    go to ``vid``; the stripped id is repeated for each even-position token.
    Rows that split into a single token are ignored.

    :param data: DataFrame whose first two columns hold the id and the sequence
    :return: new DataFrame with string columns ``did``, ``vid``, ``timestamp``
    """
    dids, stamps, vids = [], [], []
    for row in data.values:
        tokens = row[1].replace('[', '').replace(']', '').split(',')
        if len(tokens) == 1:
            continue
        owner = row[0].strip()
        for position, token in enumerate(tokens):
            cleaned = token.strip()
            if position % 2 == 0:
                dids.append(owner)
                stamps.append(cleaned)
            else:
                vids.append(cleaned)
    result = pd.DataFrame()
    result['did'] = dids
    result['vid'] = vids
    result['timestamp'] = stamps
    return result

In [4]:
%mkdir -p ../input/all/w2v

In [5]:
from gensim.models import Word2Vec
import os
import numpy as np
def get_word2vec_feature(seq,emb,feat,ikx,ext='',feature=[]):
    """Return a Word2Vec model for ``seq``, loading it from disk when cached.

    The model file is ``../input/all/w2v/w2v_model_<feat joined by '_'>_<ext>.model``.
    If that file exists it is loaded; otherwise a model is trained on ``seq``
    (every token converted with ``str``) and saved to that path.

    :param seq: iterable of token sequences
    :param emb: embedding size passed to ``Word2Vec`` as ``size``
    :param feat: list of strings used to build the model file name
    :param ikx: not used by the current implementation
    :param ext: suffix used in the model file name
    :param feature: not used by the current implementation
    :return: the loaded or freshly trained ``Word2Vec`` model
    """
    corpus = [[str(token) for token in tokens] for tokens in seq]
    model_path = '../input/all/w2v/w2v_model_{}_{}.model'.format('_'.join(feat), ext)
    if not os.path.exists(model_path):
        trained = Word2Vec(corpus, size=emb, window=5, min_count=1, workers=10, iter=10, sg=1, seed=42)
        trained.save(model_path)
        return trained
    return Word2Vec.load(model_path)

In [6]:
def make_seq(data,col1,col2):
    """Group ``data`` by ``col1`` and return the ``col2`` values of each group.

    :return: list of lists, one inner list per group, in groupby key order
    """
    per_group = data.groupby(col1)[col2].apply(list)
    return per_group.reset_index()[col2].values.tolist()

In [7]:
def get_watch_time(data,feature):
    """Compute, per ``did``, the mean gap between timestamps 1 to 5 rows apart.

    The caller's ``timestamp`` column is cast to int in place. A copy sorted by
    timestamp then gets ``time_<k>`` columns (k = 1..5) holding each row's
    timestamp minus the timestamp ``k`` rows earlier within the same ``did``.
    Each ``time_<k>`` is averaged per ``did`` and the averages are merged.

    :param data: DataFrame with ``did`` and ``timestamp`` columns
    :param feature: list that receives ``'time_1'`` .. ``'time_5'``
    :return: ``(per_did, feature)``; ``per_did`` has ``did`` plus the five means
    """
    data['timestamp'] = data['timestamp'].astype(int)
    ordered = data.sort_values('timestamp')
    per_did = None
    for lag in range(1, 6):
        gap_col = 'time_{}'.format(lag)
        ordered[gap_col] = ordered.groupby('did')['timestamp'].shift(lag)
        ordered[gap_col] = ordered['timestamp'] - ordered[gap_col]
        lag_mean = ordered.groupby('did')[gap_col].mean().reset_index()
        if per_did is None:
            per_did = lag_mean
        else:
            per_did = pd.merge(per_did, lag_mean, on=['did'], how='left', copy=False)
        feature.append(gap_col)
    return per_did, feature

In [8]:
def get_embedding_static_feature(data,col,static_feature,feature,log):
    """Attach per-``col`` mean and median of ``static_feature`` columns to ``data``.

    :param data: input DataFrame
    :param col: name of the grouping key column
    :param static_feature: list of columns to aggregate
    :param feature: list that receives the new ``*_mean`` / ``*_median`` names
        when ``log`` is truthy
    :param log: when truthy, append the new column names to ``feature``
    :return: ``(merged, feature)`` where ``merged`` is ``data`` outer-joined
        with the statistics on ``col``
    """
    grouped = data.groupby(col)[static_feature]
    mean_stats = grouped.mean().add_suffix('_mean').reset_index()
    median_stats = grouped.median().add_suffix('_median').reset_index()
    if log:
        for stats in (mean_stats, median_stats):
            # Skip the grouping key itself. The key used to be hard-coded as
            # 'did', so any other `col` leaked into the feature list.
            feature.extend(name for name in stats.columns if name != col)
    stats = pd.merge(mean_stats, median_stats, on=[col], how='outer', copy=False)
    return pd.merge(data, stats, on=[col], how='outer', copy=False), feature

In [9]:
import time
def get_time(timeStamp):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in the process's local time zone."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timeStamp))

In [10]:
def get_new_date(data):
    """Replace every row's ``timestamp`` with the minimum timestamp of its ``did``.

    The ``timestamp`` column is removed from the caller's frame in place; the
    returned frame is the result of a left merge, so ``timestamp`` ends up as
    the last column.

    :param data: DataFrame with ``did`` and ``timestamp`` columns
    :return: merged DataFrame carrying the per-``did`` minimum timestamp
    """
    stamps = data.pop('timestamp')
    earliest = stamps.groupby(data['did']).min().reset_index()
    return pd.merge(data, earliest, on=['did'], how='left', copy=False)

In [11]:
%mkdir -p ../input/all/w2v

In [12]:
from gensim.models import Word2Vec
import os
import numpy as np
def get_w2v_feature(data,col1,col2,emb_size,ext='',feature=None):
    """Build Word2Vec embedding features from ``col2`` sequences grouped by ``col1``.

    ``col2`` is converted to strings, collected into one token list per
    ``col1`` group, and used to train a skip-gram Word2Vec model (or load it
    from ``../input/all/w2v/<col1 joined>_<col2>_feature<ext>.model`` when
    that file exists).

    :param data: DataFrame containing the ``col1`` columns and ``col2``
    :param col1: list of grouping column names
    :param col2: name of the token column
    :param emb_size: embedding dimension
    :param ext: suffix used in the model file name and the column names
    :param feature: list that receives the new column names; a fresh list is
        created when omitted
    :return: ``(group_frame, feature, token_frame)`` where ``group_frame`` has
        one row per ``col1`` group with the mean vector of its tokens, and
        ``token_frame`` has one row per distinct token (as a string in
        ``col2``) with that token's own vector
    """
    # A shared default list would accumulate names across calls.
    if feature is None:
        feature = []
    print('begin train word2vec')
    # astype returns a new frame, so the caller's data is not written through a slice.
    data = data[col1 + [col2]].astype({col2: str})
    tmp = data.groupby(col1)[col2].apply(list).reset_index()
    sentences = tmp[col2].values.tolist()
    print(tmp.head())
    del tmp[col2]
    model_path = '../input/all/w2v/{}_{}_feature{}.model'.format('_'.join(col1),col2,ext)
    if os.path.exists(model_path):
        model = Word2Vec.load(model_path)
    else:
        model = Word2Vec(sentences, size=emb_size, window=10, min_count=1, sg=1, seed=42,iter=10)
        model.save(model_path)
    # Look words up through the KeyedVectors object instead of the deprecated
    # model[w] / `w in model` shortcuts.
    vectors = model.wv
    emb_matrix = []
    emb_dict = {}
    print('begin make feature')
    for seq in sentences:
        vec = []
        for w in seq:
            if w in vectors:
                word_vec = vectors[w]
                vec.append(word_vec)
                emb_dict[w] = word_vec
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    # reshape keeps the matrix 2-D even when there are no rows at all.
    emb_matrix = np.array(emb_matrix).reshape(-1, emb_size)
    for i in range(emb_size):
        tmp['{}_{}_emb_{}{}'.format('_'.join(col1), col2, i, ext)] = emb_matrix[:, i]
        feature.append('{}_{}_emb_{}{}'.format('_'.join(col1), col2, i,ext))
    del model, vectors, emb_matrix, sentences
    data_index = list(emb_dict)
    new_emb_martix = np.array([np.array(emb_dict[v]) for v in data_index]).reshape(-1, emb_size)
    data = pd.DataFrame()
    data[col2] = data_index
    for i in range(emb_size):
        data['{}_emb_{}_{}'.format(col2, i, ext)] = new_emb_martix[:,i]
        feature.append('{}_emb_{}_{}'.format(col2, i, ext))
    return tmp,feature,data

In [13]:
import warnings
# Suppress all warnings from here on.
# NOTE(review): this also hides pandas/gensim deprecation and SettingWithCopy warnings raised by later cells.
warnings.filterwarnings("ignore")

In [14]:
# Input locations. `train_path` is a prefix that the training cell completes with a part number;
# `valid_path` points at part_30, which is loaded as the validation set.
test_path = '../input/eval/'
train_path = '../input/train/part_'
valid_path = '../input/train/part_30/'

In [15]:
print('train data')
# Load every selected training part, join its item table on `vid`, and
# concatenate all parts once at the end instead of growing `train` per iteration.
train_item = pd.DataFrame()  # kept from the original cell; never filled here
train_parts = []
for i in range(29,30):
    part_dir = train_path + '{}/'.format(i)
    tmp = load_data(part_dir + 'context.parquet')
    tmp_item = load_data(part_dir + 'item.parquet')
    # Avoid a name clash with the context table's own `timestamp` column.
    tmp_item.rename(columns={'timestamp':'item_timestamp'},inplace=True)
    train_parts.append(pd.merge(tmp,tmp_item,on=['vid'],how='left',copy=False))
    del tmp,tmp_item
# An empty part range still yields an (empty) DataFrame, as before.
train = pd.concat(train_parts,axis=0,sort=False) if train_parts else pd.DataFrame()
del train_parts

train data


In [16]:
print('test data')
# Load the evaluation context rows and left-join the item table on `vid`.
test = load_data(test_path  + 'context.parquet')
test_item = load_data(test_path  + 'item.parquet')
# Rename so the item's timestamp does not clash with the context `timestamp` column.
test_item.rename(columns={'timestamp':'item_timestamp'},inplace=True)
test = pd.merge(test,test_item,on=['vid'],how='left',copy=False)

test data


In [17]:
print('valid data')
# Load the validation context rows (part_30) and left-join the item table on `vid`.
valid = load_data(valid_path + 'context.parquet')
valid_item = load_data(valid_path + 'item.parquet')
# Rename so the item's timestamp does not clash with the context `timestamp` column.
valid_item.rename(columns={'timestamp':'item_timestamp'},inplace=True)
valid = pd.merge(valid,valid_item,on=['vid'],how='left',copy=False)

valid data


In [18]:
# Replace each row's `timestamp` with the earliest timestamp of its `did`,
# computed separately within each split.
train = get_new_date(train)
valid = get_new_date(valid)
test = get_new_date(test)

In [19]:
# Render the Unix `timestamp` as a 'YYYY-MM-DD HH:MM:SS' string in the kernel's local time zone.
train['date'] = train['timestamp'].apply(lambda x:get_time(x))
valid['date'] = valid['timestamp'].apply(lambda x:get_time(x))
test['date'] = test['timestamp'].apply(lambda x:get_time(x))

In [20]:
import datetime
# Parse the date strings back into datetimes and shift them forward by 8 hours.
# NOTE(review): presumably converts UTC to UTC+8; since `get_time` already uses the
# kernel's local time zone, this is only right when the kernel runs in UTC — confirm.
train['date'] = pd.to_datetime(train['date']) + datetime.timedelta(hours=8)
valid['date'] = pd.to_datetime(valid['date']) + datetime.timedelta(hours=8)
test['date'] = pd.to_datetime(test['date']) + datetime.timedelta(hours=8)

In [21]:
# Day-of-month component of the shifted date.
train['day'] = train['date'].dt.day
valid['day'] = valid['date'].dt.day
test['day'] = test['date'].dt.day

In [22]:
# Hour-of-day component of the shifted date.
train['hour'] = train['date'].dt.hour
valid['hour'] = valid['date'].dt.hour
test['hour'] = test['date'].dt.hour

In [23]:
# Stack train, valid and test (in that order, with a fresh 0..n-1 index) so features are built once for all rows.
# The later split back into the three sets relies on this order and on the row counts staying unchanged.
data = pd.concat([train,valid,test],axis=0,ignore_index=True,sort=False)

In [24]:
# The datetime column is dropped now that `day` and `hour` have been derived from it.
del data['date']

In [25]:
# Convert every `stars` value into a Python list.
# NOTE(review): assumes each value is an iterable of tokens (e.g. an array from parquet) — confirm.
data['stars'] = data['stars'].apply(list)

In [26]:
# Running list of model input column names; later cells append to it.
feature = []

In [27]:
# Train (or load from cache) an 8-dimensional Word2Vec model over the `stars` token lists.
# ['did','stars'] and ext='8' only determine the cached model's file name.
model = get_word2vec_feature(data['stars'].values.tolist(),8,['did','stars'],'stars',ext='8',feature=[])

In [28]:
from tqdm import tqdm

In [None]:
# Average the 8-dimensional Word2Vec vectors of each row's `stars` tokens.
emb_matrix = []
for col in tqdm(data['stars'].values):
    tmp = np.zeros(shape=(8))
    for seq in col:
        # Adding vector / len(col) for every token accumulates the mean; a row with no tokens stays all zeros.
        # NOTE(review): indexing the model object directly depends on the installed gensim version — confirm.
        tmp = tmp + model[str(seq)] / len(col)
    emb_matrix.append(tmp)
emb_matrix = np.array(emb_matrix)

 28%|██▊       | 3265576/11844346 [04:19<10:48, 13224.70it/s]

In [None]:
# Store each of the 8 averaged embedding dimensions as column `did_stars_<i>` and register it as a feature.
for i in range(8):
    data['{}_{}_{}'.format('did','stars',i)] = emb_matrix[:,i]
    feature.append('{}_{}_{}'.format('did','stars',i))

In [None]:
# The raw token lists are dropped once their averaged embeddings are stored.
del data['stars']

In [None]:
# Word2Vec over each did's `vid` sequence: total_feature_1 holds one mean vector per did,
# total_feature_2 holds one vector per distinct vid. Both sets of column names are appended to `feature`.
total_feature_1,feature,total_feature_2 = get_w2v_feature(data,['did'],'vid',8,ext='8',feature=feature)
# total_feature_1 = reduce_mem(total_feature_1)
# Downcast the per-vid embedding table to save memory.
total_feature_2 = reduce_mem(total_feature_2)

In [None]:
# Display the per-did mean-embedding table.
total_feature_1

In [None]:
# Display the per-vid embedding table.
total_feature_2

In [None]:
# get_w2v_feature returns `vid` as strings (the tokens fed to Word2Vec); cast back to int before merging on `vid`.
total_feature_2['vid'] = total_feature_2['vid'].astype(int)

In [None]:
# Display the per-vid embedding table again after the `vid` cast.
total_feature_2

In [None]:
# Only the per-vid embeddings are joined; the per-did merge below is left disabled.
# NOTE(review): the per-did column names were still appended to `feature` by get_w2v_feature,
# although those columns are never merged into `data` — confirm the feature list before training.
# data = pd.merge(data,total_feature_1,how='left',on=['did'],copy=False)
data = pd.merge(data,total_feature_2,how='left',on=['vid'],copy=False)

In [None]:
# Add a `count_<col>` frequency feature for each listed column; the pairwise combinations are left disabled.
col = ['mod', 'mf', 'aver', 'sver', 'did','vid', 'prev', 'region','cid', 'class_id','is_intact', 'second_class',
#       ['hour','vid'],['day','vid'],
#       ['hour','cid'],['day','cid'],
      ]
data,feature = get_count_feat(data,col,feature)

In [None]:
# Display the combined frame after the count features were added.
data

In [None]:
# Register these existing columns as model inputs as they are.
for col in [ 'vid', 'prev','region', 'title_length', 'item_timestamp', 'cid', 'class_id','is_intact', 'second_class', 'duration', 'ctr', 'vv','hour']:
    feature.append(col)

In [None]:
# Difference between the row's `timestamp` (the per-did earliest one after get_new_date)
# and the item's own timestamp. The previous chained assignment (`a = b = c`) overwrote
# data['timestamp'] with item_timestamp and made this feature a plain copy of it.
data['timestamp_item_timestamp'] = data['timestamp'] - data['item_timestamp']

In [None]:
# Register the timestamp/item-timestamp column as a model input.
for col in ['timestamp_item_timestamp']:
    feature.append(col)

In [None]:
# Display the feature list built so far (`featureture` was a typo that raised NameError).
feature

In [None]:
# for cross_feat in [['did','vid'],['did','cid'],['did','class_id'],['did','is_intact'],['did','second_class']]:
#     tmp = data.groupby(cross_feat[0])[cross_feat[1]].nunique().reset_index().rename(columns={cross_feat[1]:'{}_{}_nunique'.format(cross_feat[0],cross_feat[1])})
#     data = pd.merge(data,tmp,on=cross_feat[0],how='left',copy=False)
#     feature.append('{}_{}_nunique'.format(cross_feat[0],cross_feat[1]))
    
# #     tmp = data.groupby(cross_feat[1])[cross_feat[0]].nunique().reset_index().rename(columns={cross_feat[0]:'{}_{}_nunique'.format(cross_feat[1],cross_feat[0])})
# #     data = pd.merge(data,tmp,on=cross_feat[1],how='left',copy=False)
# #     feature.append('{}_{}_nunique'.format(cross_feat[1],cross_feat[0]))
    
#     del tmp

In [None]:
# Replace each of these columns by its integer category code and register it as a model input.
# The codes are computed on the combined frame, so they are shared by train, valid and test.
for cat_f in ['mod', 'mf', 'aver', 'sver']:
    data[cat_f] = data[cat_f].astype("category")
    data[cat_f] = data[cat_f].cat.codes
    feature.append(cat_f)

In [None]:
# Split the engineered frame back into train / valid / test by row position.
# This relies on `data` still having the order and row counts of the earlier pd.concat([train, valid, test]).
train = data[:train.shape[0]]
valid = data[train.shape[0]:(train.shape[0] + valid.shape[0])]
test = data[-test.shape[0]:]

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Display the final list of model input columns.
feature

In [None]:
# Name of the label column used as the training target.
target = 'label'

In [None]:
import lightgbm as lgb
# LightGBM binary classifier trained on the train split and monitored on the valid split (AUC metric).
params = {'num_leaves': 32, # strongly affects the final result; larger is generally better, but too large overfits
          'objective': 'binary', # the objective function
          'max_depth': -1,
          'learning_rate': 0.1,
          "boosting": "gbdt",
          "verbosity": 1,
          'two_round':'true',
          "nthread": -1,               
          'metric': {'auc'},  
          "random_state": 42, 
          }
trn_data = lgb.Dataset(train[feature].values, label=train[target].values)
val_data = lgb.Dataset(valid[feature].values, label=valid[target].values)

# 500 is passed as the number of boosting rounds.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` as train() keywords depend on the installed LightGBM version — confirm.
clf = lgb.train(params,
                trn_data,
                500,
                valid_sets=[trn_data, val_data],
                verbose_eval=50,
                early_stopping_rounds=50)

In [None]:
# Score the validation split at the best iteration and report the group AUC per `did`.
valid_predict = clf.predict(valid[feature].values, num_iteration=clf.best_iteration)
xx_gauc = gauc(list(valid[target].values), list(valid_predict), list(valid['did'].values))
print(xx_gauc)
# The numbers below are presumably GAUC values noted from earlier runs — not produced by this cell.
# 0.604091
# 0.631716
# 0.637493
# 0.661686

In [None]:
# Print the importance array followed by the feature names; the model was trained on
# train[feature].values, so both are in the same order.
print(clf.feature_importance())
print(feature)

In [None]:
# Combine train and valid into one frame for the final fit, then drop the objects that are no longer needed.
for_train = pd.concat([train,valid],axis=0,ignore_index=True,sort=False)
del train,valid,trn_data,val_data

In [None]:
# Refit on train + valid, using the validated model's best iteration as the number of rounds.
# NOTE(review): the only entry in valid_sets is the training data itself, so early_stopping_rounds
# has no held-out metric to monitor here — confirm whether it should be dropped.
trn_data = lgb.Dataset(for_train[feature].values, label=for_train[target].values)
for_clf = lgb.train(params,
                trn_data,
                clf.best_iteration,
                valid_sets=[trn_data],
                verbose_eval=50,
                early_stopping_rounds=50)

In [None]:
# Score the test split with the model refit on train + valid.
for_predict = for_clf.predict(test[feature].values, num_iteration=for_clf.best_iteration)

In [None]:
# Build the submission from the test row ids and their scores. The explicit copy keeps the
# new `score` column from being assigned into a slice of `test`.
submit = test[['index']].copy()
submit['score'] = for_predict
# The file name carries the digits after the decimal point of the validation GAUC.
submit.to_csv('./submit_{}.csv'.format(str(xx_gauc).split('.')[1]),index=False)