In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler, QuantileTransformer

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (object, category, int8, unsigned, ...) is left
    untouched. ``df`` is modified in place and also returned.

    Args:
        df: DataFrame to shrink.
        verbose: when True, print the final memory usage and the relative saving.

    Returns:
        The same DataFrame object, with downcast columns.

    Note:
        Float columns are stored as float16 whenever their min/max fit, which
        rounds values to roughly three significant decimal digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Input/output locations, relative to the notebook's working directory.
train_path = '../init_data/toUserA/train.csv'
# test_path = '../init_data/toUserA/testa_nolabel.csv' # leaderboard-A test set
test_path = '../init_data/toUserB/testb_nolabel.csv'   # leaderboard-B test set
save_path = '../temp_data/'

# The cells below stream the whole training CSV before writing their pickles into
# save_path; create the directory up front so a missing folder cannot fail them
# only after the expensive read.
os.makedirs(save_path, exist_ok=True)

# The row identifier is not used as a feature, so it is dropped right after loading.
test = pd.read_csv(test_path).drop(columns=['ID'])
print(test.shape)

# Distinct users / videos present in the test set; the training data is
# restricted to these values further down.
test_user_lst = test['userid'].unique().tolist()
test_video_lst = test['videoid'].unique().tolist()

# ==== Data preprocessing ====

In [None]:
col = 'userid'
print(col)
# Stream the training CSV in 10M-row chunks, parsing only `col` and the
# videoid column used as the filter key.
reader = pd.read_csv(train_path, usecols=[col,'videoid'], chunksize=10000000)
kept_chunks = []   # filtered pieces, concatenated once after the loop
n_rows = 0         # running total of kept rows, for the progress line
chunk = None       # keeps `del chunk` below valid even if the reader yields nothing
for i, chunk in enumerate(reader, start=1):
    # Keep only rows whose video also occurs in the test set, and only `col`.
    chunk = chunk.loc[chunk['videoid'].isin(test_video_lst), [col]]
    kept_chunks.append(chunk)
    n_rows += len(chunk)
    print(i, chunk.shape, (n_rows, chunk.shape[1]))

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every iteration; a single concat is linear. The original row index is kept
# on purpose (no ignore_index): the merge cell joins the per-column pickles with
# concat(axis=1), which aligns on that index.
df_train = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame(columns=[col])
print('===', df_train.shape, '===')
# Drop every extra reference so the data can actually be freed.
del chunk, kept_chunks, reader; gc.collect()

df_train.to_pickle(save_path+f'train_{col}.pkl')
print(f'save train_{col} ok!')
del df_train; gc.collect()

In [None]:
col = 'tag'
print(col)
# Stream the training CSV in 10M-row chunks, parsing only `col` and the
# videoid column used as the filter key.
reader = pd.read_csv(train_path, usecols=[col,'videoid'], chunksize=10000000)
kept_chunks = []   # filtered pieces, concatenated once after the loop
n_rows = 0         # running total of kept rows, for the progress line
chunk = None       # keeps `del chunk` below valid even if the reader yields nothing
for i, chunk in enumerate(reader, start=1):
    # Keep only rows whose video also occurs in the test set, and only `col`.
    chunk = chunk.loc[chunk['videoid'].isin(test_video_lst), [col]]
    kept_chunks.append(chunk)
    n_rows += len(chunk)
    print(i, chunk.shape, (n_rows, chunk.shape[1]))

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every iteration; a single concat is linear. The original row index is kept
# on purpose (no ignore_index): the merge cell joins the per-column pickles with
# concat(axis=1), which aligns on that index.
df_train = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame(columns=[col])
print('===', df_train.shape, '===')
# Drop every extra reference so the data can actually be freed.
del chunk, kept_chunks, reader; gc.collect()

df_train.to_pickle(save_path+f'train_{col}.pkl')
print(f'save train_{col} ok!')
del df_train; gc.collect()

In [None]:
col = 'videoid'
print(col)
# Stream the training CSV in 10M-row chunks; videoid is both the saved column
# and the filter key, so it is the only column parsed.
reader = pd.read_csv(train_path, usecols=[col], chunksize=10000000)
kept_chunks = []   # filtered pieces, concatenated once after the loop
n_rows = 0         # running total of kept rows, for the progress line
chunk = None       # keeps `del chunk` below valid even if the reader yields nothing
for i, chunk in enumerate(reader, start=1):
    # Keep only rows whose video also occurs in the test set.
    chunk = chunk[chunk['videoid'].isin(test_video_lst)]
    kept_chunks.append(chunk)
    n_rows += len(chunk)
    print(i, chunk.shape, (n_rows, chunk.shape[1]))

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every iteration; a single concat is linear. The original row index is kept
# on purpose (no ignore_index): the merge cell joins the per-column pickles with
# concat(axis=1), which aligns on that index.
df_train = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame(columns=[col])
print('===', df_train.shape, '===')
# Drop every extra reference so the data can actually be freed.
del chunk, kept_chunks, reader; gc.collect()

df_train.to_pickle(save_path+f'train_{col}.pkl')
print(f'save train_{col} ok!')
del df_train; gc.collect()

In [None]:
# Parse the four binary labels as int8 to keep each chunk small.
# NOTE(review): read_csv with dtype='category' parses the values as strings. If
# test['videoid'] is numeric, the isin() filter below would match nothing here,
# while the other extraction cells (which let pandas infer the videoid dtype)
# would match - confirm that videoid is a string column.
dtype = {
    'videoid':'category',
    'is_like': 'int8', 
    'is_favourite': 'int8', 
    'is_share': 'int8', 
    'is_finish': 'int8',
}
label_cols = ['is_like','is_favourite','is_share','is_finish',]
col = 'feedback'
print(col)
reader = pd.read_csv(train_path, usecols=label_cols+['videoid'], dtype=dtype, chunksize=10000000)
kept_chunks = []   # filtered label columns, concatenated once after the loop
n_rows = 0         # running total of kept rows, for the progress line
chunk = None       # keeps `del chunk` below valid even if the reader yields nothing
for i, chunk in enumerate(reader, start=1):
    # Keep only rows whose video also occurs in the test set.
    chunk = chunk[chunk['videoid'].isin(test_video_lst)]
    n_rows += len(chunk)
    print(i, chunk.shape, (n_rows, chunk.shape[1]))
    # videoid is only the filter key; dropping it per chunk avoids concatenating
    # a categorical column that would be discarded afterwards anyway.
    kept_chunks.append(chunk[label_cols])

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every iteration; a single concat is linear. The original row index is kept
# on purpose (no ignore_index): the merge cell joins the per-column pickles with
# concat(axis=1), which aligns on that index.
df_train = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame(columns=label_cols)
print('===', df_train.shape, '===')
# Drop every extra reference so the data can actually be freed.
del chunk, kept_chunks, reader; gc.collect()

df_train.to_pickle(save_path+f'train_{col}.pkl')
print(f'save train_{col} ok!')
del df_train; gc.collect()

## Merge the per-column train files

In [None]:
# Load the per-column pickles written by the extraction cells.
parts = []
for col in ['userid', 'videoid', 'tag', 'feedback']:
    print(col)
    parts.append(pd.read_pickle(save_path+f'train_{col}.pkl'))

# concat(axis=1) aligns on the row index and silently fills NaN wherever the
# indices differ, so refuse to merge pickles that were not filtered identically.
if any(not part.index.equals(parts[0].index) for part in parts[1:]):
    raise ValueError('train_*.pkl files do not share the same row index; regenerate them against the same test set')

# One concat over all pieces instead of growing an initially empty frame.
df_train = pd.concat(parts, axis=1)
del parts

# Keep only interactions from users that also occur in the test set.
df_train = df_train[df_train['userid'].isin(test_user_lst)].reset_index(drop=True)
print(df_train.shape)

df_train.to_pickle(save_path+'df_train_sp.pkl')
print('save train ok!')

del df_train
gc.collect()

# label encode

In [None]:
# Reload the filtered training frame written by the merge step.
train = pd.read_pickle(f'{save_path}df_train_sp.pkl')
print(train.shape)

# Integer-encode the id-like columns. Each encoder is fitted on the test values
# only and then applied to both frames.
# NOTE(review): LabelEncoder.transform raises ValueError on values unseen in fit.
# userid and videoid were restricted to test values upstream, but tag was not -
# confirm every train tag also occurs in test.
for col in ('userid', 'videoid', 'tag'):
    print(f'{col}: label encoding')
    le = LabelEncoder().fit(test[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Downcast dtypes, then persist the processed frames (train overwrites its input file).
train = reduce_mem_usage(train)
train.to_pickle(f'{save_path}df_train_sp.pkl')
print('save train ok!')

test = reduce_mem_usage(test)
test.to_pickle(f'{save_path}df_test_sp.pkl')
print('save test ok!')

In [None]:
# # Number of distinct values per id column
# train[['userid','videoid','tag']].nunique()
# userid      19274
# videoid    222483
# tag            25

# # Missing values: tag has 836972
# train.isnull().sum()

# # Positive rate (mean) of each feedback label
# print(train['is_like'].mean())      # 0.01349
# print(train['is_favourite'].mean()) # 0.00219
# print(train['is_share'].mean())     # 0.00028
# print(train['is_finish'].mean())    # 0.23156