In [1]:
# coding:utf-8
import ast
import os
import os.path as osp
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
# Wildcard imports are kept in their original relative order, because a later
# one can shadow names brought in by an earlier one.
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *

warnings.filterwarnings('ignore')
path = "d:/data/user_persona"

# Load the raw train/test tables. The files are read without a header row, so
# the column names are assigned explicitly below (test has no 'label' column).
train = pd.read_csv(osp.join(path, 'train.txt'), header=None)
test = pd.read_csv(osp.join(path, 'test.txt'), header=None)

train.columns = ['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
test.columns = ['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']

train['label'] = train['label'].astype(int)

# Stack train on top of test so both are preprocessed together. ignore_index
# gives the combined frame a fresh 0..n-1 index instead of duplicated labels;
# test rows have no label and are marked with -1.
data = pd.concat([train, test], ignore_index=True)
data['label'] = data['label'].fillna(-1)

# Each 'tagid' cell is expected to hold the text of a Python list literal.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary expressions that happen to be present in the data file.
# Every tag is then converted to a string token.
data['tagid'] = data['tagid'].apply(lambda raw: [str(tag) for tag in ast.literal_eval(raw)])

# Hyperparameters
#   embed_size           dimensionality of every tag embedding vector
#   MAX_NB_WORDS         number of distinct tokens occurring in tagid
#                        (handed to the tokenizer as num_words)
#   MAX_SEQUENCE_LENGTH  length of the tagid list fed to the model
MAX_SEQUENCE_LENGTH = 128
MAX_NB_WORDS = 230637
embed_size = 64

# Train word2vec on the tag lists of the combined frame.
# Pretrained alternatives such as ELMo or BERT could be considered here.
w2v_model = Word2Vec(
    sentences=data['tagid'].tolist(),
    vector_size=embed_size,
    window=5,
    min_count=1,
    epochs=10,
)
# Split the combined frame back into its training and test parts
# (the training rows come first).
X_train = data[:train.shape[0]]['tagid']
X_test = data[train.shape[0]:]['tagid']

# Build the vocabulary with the tf.keras Tokenizer, which assigns an integer id
# to every tag string.
# Keras only keeps ids strictly below num_words (i.e. the num_words - 1 most
# frequent tokens), so MAX_NB_WORDS + 1 is needed to retain MAX_NB_WORDS tokens;
# passing MAX_NB_WORDS itself silently drops the least frequent tag.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS + 1)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index
# Count how many distinct tokens were seen (MAX_NB_WORDS was simply set from
# this number). The extra 1 leaves room for index 0, which is not assigned to
# any token in word_index.
nb_words = 1 + len(word_index)
print('Total %s word vectors.' % nb_words)

# Lookup table handed to the model's Embedding layer: row `row` holds the
# word2vec vector of the token whose id is `row`.
embedding_matrix = np.zeros((nb_words, embed_size))
for token, row in word_index.items():
    try:
        vector = w2v_model.wv.get_vector(token)
    except KeyError:
        # Token unknown to word2vec: its row stays all zeros.
        continue
    else:
        if vector is not None:
            embedding_matrix[row] = vector

# Training labels as a NumPy array, indexed positionally by the CV folds.
y_categorical = train['label'].values

def my_model():
    """Build and compile the LSTM classifier used for every CV fold.

    The network takes an int32 sequence of MAX_SEQUENCE_LENGTH token ids,
    looks them up in a frozen Embedding layer initialised from
    ``embedding_matrix``, runs a 128-unit LSTM, then applies batch
    normalisation, dropout (rate 0.2) and a single sigmoid output unit.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        'adam' optimizer and the 'accuracy' metric.
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Word embedding backed by the pretrained vectors; not updated in training.
    embedded = Embedding(
        nb_words,
        embed_size,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)

    encoded = LSTM(128)(embedded)
    normalized = BatchNormalization()(encoded)
    regularized = Dropout(0.2)(normalized)
    probability = Dense(1, activation='sigmoid')(regularized)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model



Total 230638 word vectors.


In [2]:
# Sanity check of the padded training matrix; the second dimension should
# equal MAX_SEQUENCE_LENGTH.
X_train.shape

(300000, 128)

In [2]:
import numpy as np
# Scratch check: indexing an array with a list of positions returns the
# elements at those positions (the same pattern the CV loop relies on).
a = np.arange(1, 6)
a.take([1, 2])

array([2, 3])

In [6]:
# Stratified 3-fold cross-validation: one model is trained per fold.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=2019)
oof = np.zeros([len(train), 1])          # out-of-fold predictions for the training rows
predictions = np.zeros([len(test), 1])   # test predictions, averaged over the folds

# Create the checkpoint directory up front; not every Keras version creates it
# when the first checkpoint is written.
os.makedirs("./models", exist_ok=True)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
    print("fold n{}".format(fold_ + 1))
    # Release the Keras state left behind by the previous fold's model so that
    # memory does not keep growing from fold to fold.
    tf.keras.backend.clear_session()
    model = my_model()
    if fold_ == 0:
        model.summary()

    # Training stops once val_accuracy has not improved for 5 epochs, while the
    # checkpoint keeps the weights of the epoch with the best val_loss
    # (ModelCheckpoint's default monitor, spelled out here to make the
    # difference between the two callbacks visible).
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
    bst_model_path = "./models/{}.h5".format(fold_)
    model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
                                       save_best_only=True, save_weights_only=True)

    X_tra, X_val = X_train[trn_idx], X_train[val_idx]
    y_tra, y_val = y_categorical[trn_idx], y_categorical[val_idx]

    model.fit(X_tra, y_tra,
              validation_data=(X_val, y_val),
              epochs=12, batch_size=512, shuffle=True,
              callbacks=[early_stopping, model_checkpoint])

    # Predict with the checkpointed weights rather than the last epoch's.
    model.load_weights(bst_model_path)

    oof[val_idx] = model.predict(X_val)

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model.predict(X_test) / folds.n_splits
    print(predictions)
    del model

# Score the out-of-fold predictions: rows ranked in the lower half by predicted
# probability are labelled 0, all others 1, and the resulting labels are
# compared with the true ones via F1.
train['predict'] = oof.ravel()
train['rank'] = train['predict'].rank()
train['p'] = (~(train['rank'] <= train.shape[0] * 0.5)).astype('int64')
bst_f1_tmp = f1_score(train['label'].values, train['p'].values)
print(bst_f1_tmp)

# Build the submission from the fold-averaged test predictions.
# .copy() makes `submit` an independent frame, so the column assignments below
# are not performed on a slice of `test`.
submit = test[['pid']].copy()
submit['tmp'] = predictions.ravel()   # (n, 1) prediction array flattened to one column
submit.columns = ['user_id', 'tmp']

# Rank-based labelling: the lower half of the scores becomes 0, the rest 1.
submit['rank'] = submit['tmp'].rank()
submit['category_id'] = 1
submit.loc[submit['rank'] <= int(submit.shape[0] * 0.5), 'category_id'] = 0

print(submit['category_id'].mean())

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('submit', exist_ok=True)
# The file name carries the fractional digits of the out-of-fold F1 score.
submit[['user_id', 'category_id']].to_csv('submit/lstm_{}.csv'.format(str(bst_f1_tmp).split('.')[1]), index=False)

fold n1
Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 128, 64)           14760832  
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               98816     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 14,860,289
Trainable params: 99,201
Non-trainable params: 14,761,088
_____________________________

In [11]:
# Summary statistics of the out-of-fold predictions stored in train['predict'].
train['predict'].describe()

count    300000.000000
mean          0.506396
std           0.255147
min           0.002079
25%           0.303349
50%           0.524800
75%           0.711148
max           0.996728
Name: predict, dtype: float64

In [12]:
# Scan decision thresholds 0.20 .. 0.69 on the out-of-fold predictions, print
# the F1 score of each, and remember the best one. Previously the loop left
# `bst_f1_tmp` and train['p'] at the values of the LAST threshold (0.69), not
# the best-scoring one.
best_bound, best_f1 = None, -1.0
for i in range(20, 70):
    print("*"*70)
    print("*"*70)
    bound = i*1.0/100
    # Vectorised equivalent of labelling each row 1 when predict >= bound else 0.
    labels_at_bound = (train['predict'] >= bound).astype('int64')
    f1_at_bound = f1_score(train['label'].values, labels_at_bound.values)
    print("{}---{}".format(bound, f1_at_bound))
    if f1_at_bound > best_f1:
        best_bound, best_f1 = bound, f1_at_bound

# Leave the notebook state at the best threshold found.
train['p'] = (train['predict'] >= best_bound).astype('int64')
bst_f1_tmp = best_f1
print("best---{}---{}".format(best_bound, bst_f1_tmp))

**********************************************************************
**********************************************************************
0.2---0.7169251677735838
**********************************************************************
**********************************************************************
0.21---0.7186879943294839
**********************************************************************
**********************************************************************
0.22---0.7204210843676196
**********************************************************************
**********************************************************************
0.23---0.7220048609126476
**********************************************************************
**********************************************************************
0.24---0.7236387153281698
**********************************************************************
**********************************************************************
0.25---0.7250349922

0.69---0.5668577397439136


In [8]:
# Summary statistics of the test-set scores stored in submit['tmp'].
submit['tmp'].describe()

count    100000.000000
mean          0.479970
std           0.241401
min           0.003600
25%           0.289518
50%           0.484756
75%           0.669143
max           0.993465
Name: tmp, dtype: float64

In [13]:
# Submission using a fixed probability threshold instead of the rank split.
# NOTE(review): this score is hard-coded; presumably it was copied by hand from
# the threshold scan output - confirm it matches the 0.4 threshold used below.
bst_f1_tmp = 0.7322617694146036
# .copy() makes `submit` an independent frame, so the column assignments below
# are not performed on a slice of `test`.
submit = test[['pid']].copy()
submit['tmp'] = predictions.ravel()   # (n, 1) prediction array flattened to one column
submit.columns = ['user_id', 'tmp']

# Vectorised equivalent of labelling each row 1 when tmp >= 0.4 else 0.
submit['category_id'] = (submit['tmp'] >= 0.4).astype('int64')

print(submit['category_id'].mean())

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('submit', exist_ok=True)
# The file name carries the fractional digits of the F1 score above.
submit[['user_id', 'category_id']].to_csv('submit/lstm_{}.csv'.format(str(bst_f1_tmp).split('.')[1]), index=False)

0.61362


In [17]:
# NOTE: disabled experiment, kept for reference only. When enabled, it writes
# one submission file per threshold from 0.45 to 0.59 (named after the
# threshold's fractional digits) and prints the share of rows labelled 1.
# for bound in range(45,60):
#     bound = bound*1.0/100
#     submit = test[['pid']]
#     submit['tmp'] = predictions
#     submit.columns = ['user_id', 'tmp']

#     submit['category_id'] = submit['tmp'].apply(lambda x:1 if x>=bound else 0)

#     print(submit['category_id'].mean())

#     submit[['user_id', 'category_id']].to_csv('submit/lstm_{}.csv'.format(str(bound).split('.')[1]), index=False)

0.54721
0.53335
0.51988
0.50628
0.49287
0.47896
0.46549
0.45106
0.43738
0.42385
0.41077
0.39692
0.3833
0.369
0.35486
