#### 对整个集合按时间窗口进行划分，计算precision recall f1

时间窗（区间为左闭右开，即 start <= 评论时间 < end；注意训练窗口与测试窗口在 2018-01-10 至 2018-01-30 之间重叠）：
train_start_date = '2018-01-05'
train_end_date = '2018-01-30'

test_start_date = '2018-01-10'
test_end_date = '2018-02-07'

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Dropout, SpatialDropout1D, Activation, concatenate
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import ReLU, PReLU, LeakyReLU, ELU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from sklearn.utils import shuffle

In [None]:
def load_movies_dataset():
    """Load the movie table from ``dataset1/movie.csv`` under ``data_dir``.

    Returns:
        DataFrame of movies in which the ``评分`` column has been renamed to
        ``豆瓣网评分``.
    """
    movies = pd.read_csv(data_dir + './dataset1/movie.csv')
    # Presumably renamed so it cannot collide with the per-user ``评分``
    # column once both tables are merged -- confirm against the CSV schema.
    return movies.rename(columns={'评分': "豆瓣网评分"})

def load_user_and_ratings():
    """Load the user rating table from ``dataset1/user.csv`` under ``data_dir``.

    Returns:
        DataFrame of user ratings whose ``评论时间`` column has been parsed
        with ``pd.to_datetime``.
    """
    ratings = pd.read_csv(data_dir + './dataset1/user.csv')
    parsed_times = pd.to_datetime(ratings['评论时间'])
    return ratings.assign(**{'评论时间': parsed_times})

def call_data_process(dfuserin, dfmoviein):
    """Attach per-user rating statistics and movie attributes to each rating.

    Args:
        dfuserin: user rating frame with at least the columns ``用户ID``,
            ``评分``, ``电影名`` and ``类型``. A ``用户评论次数_观看电影个数``
            column (number of ratings per user) is added to it in place.
        dfmoviein: movie frame with at least ``电影名`` and ``类型``.

    Returns:
        DataFrame with one row per rating that has a matching movie (inner
        join on ``电影名`` and ``类型``), carrying the per-user columns
        ``user_rating_avg``, ``user_rating_max``, ``user_rating_min`` and
        ``user_rating_median``.
    """
    # Named aggregation: passing a {new_name: func} dict to SeriesGroupBy.agg
    # was removed in pandas 1.0 and raises SpecificationError.
    per_user_stats = (
        dfuserin.groupby(['用户ID'])['评分']
        .agg(
            user_rating_avg='mean',
            user_rating_max='max',
            user_rating_min='min',
            user_rating_median='median',
        )
        .reset_index()
    )

    dfuserin['用户评论次数_观看电影个数'] = (
        dfuserin.groupby(['用户ID'])['评分'].transform('count')
    )
    u_data = pd.merge(per_user_stats, dfuserin, on='用户ID', how='inner')
    user_and_movies_df = pd.merge(
        u_data, dfmoviein, on=['电影名', '类型'], how='inner'
    )
    return user_and_movies_df

def get_all_real_datas_change():
    """Build the full user/movie interaction table without missing values.

    Loads the movie and user rating tables, removes the ``用户名`` column
    from the ratings, joins both through ``call_data_process`` and drops
    every row that contains a missing value.

    Returns:
        A deep-copied, NaN-free DataFrame of interactions.
    """
    movies = load_movies_dataset()
    ratings = load_user_and_ratings()
    ratings = ratings.drop(columns=['用户名'])
    merged = call_data_process(ratings, movies)
    return merged.copy(deep=True).dropna(axis=0, how='any')


In [None]:
def gen_action(start_date, end_date):
    """Return the interactions whose review time lies in a half-open window.

    The full interaction table is read from the pickle at
    ``action_cate8_path`` when that file exists, otherwise it is rebuilt
    with ``get_all_real_datas_change``.

    Args:
        start_date: inclusive lower bound compared against ``评论时间``.
        end_date: exclusive upper bound compared against ``评论时间``.

    Returns:
        A new DataFrame of the rows with
        ``start_date <= 评论时间 < end_date``, without the ``主演`` column.
    """
    if os.path.exists(action_cate8_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(action_cate8_path, 'rb') as cache_file:
            actions = pickle.load(cache_file)
    else:
        actions = get_all_real_datas_change()
    in_window = (actions.评论时间 >= start_date) & (actions.评论时间 < end_date)
    # drop() returns a fresh frame, so the cached/source table is not
    # modified by removing the column.
    return actions[in_window].drop(columns=['主演'])

def gen_labels(start_date, end_date):
    """Return binary "liked" labels for the interactions in a date window.

    A rating of 6 or more is labelled 1, anything else 0. The result is
    cached as a pickle under ``my_dir + '/cache'``, keyed by both dates.

    Args:
        start_date: inclusive start of the window (passed to ``gen_action``).
        end_date: exclusive end of the window (passed to ``gen_action``).

    Returns:
        Series named ``喜欢`` holding one 0/1 label per interaction row.
    """
    dump_path = my_dir + '/cache/labels_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    actions = gen_action(start_date, end_date)
    labels = actions['评分'].apply(lambda x: 1 if x >= 6 else 0).rename('喜欢')
    # Create the cache directory on first use so the dump cannot fail with
    # FileNotFoundError.
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    with open(dump_path, 'wb') as cache_file:
        pickle.dump(labels, cache_file)
    return labels


def gen_truth(act_start_date, act_end_date):
    """Return the real interactions in a date window with a "liked" flag.

    The interactions come from ``gen_action``; a ``喜欢`` column is added
    that is 1 where ``评分 >= 6`` and 0 otherwise. The result is cached as a
    pickle under ``my_dir + '/cache'``, keyed by both dates.

    Args:
        act_start_date: inclusive start of the window.
        act_end_date: exclusive end of the window.

    Returns:
        DataFrame of interactions including the ``喜欢`` column.
    """
    dump_path = my_dir + '/cache/truth_%s_%s.pkl' % (act_start_date, act_end_date)
    if os.path.exists(dump_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    actions = gen_action(act_start_date, act_end_date)
    actions['喜欢'] = actions['评分'].apply(lambda x: 1 if x >= 6 else 0)
    # Create the cache directory on first use so the dump cannot fail with
    # FileNotFoundError.
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    with open(dump_path, 'wb') as cache_file:
        pickle.dump(actions, cache_file)
    return actions

def make_train_set(train_start_date, train_end_date):
    """Build the training interactions and their labels for a date window.

    The interaction frame is cached as a pickle under ``my_dir + '/cache'``,
    keyed by both dates. Labels always come from ``gen_labels``, which keeps
    its own cache.

    Previously the cache-hit branch never assigned ``label`` (raising
    ``UnboundLocalError`` at the ``return``) and the cache file was never
    written; both are fixed here.

    Args:
        train_start_date: inclusive start of the training window.
        train_end_date: exclusive end of the training window.

    Returns:
        Tuple ``(train_set, label)``: the interaction DataFrame and the
        label Series for the same window.
    """
    dump_path = my_dir + '/cache/train_set_%s_%s.pkl' % (train_start_date, train_end_date)
    print('make_train_set dump_path', dump_path)
    if os.path.exists(dump_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path, 'rb') as cache_file:
            train_set = pickle.load(cache_file)
    else:
        train_set = gen_action(train_start_date, train_end_date)
        os.makedirs(os.path.dirname(dump_path), exist_ok=True)
        with open(dump_path, 'wb') as cache_file:
            pickle.dump(train_set, cache_file)
    # Resolved outside the branch so both paths return a label.
    label = gen_labels(train_start_date, train_end_date)
    return train_set, label

def make_test_set(test_start_date, test_end_date):
    """Build the test interactions for a date window.

    The interaction frame is cached as a pickle under ``my_dir + '/cache'``,
    keyed by both dates.

    Args:
        test_start_date: inclusive start of the test window.
        test_end_date: exclusive end of the test window.

    Returns:
        Tuple ``(index, test_set)`` where ``index`` is a copy of the
        ``用户ID`` column (as a one-column DataFrame) and ``test_set`` is the
        full interaction DataFrame.
    """
    dump_path = my_dir + '/cache/test_set_%s_%s.pkl' % (test_start_date, test_end_date)
    if os.path.exists(dump_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path, 'rb') as cache_file:
            test_set = pickle.load(cache_file)
    else:
        test_set = gen_action(test_start_date, test_end_date)
        # Create the cache directory on first use so the dump cannot fail
        # with FileNotFoundError.
        os.makedirs(os.path.dirname(dump_path), exist_ok=True)
        with open(dump_path, 'wb') as cache_file:
            pickle.dump(test_set, cache_file)

    index = test_set[['用户ID']].copy()
    return index, test_set

In [None]:
def preprocessing(all_real_data):
    """Turn raw interactions into model-ready training arrays.

    Steps: drop rows with missing values, binarize ``评分`` into the label
    (1 when ``评分 >= 6``, else 0), label-encode every column listed in
    ``CATEGORICAL_COLUMNS`` and standardize the columns listed in
    ``CONTINUOUS_COLUMNS``. The input frame is not modified.

    No hold-out split is made: every row is used for training.

    Args:
        all_real_data: interaction DataFrame containing ``评分`` plus all
            categorical and continuous feature columns.

    Returns:
        List ``[x_train, y_train, x_train_categ, x_train_conti, all_data]``:
        the feature frame, the 0/1 label array, the encoded categorical
        matrix, the standardized continuous matrix and the full encoded
        feature frame (without ``评分`` and the label column).
    """
    # dropna() returns a new frame; the previous code discarded the result,
    # so rows with missing values were never actually removed.
    all_data = all_real_data.dropna(how='any', axis=0).copy(deep=True)

    # Binarize the rating into the label and remove both the rating and any
    # pre-existing label column from the features.
    y = all_data.pop('评分').apply(lambda x: 1 if x >= 6 else 0).values
    all_data = all_data.drop(columns=[LABEL_COLUMN], errors='ignore')

    # NOTE(review): the encoders and the scaler fitted here are discarded,
    # while preprocessing_rec fits its own on the data it receives, so the
    # integer codes and scaling seen at prediction time may differ from the
    # ones used for training -- confirm whether they should be shared.
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        all_data[c] = le.fit_transform(all_data[c])

    # The data is time-ordered and deliberately not shuffled; all rows are
    # used as the training set.
    x_train = all_data.iloc[:]
    y_train = y

    x_train_categ = np.array(x_train[CATEGORICAL_COLUMNS])
    x_train_conti = np.array(x_train[CONTINUOUS_COLUMNS], dtype='float64')

    # Standardize the continuous features.
    scaler = StandardScaler()
    x_train_conti = scaler.fit_transform(x_train_conti)
    return [x_train, y_train, x_train_categ, x_train_conti, all_data]

class Wide_and_Deep:
    """Wide & Deep binary classifier over the interaction features.

    The deep part embeds each column of ``CATEGORICAL_COLUMNS``, projects the
    columns of ``CONTINUOUS_COLUMNS`` through a dense layer and stacks fully
    connected layers on top. The wide part feeds degree-2 interaction
    features of the encoded categorical columns directly into the output
    layer.

    ``mode`` selects the architecture: ``'wide and deep'`` (default) or
    ``'deep'``. Any other value makes ``create_model``, ``train_model`` and
    ``predict_model`` print ``'wrong mode'`` and return ``None``.
    """

    def __init__(self, data, mode='wide and deep'):
        """Preprocess ``data`` and prepare the wide (polynomial) features.

        Args:
            data: raw interaction DataFrame accepted by ``preprocessing``.
            mode: ``'wide and deep'`` or ``'deep'``.
        """
        self.data = data
        self.mode = mode
        x_train, y_train, x_train_categ, x_train_conti, all_data = \
            preprocessing(self.data)
        self.x_train = x_train
        self.y_train = y_train
        self.x_train_categ = x_train_categ
        self.x_train_conti = x_train_conti
        self.all_data = all_data
        # Degree-2 interaction terms of the encoded categorical columns are
        # the inputs of the wide part.
        self.poly = PolynomialFeatures(degree=2, interaction_only=True)
        self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
        # Keras tensors/model, filled in by create_model().
        self.categ_inputs = None
        self.conti_input = None
        self.deep_component_outlayer = None
        self.logistic_input = None
        self.model = None

    def _build_input_data(self):
        """Return the feature arrays in the order of the model inputs.

        The order matches ``create_model``: continuous block, one array per
        categorical column and, in ``'wide and deep'`` mode, the polynomial
        features. Prints ``'wrong mode'`` and returns ``None`` for an
        unknown mode.
        """
        deep_inputs = [self.x_train_conti] + \
                      [self.x_train_categ[:, i] for i in range(self.x_train_categ.shape[1])]
        if self.mode == 'wide and deep':
            return deep_inputs + [self.x_train_categ_poly]
        if self.mode == 'deep':
            return deep_inputs
        print('wrong mode')
        return None

    def deep_component(self):
        """Build the deep part: embeddings + continuous projection + MLP.

        Each categorical column gets its own scalar input and an embedding
        whose vocabulary size is the number of distinct encoded values and
        whose width is ``ceil(vocabulary_size ** 0.25)``. The embedding
        output has shape ``(None, 1, embed_dim)`` and is flattened to
        ``(None, embed_dim)``.
        """
        categ_inputs = []
        categ_embeds = []
        for column in CATEGORICAL_COLUMNS:
            input_i = Input(shape=(1,), dtype='int32')
            dim = len(np.unique(self.all_data[column]))
            embed_dim = int(np.ceil(dim ** 0.25))
            # ``input_length`` is deprecated in recent Keras releases; the
            # sequence length is already fixed by the Input shape.
            embed_i = Embedding(dim, embed_dim)(input_i)
            flatten_i = Flatten()(embed_i)
            categ_inputs.append(input_i)
            categ_embeds.append(flatten_i)
        # Continuous columns share one dense projection.
        conti_input = Input(shape=(len(CONTINUOUS_COLUMNS),))
        conti_dense = Dense(256, use_bias=False)(conti_input)
        # Concatenate the projection with all embeddings, then
        # activation + batch normalization.
        concat_embeds = concatenate([conti_dense] + categ_embeds)
        concat_embeds = Activation('relu')(concat_embeds)
        bn_concat = BatchNormalization()(concat_embeds)
        # Fully connected stack: dense -> ReLU -> batch norm.
        fc1 = Dense(512, use_bias=False)(bn_concat)
        ac1 = ReLU()(fc1)
        bn1 = BatchNormalization()(ac1)
        fc2 = Dense(256, use_bias=False)(bn1)
        ac2 = ReLU()(fc2)
        bn2 = BatchNormalization()(ac2)
        fc3 = Dense(128)(bn2)
        ac3 = ReLU()(fc3)

        self.categ_inputs = categ_inputs
        self.conti_input = conti_input
        self.deep_component_outlayer = ac3

    def wide_component(self):
        """Build the wide part: one input as wide as the polynomial features."""
        dim = self.x_train_categ_poly.shape[1]
        self.logistic_input = Input(shape=(dim,))

    def create_model(self):
        """Assemble the Keras model for the configured mode.

        The final layer is a single sigmoid unit over either the deep output
        concatenated with the wide input (``'wide and deep'``) or the deep
        output alone (``'deep'``). For an unknown mode ``'wrong mode'`` is
        printed and ``self.model`` stays ``None``.
        """
        self.deep_component()
        self.wide_component()
        if self.mode == 'wide and deep':
            out_layer = concatenate([self.deep_component_outlayer, self.logistic_input])
            inputs = [self.conti_input] + self.categ_inputs + [self.logistic_input]
        elif self.mode == 'deep':
            out_layer = self.deep_component_outlayer
            inputs = [self.conti_input] + self.categ_inputs
        else:
            print('wrong mode')
            return

        output = Dense(1, activation='sigmoid')(out_layer)
        self.model = Model(inputs=inputs, outputs=output)

    def train_model(self, epochs=2, optimizer='adam', batch_size=128):
        """Compile the model and fit it on the training features.

        Args:
            epochs: number of training epochs.
            optimizer: optimizer name or instance passed to ``compile``.
            batch_size: mini-batch size.

        Returns:
            ``None``. Prints a message and returns early when no model has
            been created or the mode is unknown.
        """
        if self.model is None:
            print('You have to create model first')
            return

        if self.mode == 'wide and deep':
            print('type wide and deep is \n', type(self.x_train_categ))
        input_data = self._build_input_data()
        if input_data is None:
            return

        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', ])
        self.model.fit(input_data, self.y_train, epochs=epochs, batch_size=batch_size)

    def evaluate_model(self):
        """Placeholder: currently does nothing.

        ``preprocessing`` no longer produces a held-out split, so there is no
        test data on the instance to evaluate against.
        """
        pass

    def save_model(self, filename='wide_and_deep.h5'):
        """Save the model to ``filename``.

        Prints a message and returns without saving when no model has been
        created yet.
        """
        if self.model is None:
            print('You have to create model first')
            return
        self.model.save(filename)

    def predict_model(self):
        """Predict on the features held by this instance.

        The previous implementation read ``self.x_all_*`` attributes that are
        never set anywhere in the class (``AttributeError``) and returned
        nothing. The only features an instance holds are the ones produced by
        ``preprocessing`` in ``__init__``, so those are used.

        Returns:
            The array returned by ``model.predict``, or ``None`` when no
            model exists or the mode is unknown.
        """
        if self.model is None:
            print('You have to create model first')
            return

        input_data = self._build_input_data()
        if input_data is None:
            return
        return self.model.predict(input_data)

def preprocessing_rec(dfin):
    """Prepare interactions for prediction with the trained network.

    Drops rows with missing values, label-encodes the columns in
    ``CATEGORICAL_COLUMNS``, standardizes the columns in
    ``CONTINUOUS_COLUMNS`` and builds the degree-2 interaction features used
    by the wide part. The input frame is not modified.

    Two defects of the previous version are fixed: the ``dropna`` result was
    discarded, and ``reverse_all_data`` received a fresh ``0..n-1`` index
    while ``all_data`` kept the filtered index of ``dfin``, so any
    index-aligned assignment between the two mixed up or lost rows. Both
    frames now share the same ``0..n-1`` index.

    Args:
        dfin: interaction DataFrame with all categorical and continuous
            feature columns.

    Returns:
        List ``[x_all_categ, x_all_conti, x_all_categ_poly, all_data,
        reverse_all_data]``. ``all_data`` is the encoded frame;
        ``reverse_all_data`` has the same columns and index, with the
        original (decoded) values in the categorical columns and no values
        in the others.
    """
    # dropna/reset_index both return new frames, so dfin is left untouched.
    all_data = dfin.dropna(how='any', axis=0).reset_index(drop=True)
    reverse_all_data = pd.DataFrame(index=all_data.index,
                                    columns=all_data.columns.to_list())
    # NOTE(review): these encoders/scaler are fitted on the prediction data
    # and are independent of the ones fitted during training -- confirm that
    # the resulting codes match what the saved model was trained on.
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        all_data[c] = le.fit_transform(all_data[c])
        reverse_all_data[c] = le.inverse_transform(all_data[c])

    x_all_categ = np.array(all_data[CATEGORICAL_COLUMNS])
    x_all_conti = np.array(all_data[CONTINUOUS_COLUMNS], dtype='float64')

    # Standardize the continuous features.
    scaler = StandardScaler()
    x_all_conti = scaler.fit_transform(x_all_conti)
    x_all_poly = PolynomialFeatures(degree=2, interaction_only=True)
    x_all_categ_poly = x_all_poly.fit_transform(x_all_categ)
    return [x_all_categ, x_all_conti, x_all_categ_poly, all_data, reverse_all_data]

def get_predictions(df_in):
    """Score interactions with the saved ``wide_and_deep.h5`` model.

    The result is cached at ``my_dir + '/cache/predictions.pkl'``.
    NOTE(review): the cache key does not depend on ``df_in``, so a stale
    file is returned for any later input -- delete it when the data or the
    model changes.

    Args:
        df_in: interaction DataFrame accepted by ``preprocessing_rec``.

    Returns:
        DataFrame of the interactions with the categorical columns restored
        to their original values and the model output stored in ``喜欢``;
        rows containing missing values are dropped.
    """
    dump_path = my_dir + '/cache/predictions.pkl'
    if os.path.exists(dump_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    x_all_categ, x_all_conti, x_all_categ_poly, all_data, reverse_all_data = \
        preprocessing_rec(df_in)
    input_data = [x_all_conti] + \
                 [x_all_categ[:, i] for i in range(x_all_categ.shape[1])] + \
                 [x_all_categ_poly]

    # Restore the label-encoded columns to their original values. The
    # assignment is positional (plain arrays): aligning on the index would
    # pair rows of frames whose indexes may differ and leave NaN behind.
    restored_columns = ["类型", "特色", "地区", "导演", "电影名"]
    all_data[restored_columns] = reverse_all_data[restored_columns].to_numpy()

    wide_deep_net = load_model('wide_and_deep.h5')
    predictions = wide_deep_net.predict(input_data)
    # One sigmoid output per row; flattened and assigned positionally for
    # the same reason as above.
    all_data['喜欢'] = np.asarray(predictions).reshape(-1)
    all_data = all_data.dropna(how='any', axis=0)

    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    with open(dump_path, 'wb') as cache_file:
        pickle.dump(all_data, cache_file)
    return all_data


In [None]:
def offline_test(train_start_date='2018-01-05', train_end_date='2018-01-30',
                 test_start_date='2018-01-10', test_end_date='2018-02-07'):
    """Train on one date window, predict on another and return both results.

    Trains a ``Wide_and_Deep`` model on the training window, saves it with
    ``save_model`` (default file name), scores the test window with
    ``get_predictions`` and loads the matching ground truth with
    ``gen_truth``.

    NOTE(review): with the default dates the test window overlaps the
    training window (2018-01-10 up to 2018-01-30), so the test data is not
    fully unseen -- confirm this is intended.

    Args:
        train_start_date: inclusive start of the training window.
        train_end_date: exclusive end of the training window.
        test_start_date: inclusive start of the test window.
        test_end_date: exclusive end of the test window.

    Returns:
        Tuple ``(truth, predict_data)``: the real interactions of the test
        window and the scored interactions.
    """
    train_X, train_Y = make_train_set(train_start_date, train_end_date)
    assert train_X.shape[0] == train_Y.shape[0], \
        'training features and labels differ in length'
    wide_deep_net = Wide_and_Deep(train_X)
    wide_deep_net.create_model()
    wide_deep_net.train_model()
    wide_deep_net.save_model()
    print('after training...')
    test_index, test_X = make_test_set(test_start_date, test_end_date)
    assert train_X.shape[1] == test_X.shape[1], \
        'training and test sets differ in number of columns'
    # Predicted interactions for the test window.
    predict_data = get_predictions(test_X)
    # Real interactions for the same window.
    truth = gen_truth(test_start_date, test_end_date)
    return truth, predict_data

def PrecisonRecall():
    """Compute precision, recall and F1 of predicted vs. real movies per user.

    Ground truth and predictions are read from ``true.pkl`` / ``predict.pkl``
    under ``my_dir + '/cache'`` when both exist; otherwise ``offline_test``
    is run and its results are cached there. Both tables are also written to
    ``./true.csv`` and ``./predict.csv``.

    For every user present in both tables, the distinct predicted and real
    movie names are compared: ``hit`` counts the names in both sets,
    precision is ``hit / predicted`` and recall is ``hit / real``, summed
    over those users.

    NOTE(review): neither table is filtered on the ``喜欢`` column, so every
    row counts as a liked / predicted movie -- confirm whether a threshold
    on ``喜欢`` was intended.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats. A metric whose
        denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    print('------calculate recall&precision---------')
    dump_path_true = my_dir + '/cache/true.pkl'
    dump_path_predict = my_dir + '/cache/predict.pkl'
    if os.path.exists(dump_path_true) and os.path.exists(dump_path_predict):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files produced by this project.
        with open(dump_path_true, 'rb') as cache_file:
            true = pickle.load(cache_file)
        with open(dump_path_predict, 'rb') as cache_file:
            predict = pickle.load(cache_file)
    else:
        true, predict = offline_test()
        os.makedirs(os.path.dirname(dump_path_true), exist_ok=True)
        with open(dump_path_true, 'wb') as cache_file:
            pickle.dump(true, cache_file)
        with open(dump_path_predict, 'wb') as cache_file:
            pickle.dump(predict, cache_file)

    # Work on copies: adding columns to a column-subset of another frame
    # triggers SettingWithCopyWarning and may not write through.
    true = true[['用户ID', '电影名']].copy()
    true["用户真实喜欢的电影名"] = true['电影名']
    t1 = true.groupby('用户ID')['电影名'].size().reset_index()
    t1 = t1.rename(columns={'电影名': "用户实际观看电影个数"})
    true = pd.merge(true, t1, on='用户ID')

    predict = predict[['用户ID', '电影名']].copy()
    predict["预测用户喜欢的电影名"] = predict['电影名']
    t2 = predict.groupby('用户ID')['电影名'].size().reset_index()
    t2 = t2.rename(columns={'电影名': "预测用户观看电影个数"})
    predict = pd.merge(predict, t2, on='用户ID')

    del true['电影名']
    del predict['电影名']
    true.to_csv('./true.csv', index=False, index_label=False)
    predict = predict.dropna(how='any', axis=0)
    predict.to_csv('./predict.csv', index=False, index_label=False)

    # One set of distinct movie names per user; a dict lookup replaces the
    # previous nested loop over every (predicted user, real user) pair.
    real_by_user = {
        user_id: set(names.values)
        for user_id, names in true.groupby('用户ID')['用户真实喜欢的电影名']
    }
    predicted_by_user = {
        user_id: set(names.values)
        for user_id, names in predict.groupby('用户ID')['预测用户喜欢的电影名']
    }

    hit = 0
    predict_watched = 0
    real_watched = 0
    for user_id, predicted_movies in predicted_by_user.items():
        real_movies = real_by_user.get(user_id)
        if real_movies is None:
            # Only users present in both tables contribute to the metrics.
            continue
        hit += len(real_movies & predicted_movies)
        predict_watched += len(predicted_movies)
        real_watched += len(real_movies)

    precision = hit / (1.0 * predict_watched) if predict_watched else 0.0
    recall = hit / (1.0 * real_watched) if real_watched else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1




In [None]:
# Columns of the merged user/movie interaction table.
COLUMNS = [
    '用户ID',
    'user_rating_avg',
    'user_rating_max',
    'user_rating_min',
    'user_rating_median',
    '评分',
    '电影名',
    '用户评论次数_观看电影个数',
    '类型',
    '地区',
    '导演',
    '特色',
    '豆瓣网评分',
]

# Name of the binary target column.
LABEL_COLUMN = "喜欢"

# Features that are label-encoded and embedded.
CATEGORICAL_COLUMNS = [
    "类型",
    "特色",
    "地区",
    "导演",
    "电影名",
]

# Features that are standardized and fed to the dense projection.
CONTINUOUS_COLUMNS = [
    "用户ID",
    "user_rating_avg",
    "user_rating_max",
    "user_rating_min",
    "user_rating_median",
    "用户评论次数_观看电影个数",
    "豆瓣网评分",
]

# Project root (cache files live under <my_dir>/cache) and dataset root.
my_dir = 'D:/python/My_Project/'
data_dir = 'D:/python/Jupyter_Last_project/dataset/'
# Optional pickle of the full interaction table, read by gen_action().
action_cate8_path = my_dir + '/cache/actions_train.pkl'
allr_df = get_all_real_datas_change().copy(deep=True)
# PrecisonRecall() runs offline_test() itself whenever its cached results
# are missing, so a separate offline_test() call here only trained the model
# a second time and discarded the result.
p, r, f1 = PrecisonRecall()
print('p is\n', p)
print('r is\n', r)
print('f1 is\n', f1)