In [1]:
import pandas as pd

In [6]:
user_path = '../data/u.user'

user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
df_user = pd.read_csv(user_path, sep='|', names=user_columns)

# 给age分段
df_user = df_user.drop(columns=['zip_code'])
df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                        labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90',
                                '90-100'])

In [7]:
df_user.head()

Unnamed: 0,user_id,age,gender,occupation
0,1,20-30,M,technician
1,2,50-60,F,other
2,3,20-30,M,writer
3,4,20-30,M,technician
4,5,30-40,F,other


In [8]:
df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'],prefix_sep='#')

In [9]:
df_user.head()

Unnamed: 0,user_id,gender#F,gender#M,occupation#administrator,occupation#artist,occupation#doctor,occupation#educator,occupation#engineer,occupation#entertainment,occupation#executive,...,age#0-10,age#10-20,age#20-30,age#30-40,age#40-50,age#50-60,age#60-70,age#70-80,age#80-90,age#90-100
0,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,5,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# one hot encoder ，将label转换成one hot形式
def onehot_encoder(labels,num_classes):
    # labels 是Series的数据格式
    enc = LabelEncoder()
    labels = enc.fit_transform(labels)
    labels = labels.astype(np.int32)
    batch_size = tf.size(labels)
    labels = tf.expand_dims(labels,1)
    indices = tf.expand_dims(tf.range(0,batch_size,1),1)
    concated = tf.concat(axis=1,values=[indices,labels])
    onehot_labels = tf.sparse_to_dense(concated,tf.stack([batch_size,num_classes]),1.0,0.0)
    # tensorflow 的数据格式是numpy
    with tf.Session() as sess:
        return sess.run(onehot_labels)

# item数据
def load_item(item_path):
    item_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
            'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
            'Thriller', 'War', 'Western']
    df_item = pd.read_csv(item_path, sep='|', names=item_columns,encoding="latin-1")
    df_item = df_item.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])
    return df_item


# user数据
def load_user(user_path):
    user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    df_user = pd.read_csv(user_path, sep='|', names=user_columns)

    # 给age分段
    df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                            labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90',
                                    '90-100'])
    # one hot 向量
    df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'],prefix_sep='#')
    df_user = df_user.drop(columns=['zip_code'])
    return df_user

# train数据
def load_train(train_path,item_path,user_path):
    df_item = load_item(item_path)
    df_user = load_user(user_path)
    train_columns = ['user_id', 'item_id', 'rating', 'timestamp']
    df_train = pd.read_csv(train_path, sep='\t', names=train_columns)

    # 将评分等于5的数据作为用户的点击数据，评分小于5分的数据作为用户的未点击数据，构造成一个而分类问题
    df_train['rating'] = df_train.rating.apply(lambda x: 1 if int(x) == 5 else 0)
    df_train = df_train.merge(df_user, on='user_id', how='left')
    df_train = df_train.merge(df_item, on='item_id', how='left')
    train_labels = onehot_encoder(df_train['rating'].astype(np.int32), 2)
    return df_train,train_labels

In [11]:
item_path = '../data/u.item'
user_path = '../data/u.user'
train_path = '../data/ua.base'
df_train, train_labels = load_train(train_path,item_path,user_path)

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [22]:
column_list = df_train.columns.tolist()

In [23]:
column_list

['user_id',
 'item_id',
 'rating',
 'timestamp',
 'gender#F',
 'gender#M',
 'occupation#administrator',
 'occupation#artist',
 'occupation#doctor',
 'occupation#educator',
 'occupation#engineer',
 'occupation#entertainment',
 'occupation#executive',
 'occupation#healthcare',
 'occupation#homemaker',
 'occupation#lawyer',
 'occupation#librarian',
 'occupation#marketing',
 'occupation#none',
 'occupation#other',
 'occupation#programmer',
 'occupation#retired',
 'occupation#salesman',
 'occupation#scientist',
 'occupation#student',
 'occupation#technician',
 'occupation#writer',
 'age#0-10',
 'age#10-20',
 'age#20-30',
 'age#30-40',
 'age#40-50',
 'age#50-60',
 'age#60-70',
 'age#70-80',
 'age#80-90',
 'age#90-100',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [24]:
'user_id'.split('#')

['user_id']

In [32]:
def get_feature2field_dict(column_list,prefix_sep='#'):
    '''
        column_list:dataframe的列名称列表
        prefex_sep:之前get_dummies的分割符号
        return:
            feature2field_dict
            field_list
    '''
    feature2field_dict = dict()
    # 获取field集合
    field_set = set()
    for col in column_list:
        if prefix_sep not in col:
            field_set.add(col)
        else:
            x_field = col.split(prefix_sep)[0]
            field_set.add(x_field)
    
    # 其中index代表标记        
    field_list = list(field_set)
    
    for i,col in enumerate(column_list):    
        if prefix_sep not in col:
            feature2field_dict[i] = field_list.index(col)
        else:
            field = col.split(prefix_sep)[0]
            feature2field_dict[i] = field_list.index(field)
    return feature2field_dict,field_list

In [33]:
feature2field_dict,field_list = get_feature2field_dict(column_list)

In [34]:
feature2field_dict

{0: 1,
 1: 5,
 2: 12,
 3: 2,
 4: 4,
 5: 4,
 6: 10,
 7: 10,
 8: 10,
 9: 10,
 10: 10,
 11: 10,
 12: 10,
 13: 10,
 14: 10,
 15: 10,
 16: 10,
 17: 10,
 18: 10,
 19: 10,
 20: 10,
 21: 10,
 22: 10,
 23: 10,
 24: 10,
 25: 10,
 26: 10,
 27: 19,
 28: 19,
 29: 19,
 30: 19,
 31: 19,
 32: 19,
 33: 19,
 34: 19,
 35: 19,
 36: 19,
 37: 14,
 38: 0,
 39: 7,
 40: 17,
 41: 21,
 42: 15,
 43: 23,
 44: 20,
 45: 9,
 46: 11,
 47: 24,
 48: 22,
 49: 16,
 50: 8,
 51: 3,
 52: 18,
 53: 13,
 54: 6}

In [35]:
field_list

['Adventure',
 'user_id',
 'timestamp',
 'Sci-Fi',
 'gender',
 'item_id',
 'Western',
 'Animation',
 'Romance',
 'Fantasy',
 'occupation',
 'Film-Noir',
 'rating',
 'War',
 'Action',
 'Crime',
 'Mystery',
 'Children',
 'Thriller',
 'age',
 'Drama',
 'Comedy',
 'Musical',
 'Documentary',
 'Horror']