# 下载 MovieLens-1M 数据并安装依赖包

In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip
! unzip -o ml-1m.zip

In [None]:
! pip install -q pyarrow

# 导入需要的库

In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# 读取数据

In [2]:
class MoiveLenDataLoader:
    """Load the MovieLens-1M users, ratings and movies tables.

    The three ``.dat`` files are read from an ``ml-1m`` directory located
    under ``data_path``; their fields are separated by ``::`` and they have
    no header row.
    """

    def __init__(self, data_path="./"):
        """Remember where the data lives and the column names of each table.

        Args:
            data_path: Directory that contains the extracted ``ml-1m``
                folder. A trailing path separator is optional.
        """
        self.data_path = data_path
        self.user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
        self.rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        self.movie_cols = ['movie_id', 'title', 'genres']

    def _read_table(self, file_name, columns, encoding=None):
        """Read one ``::``-separated file from the ``ml-1m`` directory."""
        # os.path.join works whether or not data_path ends with a separator;
        # plain string concatenation silently built a wrong path without one.
        file_path = os.path.join(self.data_path, "ml-1m", file_name)
        # A multi-character separator requires the python parsing engine.
        return pd.read_csv(
            file_path,
            sep='::',
            header=None,
            names=columns,
            encoding=encoding,
            engine='python',
        )

    def load_users(self):
        """Load the user table."""
        return self._read_table("users.dat", self.user_cols)

    def load_ratings(self):
        """Load the rating table."""
        return self._read_table("ratings.dat", self.rating_cols)

    def load_movies(self):
        """Load the movie table, keeping only the first genre of each movie."""
        # latin-1 maps every byte to one character and, unlike
        # unicode_escape, does not interpret backslash sequences in titles.
        movies = self._read_table("movies.dat", self.movie_cols, encoding="latin-1")
        # Genres are '|'-separated; only the first listed genre is kept.
        movies['genres'] = movies['genres'].map(lambda x: x.split('|')[0])
        return movies

    def load_data(self):
        """Load all three tables and merge them into one DataFrame.

        The individual tables are also stored on the instance as ``users``,
        ``ratings`` and ``movies``.

        Returns:
            The ratings joined (inner join) with the movies on ``movie_id``
            and then with the users on ``user_id``.
        """
        self.users = self.load_users()
        self.ratings = self.load_ratings()
        self.movies = self.load_movies()

        # pd.merge defaults to an inner join.
        data = pd.merge(self.ratings, self.movies, on='movie_id')
        data = pd.merge(data, self.users, on='user_id')
        return data

# Usage example: load and merge the three tables from ./ml-1m, then preview the result
data_loader = MoiveLenDataLoader(data_path="./")
data = data_loader.load_data()
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation,F,1,10,48067


In [3]:
class MovieLenDataProcessor:
    """Stateless helpers that turn the merged MovieLens frame into samples."""

    @staticmethod
    def gen_data_set(data, test_ratio=0.1, random_state=None):
        """Split ``data`` chronologically into a training and a test set.

        The ``int(len(data) * test_ratio)`` rows with the largest
        ``timestamp`` become the test set; all remaining rows become the
        training set, which is shuffled.

        Args:
            data: DataFrame with a ``timestamp`` column. It is not modified.
            test_ratio: Fraction of rows, between 0 and 1, used for testing.
            random_state: Optional seed forwarded to ``DataFrame.sample`` so
                the shuffle of the training rows can be reproduced.

        Returns:
            ``(train_df, test_df)``. ``train_df`` has a fresh ``0..n-1``
            index; ``test_df`` keeps the index labels of ``data`` and is
            ordered from newest to oldest.

        Raises:
            ValueError: If ``test_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= test_ratio <= 1:
            raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

        # Sort into a new frame so the caller's DataFrame is left untouched.
        ordered = data.sort_values(by='timestamp', ascending=False)

        test_size = int(len(ordered) * test_ratio)

        # Newest rows first, so the head of the frame is the test set.
        test_df = ordered.iloc[:test_size]
        train_df = ordered.iloc[test_size:]

        # Shuffle the training rows and drop the old index labels.
        train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

        return train_df, test_df

    @staticmethod
    def label_encode_sparse_features(data, sparse_features):
        """Label-encode the sparse features and record each feature's index bound.

        Args:
            data: Source DataFrame. It is copied, not modified.
            sparse_features: Names of the columns to encode.

        Returns:
            ``(encoded, feature_max_idx)`` where ``encoded`` is a copy of
            ``data`` whose listed columns hold integer codes starting at 1,
            and ``feature_max_idx`` maps each feature name to its largest
            code plus one, as a built-in ``int``.
        """
        _data = data.copy()
        feature_max_idx = {}

        for feature in sparse_features:
            lbe = LabelEncoder()
            # Codes are shifted by one so that 0 is never produced
            # (presumably reserved for padding -- confirm with the model code).
            _data[feature] = lbe.fit_transform(_data[feature]) + 1
            # Cast to a built-in int so the mapping is JSON-serialisable.
            feature_max_idx[feature] = int(_data[feature].max()) + 1

        return _data, feature_max_idx

In [4]:
# 1. Label-encode the sparse features
# Each listed column is replaced by integer codes; feature_max_idx maps every
# column name to its largest code + 1.
sparse_features = ["movie_id", "user_id",
                    "gender", "age", "occupation", "zip", "genres"]
data_label_encode, feature_max_idx = MovieLenDataProcessor.label_encode_sparse_features(data, sparse_features)

In [18]:
# 2. 生成历史观看序列特征
from joblib import Parallel, delayed

def get_watch_seq(row, df, max_seq_len):
    """Collect the most recent interactions in ``df`` that precede ``row``.

    Args:
        row: A record with a ``timestamp`` entry (for example one row of ``df``).
        df: DataFrame with ``timestamp``, ``movie_id`` and ``genres`` columns.
            It is not modified.
        max_seq_len: Maximum number of history entries to keep.

    Returns:
        A three-element ``pd.Series``: the list of movie ids, the list of
        genres (both ordered from newest to oldest) and the number of
        entries kept.
    """
    # Strict "<": rows sharing the timestamp of ``row`` (including ``row``
    # itself) are not part of its history.
    # The chain builds new frames, so neither ``df`` nor a temporary slice
    # is sorted in place.
    history = (
        df[df['timestamp'] < row['timestamp']]
        .sort_values(by='timestamp', ascending=False)
        .head(n=max_seq_len)
    )

    movie_ids = list(history['movie_id'])
    genres = list(history['genres'])
    return pd.Series([movie_ids, genres, len(history)])

def process_user_hist(user_hist_df, max_seq_len=50):
    """Attach watch-history sequence columns to every row of ``user_hist_df``.

    For each row, ``get_watch_seq`` is called with that row and the whole
    frame, and its three results are stored in the new columns
    ``watch_movie_seq``, ``watch_genre_seq`` and ``seq_len``.

    Args:
        user_hist_df: Rows to process together (the caller passes one
            ``user_id`` group at a time). It is not modified.
        max_seq_len: Maximum history length kept per row.

    Returns:
        A copy of ``user_hist_df`` with the three extra columns.
    """
    # Work on a copy: the input is a groupby slice of a larger frame, and
    # adding columns to it directly would mutate the caller's object.
    enriched = user_hist_df.copy()
    enriched[['watch_movie_seq', "watch_genre_seq", "seq_len"]] = enriched.apply(
        lambda row: get_watch_seq(row, user_hist_df, max_seq_len=max_seq_len),
        axis=1,
    )
    return enriched

# Group the label-encoded samples by user_id
grouped = data_label_encode.groupby('user_id')

# Process each user's history in parallel with joblib (n_jobs=-1: use all available CPUs)
results = Parallel(n_jobs=-1)(delayed(process_user_hist)(user_hist_df) for _, user_hist_df in tqdm(grouped))

# Concatenate the per-user results into one DataFrame
final_df = pd.concat(results)
final_df.head()

100%|██████████| 6040/6040 [02:45<00:00, 36.52it/s]


Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip,watch_movie_seq,watch_genre_seq,seq_len
0,1,1105,5,978300760,One Flew Over the Cuckoo's Nest (1975),8,1,1,11,1589,"[2600, 1118, 3178, 1659, 2148, 1179, 1575, 958...","[5, 8, 8, 8, 14, 5, 8, 3, 8]",9
1,1,640,3,978302109,James and the Giant Peach (1996),3,1,1,11,1589,"[1155, 2558, 1196, 2593, 854, 1026, 1839, 964,...","[8, 16, 1, 5, 12, 4, 3, 4, 8, 12, 8, 12, 8, 1,...",30
2,1,854,3,978301968,My Fair Lady (1964),12,1,1,11,1589,"[1026, 145, 1839, 964, 971, 1783, 878, 2890, 1...","[4, 8, 3, 4, 12, 8, 12, 8, 1, 8, 1, 6, 2, 1, 8...",25
3,1,3178,4,978300275,Erin Brockovich (2000),8,1,1,11,1589,"[1659, 2148, 1179, 1575, 958, 2970]","[8, 14, 5, 8, 3, 8]",6
4,1,2163,5,978824291,"Bug's Life, A (1998)",3,1,1,11,1589,"[582, 1, 575, 709, 2484, 514, 1422, 2206, 1108...","[3, 3, 3, 3, 3, 8, 8, 8, 1, 3, 5, 3, 5, 3, 4, ...",47


In [19]:
# 3. Split into training and test sets
train_df, test_df = MovieLenDataProcessor.gen_data_set(final_df)

# Save the DataFrames as Parquet files using pyarrow
train_df.to_parquet('ml1M-train.parquet', engine='pyarrow')
test_df.to_parquet('ml1M-test.parquet', engine='pyarrow')

In [20]:
# 4. Save feature_max_idx as JSON
import json

# The json module cannot serialise NumPy integer scalars, so convert every
# NumPy integer (np.int64, np.int32, ...) to a built-in int first.
# Only existing keys are reassigned, so iterating over items() here is safe.
for key, value in feature_max_idx.items():
    if isinstance(value, np.integer):
        feature_max_idx[key] = int(value)

with open('ml1M_feature_max_idx.json', 'w') as json_file:
    json.dump(feature_max_idx, json_file, indent=4)

In [12]:
# Read the mapping back from the JSON file and print it
with open('ml1M_feature_max_idx.json', 'r') as json_file:
    data_loaded = json.loads(json_file.read())

print("Loaded data from JSON file:", data_loaded, sep="\n")

Loaded data from JSON file:
{'movie_id': 3707, 'user_id': 6041, 'gender': 3, 'age': 8, 'occupation': 22, 'zip': 3440, 'genres': 19}
