In [11]:
import os, io, collections
import pandas as pd
import time
import random
import numpy as np
from surprise import Dataset, KNNBaseline, SVD, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split

# 协同过滤方法
# 载入movielens-100k数据集，一个经典的公开推荐系统数据集，有选项提示是否下载。
# data = Dataset.load_builtin('ml-100k')

# 或载入本地数据集# 数据集路径 path to dataset file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# 使用Reader指定文本格式，参数line_format指定特征（列名），参数sep指定分隔符
reader = Reader(line_format='user item rating timestamp', sep='\t')

# 加载数据集
data = Dataset.load_from_file(file_path, reader=reader)

data_df = pd.read_csv(file_path, sep='\t', header=None, names=['user','item','rating','timestamp'])
item_df = pd.read_csv(os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.item'),
                      sep='|', encoding='ISO-8859-1', header=None, names=['mid','mtitle']+[x for x in range(22)]) # index_col=0

print("dataframe maximun:\n", data_df.max())

indexarr = data_df.index.values
print(indexarr)
for it in indexarr[:50]:
    print(it)

for i in range(30):
    print(random.choice(indexarr))

# 每列都转换为字符串类型
data_df = data_df.astype(str)
item_df = item_df.astype(str)

item_dict = { item_df.loc[x, 'mid']: item_df.loc[x, 'mtitle'] for x in range(len(item_df)) }

dataframe maximun:
 user               943
item              1682
rating               5
timestamp    893286638
dtype: int64
[    0     1     2 ... 99997 99998 99999]
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
73138
37017
63465
57833
94598
58295
81718
59521
76481
95044
91915
14179
49983
50332
39050
76278
25788
69791
20903
83030
49976
43265
91005
4130
92196
87528
1729
94222
78202
91771


In [2]:
print(data_df.index[[0,1,2,3]])
data_df.head()

Int64Index([0, 1, 2, 3], dtype='int64')


Unnamed: 0,user,item,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [101]:
data_df_droped = data_df.drop([0, 4], axis=0)
data_df_droped.head()

Unnamed: 0,user,item,rating,timestamp
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488


In [102]:
data_df_sorted = data_df.sort_values(by=["user", "timestamp"], ascending=[True, True])
data_df_sorted

Unnamed: 0,user,item,rating,timestamp
59972,1,168,5,874965478
92487,1,172,5,874965478
74577,1,165,5,874965518
48214,1,156,4,874965556
15764,1,196,5,874965677
...,...,...,...,...
1855,99,354,2,888469332
1347,99,246,3,888469392
13430,99,255,3,888469419
21785,99,275,1,888469419


In [103]:
def item_to_movie_name(item):
    return item_dict[item]

def seconds_to_ctime(seconds):
    seconds = seconds if type(seconds) == "int" else int(seconds)
    return time.ctime(seconds)

In [104]:
data_user = data_df.loc[data_df["user"] == "2"]
print(data_user)
print(len(data_user))

print("---- testing ----")

dest = data_df.loc[3]
print(dest)
print(dest["item"])

      user item rating  timestamp
700      2  292      4  888550774
924      2  251      5  888552084
1052     2   50      5  888552084
3425     2  314      1  888980085
5063     2  297      4  888550871
...    ...  ...    ...        ...
77906    2  288      3  888550252
85606    2  286      4  888549960
88190    2  275      5  888550939
95677    2  302      5  888552084
97619    2  296      3  888550871

[62 rows x 4 columns]
62
---- testing ----
user               244
item                51
rating               2
timestamp    880606923
Name: 3, dtype: object
51


In [90]:
for idx, row in data_user.iterrows():
    # print(row["item"])
    row["item"] = item_to_movie_name(row["item"])

data_user

Unnamed: 0,user,item,rating,timestamp
700,2,Rosewood (1997),4,888550774
924,2,Shall We Dance? (1996),5,888552084
1052,2,Star Wars (1977),5,888552084
3425,2,3 Ninjas: High Noon At Mega Mountain (1998),1,888980085
5063,2,Ulee's Gold (1997),4,888550871
...,...,...,...,...
77906,2,Scream (1996),3,888550252
85606,2,"English Patient, The (1996)",4,888549960
88190,2,Sense and Sensibility (1995),5,888550939
95677,2,L.A. Confidential (1997),5,888552084


In [91]:
for idx, row in data_user.iterrows():
    row["timestamp"] = seconds_to_ctime(row["timestamp"])

print(data_user)

      user                                         item rating  \
700      2                              Rosewood (1997)      4   
924      2                       Shall We Dance? (1996)      5   
1052     2                             Star Wars (1977)      5   
3425     2  3 Ninjas: High Noon At Mega Mountain (1998)      1   
5063     2                           Ulee's Gold (1997)      4   
...    ...                                          ...    ...   
77906    2                                Scream (1996)      3   
85606    2                  English Patient, The (1996)      4   
88190    2                 Sense and Sensibility (1995)      5   
95677    2                     L.A. Confidential (1997)      5   
97619    2                          Promesse, La (1996)      3   

                      timestamp  
700    Fri Feb 27 11:39:34 1998  
924    Fri Feb 27 12:01:24 1998  
1052   Fri Feb 27 12:01:24 1998  
3425   Wed Mar  4 10:54:45 1998  
5063   Fri Feb 27 11:41:11 1998  
...

In [106]:
source = "123"
print(type(source), source)

dest = int(source)
print(type(dest), dest)


<class 'str'> 123
<class 'int'> 123
