In [112]:
import pandas as pd
import numpy as np

# Locations of the raw '::'-separated data files, relative to the working directory.
movies_file, users_file, ratings_file = (
    './movies.dat',
    './users.dat',
    './ratings.dat',
)

## auxInfo的生成

### 用户信息导入

In [113]:
# User information: the file has no header row and uses '::' as the separator.
uname = ['user_id','gender','age','occupation','zip']
users = pd.read_table(
    users_file,
    sep='::',
    header=None,
    names=uname,
    engine='python',
)
# Keep every column except the zip code.
users = users.loc[:, ['user_id', 'gender', 'age', 'occupation']]
print(users.head())

   user_id gender  age  occupation
0        1      F    1          10
1        2      M   56          16
2        3      M   25          15
3        4      M   45           7
4        5      M   25          20


In [114]:
# Start the auxiliary-info table with the user ids only (same index as `users`).
aux_info = pd.DataFrame({'user_id': users['user_id']})
print(aux_info.head())

   user_id
0        1
1        2
2        3
3        4
4        5


### 年龄的离散化

In [115]:
# Discretize age: every distinct age value is replaced by its 1-based position
# among the sorted unique ages (the smallest age becomes 1).
ages = pd.Series(users['age'].unique()).sort_values().reset_index(drop=True)
# Build the age -> position lookup once and map through it, instead of
# scanning `ages` with a boolean mask for every single user row.
age_rank = pd.Series(ages.index + 1, index=ages.values)
aux_info['age'] = users['age'].map(age_rank)
print(aux_info.head())

   user_id  age
0        1    1
1        2    7
2        3    3
3        4    5
4        5    3


### one-hot encoding

In [116]:
# One-hot encode gender (one indicator column per gender value).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would be written to the CSV as True/False instead of 1/0.
gender_dummies = pd.get_dummies(users['gender'], dtype=int)
# One-hot encode occupation (one indicator column per occupation code).
occupation_dummies = pd.get_dummies(users['occupation'], dtype=int)
# Append both indicator groups to the right of the existing columns.
aux_info = pd.concat([aux_info, gender_dummies, occupation_dummies], axis=1)
print(aux_info.head())

   user_id  age  F  M  0  1  2  3  4  5  ...  11  12  13  14  15  16  17  18  \
0        1    1  1  0  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   
1        2    7  0  1  0  0  0  0  0  0  ...   0   0   0   0   0   1   0   0   
2        3    3  0  1  0  0  0  0  0  0  ...   0   0   0   0   1   0   0   0   
3        4    5  0  1  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   
4        5    3  0  1  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   

   19  20  
0   0   0  
1   0   0  
2   0   0  
3   0   0  
4   0   1  

[5 rows x 25 columns]


### 存储auxInfo

In [117]:
# Write the auxiliary user features to disk; the DataFrame index is not written.
aux_info.to_csv(path_or_buf='aux_info.csv', index=False)

## mainInfo的生成

### 电影信息导入

In [118]:
# Movie information: the file has no header row and uses '::' as the separator.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    movies_file,
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
print(movies)

      movie_id                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
3878      3948             Meet the Parents (2000)   
3879      3949          Requiem for a Dream (2000)   
3880      3950                    Tigerland (2000)   
3881      3951             Two Family House (2000)   
3882      3952               Contender, The (2000)   

                            genres  
0      Animation|Children's|Comedy  
1     Adventure|Children's|Fantasy  
2                   Comedy|Romance  
3                     Comedy|Drama  
4                           Comedy  
...                            ...  
3878                        Comedy  
3879                         Drama  
3880              

### 电影信息处理

In [119]:
# Split the '|'-separated genre string so that every (movie, genre) pair
# becomes its own row.
genre_rows = (
    movies['genres']
    .str.split('|', expand=True)       # one column per genre position
    .stack()                           # long format: one entry per genre
    .reset_index(level=1, drop=True)   # keep only the original movie row label
    .rename('genres')
)
# Joining on the index repeats each movie row once per genre; `title` is kept.
movies_result = movies.drop(columns=['genres']).join(genre_rows)
print(movies_result.head(20))

   movie_id                               title      genres
0         1                    Toy Story (1995)   Animation
0         1                    Toy Story (1995)  Children's
0         1                    Toy Story (1995)      Comedy
1         2                      Jumanji (1995)   Adventure
1         2                      Jumanji (1995)  Children's
1         2                      Jumanji (1995)     Fantasy
2         3             Grumpier Old Men (1995)      Comedy
2         3             Grumpier Old Men (1995)     Romance
3         4            Waiting to Exhale (1995)      Comedy
3         4            Waiting to Exhale (1995)       Drama
4         5  Father of the Bride Part II (1995)      Comedy
5         6                         Heat (1995)      Action
5         6                         Heat (1995)       Crime
5         6                         Heat (1995)    Thriller
6         7                      Sabrina (1995)      Comedy
6         7                      Sabrina

In [120]:
# Spot check: show the genre rows produced for movie 661.
is_movie_661 = movies_result['movie_id'] == 661
print(movies_result[is_movie_661])

     movie_id                             title      genres
655       661  James and the Giant Peach (1996)   Animation
655       661  James and the Giant Peach (1996)  Children's
655       661  James and the Giant Peach (1996)     Musical


### 评分信息导入

In [121]:
# Rating information: the file has no header row and uses '::' as the separator.
rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_table(
    ratings_file,
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)
print(ratings.head())

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


### 评分信息处理

In [122]:
# Convert the epoch-second timestamp into calendar year and month.
rated_at = pd.to_datetime(ratings['timestamp'], unit='s')
rating_year = rated_at.dt.year.fillna(0).astype("int")
rating_month = rated_at.dt.month.fillna(0).astype("int")
# Earliest year, and the earliest month within that year.
minyear = rating_year.min()
minmonth = rating_month[rating_year == minyear].min()
# Month index relative to the earliest month, which is counted as 1.
month_index = (rating_year - minyear) * 12 + rating_month - minmonth + 1
# Keep only the ids plus the month index; rating and raw timestamp are dropped.
ratings = ratings[['user_id', 'movie_id']].assign(time=month_index)
print(ratings)

         user_id  movie_id  time
0              1      1193     9
1              1       661     9
2              1       914     9
3              1      3408     9
4              1      2355    10
...          ...       ...   ...
1000204     6040      1091     1
1000205     6040      1094     1
1000206     6040       562     1
1000207     6040      1096     1
1000208     6040      1097     1

[1000209 rows x 3 columns]


In [123]:
# Spot check: show every rating row for movie 1193.
is_movie_1193 = ratings['movie_id'] == 1193
print(ratings[is_movie_1193])

        user_id  movie_id  time
0             1      1193     9
120           2      1193     9
1339         12      1193     9
1518         15      1193     9
1747         17      1193     9
...         ...       ...   ...
998284     6033      1193     1
998423     6035      1193     1
998894     6036      1193     1
999580     6037      1193     1
999958     6040      1193     2

[1725 rows x 3 columns]


### 电影信息和评分信息的拼接

In [124]:
# Attach title and genre to every rating. Because movies_result holds one row
# per (movie, genre), a rating of a multi-genre movie yields one row per genre.
merge_data = ratings.merge(movies_result, on='movie_id', how='left')
print(merge_data.head(10))

   user_id  movie_id  time                                   title      genres
0        1      1193     9  One Flew Over the Cuckoo's Nest (1975)       Drama
1        1       661     9        James and the Giant Peach (1996)   Animation
2        1       661     9        James and the Giant Peach (1996)  Children's
3        1       661     9        James and the Giant Peach (1996)     Musical
4        1       914     9                     My Fair Lady (1964)     Musical
5        1       914     9                     My Fair Lady (1964)     Romance
6        1      3408     9                  Erin Brockovich (2000)       Drama
7        1      2355    10                    Bug's Life, A (1998)   Animation
8        1      2355    10                    Bug's Life, A (1998)  Children's
9        1      2355    10                    Bug's Life, A (1998)      Comedy


In [125]:
# Count the merged rows for each (user, month index, genre) combination.
by_user_time_genre = merge_data.groupby(['user_id', 'time', 'genres'])
temp = by_user_time_genre.size().reset_index(name='counts')
print(temp.head(10))

   user_id  time      genres  counts
0        1     9      Action       5
1        1     9   Adventure       4
2        1     9   Animation       7
3        1     9  Children's      10
4        1     9      Comedy       9
5        1     9       Crime       2
6        1     9       Drama      19
7        1     9     Fantasy       3
8        1     9     Musical       9
9        1     9     Romance       5


In [126]:
# Collect all distinct genre names in sorted order.
genres = pd.Series(temp['genres'].unique()).sort_values().reset_index(drop=True)
# Map each genre name to its 1-based position in that sorted list.
# The name -> id lookup is built once and applied with a single map, instead
# of scanning `genres` with a boolean mask for every row of `temp`.
genre_id = pd.Series(genres.index + 1, index=genres.values)
temp['genres_id'] = temp['genres'].map(genre_id)
print(temp.head(10))

   user_id  time      genres  counts  genres_id
0        1     9      Action       5          1
1        1     9   Adventure       4          2
2        1     9   Animation       7          3
3        1     9  Children's      10          4
4        1     9      Comedy       9          5
5        1     9       Crime       2          6
6        1     9       Drama      19          8
7        1     9     Fantasy       3          9
8        1     9     Musical       9         12
9        1     9     Romance       5         14


In [127]:
# Keep the numeric columns only (the genre name is replaced by its id),
# in the order user, genre, time, count.
main_columns = ['user_id', 'genres_id', 'time', 'counts']
main_info = temp.loc[:, main_columns]
print(main_info.head())

   user_id  genres_id  time  counts
0        1          1     9       5
1        1          2     9       4
2        1          3     9       7
3        1          4     9      10
4        1          5     9       9


In [128]:
# Print the largest value of each id-like column, one per line.
for column in ('user_id', 'genres_id', 'time'):
    print(main_info[column].max())

6040
18
35


### 存储mainInfo

In [129]:
# Write the (user, genre, time, count) table to disk; the DataFrame index is not written.
main_info.to_csv(path_or_buf='main_info.csv', index=False)