In [1]:
import pandas as pd

# 采用ml-1m中的数据
# users.dat：UserID、Gender、Age、Occupation、Zip-code
# movies.dat：MovieID、Title、Genres
# ratings.dat：UserID、MovieID、Rating、Timestamp

In [2]:
#在数据处理过程中，合并、透视、分组、排序这四大类操作是最经常用的
ratings = pd.read_table('ml-1m/ratings.dat', header=None, names=['UserID','MovieID','Rating','Timestamp'], sep='::')
users = pd.read_table('ml-1m/users.dat', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], sep='::')
movies = pd.read_table('ml-1m/movies.dat', header=None, names=['MovieID','Title','Genres'], sep='::')
print(ratings.shape)
print(ratings.head(10))
print(users.head(10))
print(movies.head(10))

  


(1000209, 4)
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
5       1     1197       3  978302268
6       1     1287       5  978302039
7       1     2804       5  978300719
8       1      594       4  978302268
9       1      919       4  978301368
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
5       6      F   50           9    55117
6       7      M   35           1    06810
7       8      M   25          12    11413
8       9      M   25          17    61614
9      10      F   35           1    95370
   MovieID                               Title                        Genres
0        1         

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [4]:
moviestype = movies.groupby('Genres', as_index=False).size()
print(moviestype)

Genres
Action                                            65
Action|Adventure                                  25
Action|Adventure|Animation                         1
Action|Adventure|Animation|Children's|Fantasy      1
Action|Adventure|Animation|Horror|Sci-Fi           1
                                                ... 
Sci-Fi|Thriller|War                                1
Sci-Fi|War                                         1
Thriller                                         101
War                                               12
Western                                           33
Length: 301, dtype: int64


In [5]:
#在数据分析中，将不同的表合并起来汇聚成一张总表，会更加便于我们后面的处理与分析。
data = pd.merge(pd.merge(users, ratings, on='UserID'),movies, on='MovieID')
#data.head(10)
print(data[data.UserID == 1])

       UserID Gender  Age  Occupation Zip-code  MovieID  Rating  Timestamp  \
0           1      F    1          10    48067     1193       5  978300760   
1725        1      F    1          10    48067      661       3  978302109   
2250        1      F    1          10    48067      914       3  978301968   
2886        1      F    1          10    48067     3408       4  978300275   
4201        1      F    1          10    48067     2355       5  978824291   
5904        1      F    1          10    48067     1197       3  978302268   
8222        1      F    1          10    48067     1287       5  978302039   
8926        1      F    1          10    48067     2804       5  978300719   
10278       1      F    1          10    48067      594       4  978302268   
11041       1      F    1          10    48067      919       4  978301368   
12759       1      F    1          10    48067      595       5  978824268   
13819       1      F    1          10    48067      938       4 

In [6]:
#不同性别对电影的平均评分
#采用数据透视，建立以Title为行索引，Gender为列索引，mean为聚合方法来显示Rating中的数据。
data_gender = data.pivot_table(values='Rating', index='Title',columns='Gender', aggfunc='mean')
print(data_gender.head(10))
#向data_gender数据表中新插入了一列difference，用来存放男女用户评分的差值
#对difference列降序排列（或者升序），即可看到不同性别用户对相同电影评分差异最大的电影了
data_gender['difference'] = data_gender.F - data_gender.M
data_gender_sorted = data_gender.sort_values(by='difference', ascending=False)
data_gender_sorted.head(10)

Gender                                    F         M
Title                                                
$1,000,000 Duck (1971)             3.375000  2.761905
'Night Mother (1986)               3.388889  3.352941
'Til There Was You (1997)          2.675676  2.733333
'burbs, The (1989)                 2.793478  2.962085
...And Justice for All (1979)      3.828571  3.689024
1-900 (1994)                       2.000000  3.000000
10 Things I Hate About You (1999)  3.646552  3.311966
101 Dalmatians (1961)              3.791444  3.500000
101 Dalmatians (1996)              3.240000  2.911215
12 Angry Men (1957)                4.184397  4.328421


Gender,F,M,difference
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"James Dean Story, The (1957)",4.0,1.0,3.0
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",4.0,1.0,3.0
Country Life (1994),5.0,2.0,3.0
Babyfever (1994),3.666667,1.0,2.666667
"Woman of Paris, A (1923)",5.0,2.428571,2.571429
Cobra (1925),4.0,1.5,2.5
"Other Side of Sunday, The (S鴑dagsengler) (1996)",5.0,2.928571,2.071429
Theodore Rex (1995),3.0,1.0,2.0
For the Moment (1994),5.0,3.0,2.0
"Separation, The (La S閜aration) (1994)",4.0,2.0,2.0


In [7]:
#平均分较高的电影
data_mean_rating = data.pivot_table(values='Rating',index='Title',aggfunc='mean')
# data_mean_rating.head(10)
data_mean_rating.sort_values(by='Rating', ascending = False, inplace=True)
data_mean_rating.head(10)

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0
One Little Indian (1973),5.0
Smashing Time (1967),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
"Gate of Heavenly Peace, The (1995)",5.0
"Baby, The (1973)",5.0


In [8]:
#评分次数最多热门的电影
#利用数据分组操作对Title进行分组，并用size()聚合函数即可统计出每个Title出现的次数，即评分次数
data_rating_num = data.groupby('Title').size()
# data_rating_num = data.groupby(['MovieID','Title'], as_index=False)['Rating'].count()
#按照评论最多的倒序排序
data_rating_num.sort_values(ascending=False, inplace=True)
# data_rating_num.sort_values(by =['Rating'], ascending=False, inplace=True)
data_rating_num.head(10)

Title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
dtype: int64

In [9]:
#这个问题就在于：有些电影只有极少数的人（1-2人）看过，并且觉得很好看，给了很高的评分，这个时候我们去分析数据的时候，
#得到的就是这种极小众认为好看的电影。因此，我们应该对评分次数做出最小值限定，使数据更加合理：

#加入评分次数限制不同性别对电影的平均评分
#取出评分次数大于1000的电影的索引
data_gender_hot = data_gender.loc[data_rating_num[data_rating_num > 1000].index]
data_gender_hot.sort_values(by = 'difference',ascending=False)

Gender,F,M,difference
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,0.512885
Mary Poppins (1964),4.197740,3.730594,0.467147
Gone with the Wind (1939),4.269841,3.829371,0.440471
"Full Monty, The (1997)",4.113456,3.760976,0.352481
"Little Mermaid, The (1989)",3.975936,3.632375,0.343561
...,...,...,...
Predator (1987),3.299401,3.706195,-0.406793
Airplane! (1980),3.656566,4.064419,-0.407854
"South Park: Bigger, Longer and Uncut (1999)",3.422481,3.846686,-0.424206
Reservoir Dogs (1992),3.769231,4.213873,-0.444642


In [10]:
#加入评分次数限制的分析平均分高的电影
data_mean_rating_number = data_mean_rating.loc[data_rating_num[data_rating_num > 1000].index]
data_mean_rating_number.sort_values(by='Rating',ascending=False)

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
"Shawshank Redemption, The (1994)",4.554558
"Godfather, The (1972)",4.524966
"Usual Suspects, The (1995)",4.517106
Schindler's List (1993),4.510417
Raiders of the Lost Ark (1981),4.477725
...,...
"Blair Witch Project, The (1999)",3.031528
Arachnophobia (1990),3.002926
Batman Returns (1992),2.976722
"Honey, I Shrunk the Kids (1989)",2.933014
