In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
import matplotlib
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

In [127]:
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys
  # This is added back by InteractiveShellApp.init_path()


In [3]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
users.head(10)

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [5]:
users[['user_id','gender']]

Unnamed: 0,user_id,gender
0,1,F
1,2,M
2,3,M
3,4,M
4,5,M
...,...,...
6035,6036,F
6036,6037,F
6037,6038,F
6038,6039,F


In [6]:
users.loc[:,['user_id','gender']]

Unnamed: 0,user_id,gender
0,1,F
1,2,M
2,3,M
3,4,M
4,5,M
...,...,...
6035,6036,F
6036,6037,F
6037,6038,F
6038,6039,F


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
user_id      1000209 non-null int64
movie_id     1000209 non-null int64
rating       1000209 non-null int64
timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [8]:
ratings['user_id'].unique()

array([   1,    2,    3, ..., 6038, 6039, 6040], dtype=int64)

In [9]:
ratings['user_id'].nunique()

6040

In [10]:
ratings[ratings['user_id'] == 55]

Unnamed: 0,user_id,movie_id,rating,timestamp
8103,55,589,5,977948346
8104,55,1266,3,977948346
8105,55,593,4,977939130
8106,55,678,4,977942882
8107,55,3101,3,977939597
...,...,...,...,...
8123,55,50,3,977942911
8124,55,527,5,977942911
8125,55,457,5,977948394
8126,55,3672,4,977939597


In [11]:
ratings[(ratings['user_id'] == 55) & (ratings['rating'] >3)]

Unnamed: 0,user_id,movie_id,rating,timestamp
8103,55,589,5,977948346
8105,55,593,4,977939130
8106,55,678,4,977942882
8108,55,3114,5,977943112
8109,55,912,4,977939165
8110,55,1704,5,977943112
8112,55,3250,4,977939083
8113,55,110,5,977943155
8114,55,150,4,977948415
8115,55,1784,4,977948394


In [12]:
ratings[(ratings['user_id'] == 55) | (ratings['rating'] >3)]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
3,1,3408,4,978300275
4,1,2355,5,978824291
6,1,1287,5,978302039
7,1,2804,5,978300719
...,...,...,...,...
1000202,6040,1089,4,956704996
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [13]:
users[users['user_id'].notna()]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [14]:
users[users['user_id'].notnull()]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [19]:
ratings.groupby('movie_id')['rating'].count()

movie_id
1       2077
2        701
3        478
4        170
5        296
        ... 
3948     862
3949     304
3950      54
3951      40
3952     388
Name: rating, Length: 3706, dtype: int64

In [20]:
ratings.groupby('movie_id')['rating'].size()

movie_id
1       2077
2        701
3        478
4        170
5        296
        ... 
3948     862
3949     304
3950      54
3951      40
3952     388
Name: rating, Length: 3706, dtype: int64

In [21]:
ratings.groupby('movie_id')['rating'].nunique()

movie_id
1       5
2       5
3       5
4       5
5       5
       ..
3948    5
3949    5
3950    5
3951    5
3952    5
Name: rating, Length: 3706, dtype: int64

In [37]:
ratings.groupby('user_id').agg({'movie_id':np.size , 'rating':np.mean})

Unnamed: 0_level_0,movie_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,53,4.188679
2,129,3.713178
3,51,3.901961
4,21,4.190476
5,198,3.146465
...,...,...
6036,888,3.302928
6037,202,3.717822
6038,20,3.800000
6039,123,3.878049


Unnamed: 0_level_0,movie_id_user,rating_avg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,53,4.188679
2,129,3.713178
3,51,3.901961
4,21,4.190476
5,198,3.146465
...,...,...
6036,888,3.302928
6037,202,3.717822
6038,20,3.800000
6039,123,3.878049


In [41]:
rat_user = pd.merge(ratings, users)
rat_user

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106
1000205,6040,1094,5,956704887,M,25,6,11106
1000206,6040,562,5,956704746,M,25,6,11106
1000207,6040,1096,4,956715648,M,25,6,11106


In [58]:
movies_02 = movies.head(3)
movies_02.loc[1,'title'] = 'oppo'
movies_02

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,oppo,Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [59]:
movies_new = pd.concat([movies,movies_02])
movies_new

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,oppo,Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,oppo,Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,oppo,Adventure|Children's|Fantasy


In [60]:
movies_n = pd.concat([movies,movies_02]).drop_duplicates()
movies_n

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,oppo,Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,oppo,Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [68]:
ratings.groupby('user_id').agg({'movie_id':np.size , 'rating':np.mean}).sort_values(by='rating',ascending=False)

Unnamed: 0_level_0,movie_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
283,27,4.962963
2339,23,4.956522
3324,21,4.904762
3902,165,4.890909
446,51,4.843137
...,...,...
5850,58,1.844828
4539,119,1.815126
2744,138,1.304348
4486,51,1.058824


In [72]:
ratings_df = ratings.groupby('user_id').agg({'movie_id':np.size , 'rating':np.mean})
ratings_df.rename(columns={'movie_id':'movie_id_user' , 'rating':'rating_avg'},inplace=True)
ratings_df.sort_values(by=['user_id','movie_id_user'], ascending=[True,False])

Unnamed: 0_level_0,movie_id_user,rating_avg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,53,4.188679
2,129,3.713178
3,51,3.901961
4,21,4.190476
5,198,3.146465
...,...,...
6036,888,3.302928
6037,202,3.717822
6038,20,3.800000
6039,123,3.878049


In [93]:
users.loc[users['age']>50, 'zip']=555

In [102]:
users_new = users[users['age']!=35]
users_new

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,555
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,555
6038,6039,F,45,0,01060


In [103]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [104]:
movies.drop('title',inplace=True,axis=1)

In [105]:
movies

Unnamed: 0,movie_id,genres
0,1,Animation|Children's|Comedy
1,2,Adventure|Children's|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama
4,5,Comedy
...,...,...
3878,3948,Comedy
3879,3949,Drama
3880,3950,Drama
3881,3951,Drama


In [107]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [120]:
users['hand'] =5
users

Unnamed: 0,user_id,gender,age,occupation,zip,hand
0,1,F,1,10,48067,5
1,2,M,56,16,70072,5
2,3,M,25,15,55117,5
3,4,M,45,7,02460,5
4,5,M,25,20,55455,5
...,...,...,...,...,...,...
6035,6036,F,25,15,32603,5
6036,6037,F,45,1,76006,5
6037,6038,F,56,1,14706,5
6038,6039,F,45,0,01060,5


In [117]:
users.drop(['hand','kk'],inplace=True,axis=1)
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [119]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [135]:
movies = movies.append([{'movie_id':107,'title':'bull','genres':'drama'}],ignore_index=True)
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller
3883,103,bull,drama
3884,105,bull,drama


In [136]:
users

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060
