## 美国餐厅评分数据分析

In [4]:
from numpy.random import randn
import numpy as np
np.random.seed(123)
import os
import matplotlib.pyplot as plt
import pandas as pd
# Notebook-wide display settings.
# Default figure size passed to matplotlib for every new figure.
plt.rcParams['figure.figsize'] = (10, 6)
# Show NumPy floats with 4 digits of precision when arrays are printed.
np.set_printoptions(precision=4)
# Truncate DataFrame/Series reprs to at most 10 rows.
pd.set_option('display.max_rows', 10)

In [5]:
# Location of the ratings CSV, relative to the notebook's working directory.
path = '../data/restaurant_rating_final.csv'
# Parse the file with pandas' default settings (first row is the header).
df = pd.read_csv(filepath_or_buffer=path)
# Bare last expression: show a preview of the loaded frame.
df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2
...,...,...,...,...,...
1156,U1043,132630,1,1,1
1157,U1011,132715,1,1,0
1158,U1068,132733,1,1,0
1159,U1068,132594,1,1,1


In [19]:
# Average overall rating and food rating per restaurant (one row per placeID).
mean_ratings = df.pivot_table(
    index='placeID',
    values=['rating', 'food_rating'],
    aggfunc='mean',
)
# Preview the first five restaurants.
mean_ratings.head(5)

Unnamed: 0_level_0,food_rating,rating
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,1.0,0.5
132561,1.0,0.75
132564,1.25,1.25
132572,1.0,1.0
132583,1.0,1.0


In [20]:
# Number of rating rows recorded for each restaurant.
ratings_by_place = df.groupby(by='placeID').size()
# Preview the first ten counts.
ratings_by_place.head(10)

placeID
132560     4
132561     4
132564     4
132572    15
132583     4
132584     6
132594     5
132608     6
132609     5
132613     6
dtype: int64

In [21]:
# Keep only restaurants with at least 4 ratings; the result is the index of
# placeID labels that pass the threshold.
active_place = ratings_by_place[ratings_by_place.ge(4)].index
active_place

Int64Index([132560, 132561, 132564, 132572, 132583, 132584, 132594, 132608,
            132609, 132613,
            ...
            135080, 135081, 135082, 135085, 135086, 135088, 135104, 135106,
            135108, 135109],
           dtype='int64', name='placeID', length=124)

In [22]:
# Restrict the per-restaurant means to the placeIDs in active_place
# (label-based row selection; all columns are kept).
# Note: this rebinds mean_ratings to the filtered frame.
mean_ratings = mean_ratings.loc[active_place, :]
mean_ratings

Unnamed: 0_level_0,food_rating,rating
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,1.000000,0.500000
132561,1.000000,0.750000
132564,1.250000,1.250000
132572,1.000000,1.000000
132583,1.000000,1.000000
...,...,...
135088,1.166667,1.000000
135104,1.428571,0.857143
135106,1.200000,1.200000
135108,1.181818,1.181818


In [23]:
# Rank restaurants from highest to lowest mean overall rating.
top_ratings = mean_ratings.sort_values('rating', ascending=False)
# Ten best-rated restaurants.
top_ratings.head(10)

Unnamed: 0_level_0,food_rating,rating
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132955,1.8,2.0
135034,2.0,2.0
134986,2.0,2.0
132922,1.5,1.833333
132755,2.0,1.8
135074,1.75,1.75
135013,2.0,1.75
134976,1.75,1.75
135055,1.714286,1.714286
135075,1.692308,1.692308


In [24]:
# Gap between mean overall rating and mean food rating per restaurant:
# negative means the food scored higher than the overall experience.
# Adds the column to mean_ratings in place.
mean_ratings['diff'] = mean_ratings['rating'].sub(mean_ratings['food_rating'])

In [25]:
# Order restaurants by the rating gap, most negative first
# (food rated higher than the overall rating).
sorted_by_diff = mean_ratings.sort_values('diff', ascending=True)
sorted_by_diff.head(10)

Unnamed: 0_level_0,food_rating,rating,diff
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
132667,2.0,1.25,-0.75
132594,1.2,0.6,-0.6
132858,1.4,0.8,-0.6
135104,1.428571,0.857143,-0.571429
132560,1.0,0.5,-0.5
135027,1.375,0.875,-0.5
132740,1.25,0.75,-0.5
134992,1.5,1.0,-0.5
132706,1.25,0.75,-0.5
132870,1.0,0.6,-0.4


In [26]:
# Flip the row order so the largest positive gaps come first, then show the
# top ten (restaurants whose overall rating exceeds their food rating most).
sorted_by_diff.iloc[::-1].head(10)

Unnamed: 0_level_0,food_rating,rating,diff
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
134987,0.5,1.0,0.5
132937,1.0,1.5,0.5
135066,1.0,1.5,0.5
132851,1.0,1.428571,0.428571
135049,0.6,1.0,0.4
132922,1.5,1.833333,0.333333
135030,1.333333,1.583333,0.25
135063,1.0,1.25,0.25
132626,1.0,1.25,0.25
135000,1.0,1.25,0.25


In [27]:
# Spread of the overall rating within each restaurant: standard deviation of
# 'rating' grouped by placeID, limited to the restaurants in active_place.
# The variable keeps its historical "_by_title" suffix even though the
# grouping key is placeID, so any later cell that refers to it still works.
rating_std_by_title = (
    df.groupby('placeID')['rating']
    .std()
    .loc[active_place]
)
# Ten restaurants whose ratings disagree the most (largest deviation first).
rating_std_by_title.sort_values(ascending=False).head(10)


placeID
134987    1.154701
135049    1.000000
134983    1.000000
135053    0.991031
135027    0.991031
132847    0.983192
132767    0.983192
132884    0.983192
135082    0.971825
132706    0.957427
Name: rating, dtype: float64