In [346]:
import pandas as pd 
import numpy as np

# Load the raw reviews and keep only the first four columns by position
df = pd.read_csv('restaurants.csv').iloc[:, :4]
df

Unnamed: 0,Restaurant,Reviewer,Review,Rating
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5
...,...,...,...,...
9995,Chinese Pavilion,Abhishek Mahajan,Madhumathi Mahajan Well to start with nice cou...,3
9996,Chinese Pavilion,Sharad Agrawal,This place has never disappointed us.. The foo...,4.5
9997,Chinese Pavilion,Ramandeep,"Bad rating is mainly because of ""Chicken Bone ...",1.5
9998,Chinese Pavilion,Nayana Shanbhag,I personally love and prefer Chinese Food. Had...,4


In [347]:
# Missing values per column
df.isna().sum()

Restaurant     0
Reviewer      38
Review        45
Rating        38
dtype: int64

In [348]:
# Discard every row with at least one missing field, then confirm none remain
df = df.dropna(how='any')
df.isna().sum()

Restaurant    0
Reviewer      0
Review        0
Rating        0
dtype: int64

In [349]:
# Summarise column dtypes and non-null counts after dropping incomplete rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9955 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Restaurant  9955 non-null   object
 1   Reviewer    9955 non-null   object
 2   Review      9955 non-null   object
 3   Rating      9955 non-null   object
dtypes: object(4)
memory usage: 388.9+ KB


In [350]:
# Number of reviews per reviewer, least active first
# (nothing is dropped here; single-review reviewers are removed in the next cell)
df['Reviewer'].value_counts().sort_values()

Reviewer
Meha                   1
Mousumi Choudhary      1
Ram Prasad             1
Moha,ed Jalaludeen     1
Dipjyoti Deka          1
                      ..
Vedant Killa          11
Jay Mehta             11
Kiran                 12
Parijat Ray           13
Ankita                13
Name: count, Length: 7442, dtype: int64

In [351]:
# Keep only reviewers who wrote more than one review: duplicated(keep=False)
# marks every row whose Reviewer value occurs at least twice.
# The row index is then renumbered 0..n-1.
df = df[df['Reviewer'].duplicated(keep=False)].reset_index(drop=True)
# Transposed view of the filtered frame
dfp = df.T
dfp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3844,3845,3846,3847,3848,3849,3850,3851,3852,3853
Restaurant,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,Beyond Flavours,...,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion,Chinese Pavilion
Reviewer,Dileep,Pradeep Vetapalem,Shubham Jaiswal,Tony,Imteja7,Nisha Gahlawat,Ankita Sinha,Prasanna,Sugandha Saxena,Sind Hu,...,Deb,Abhishek Mandal,Foodie Cookie,Suvashri Dey,Vishesh Agarwal,Sumanth,Karthik Gandhi,Sidharth Singal,Manasi Puranik,Abhishek Mahajan
Review,Food is good.we ordered Kodi drumsticks and ba...,Food is too good. Telangana kodiak fry is must...,Came here for lunch and the food was good and ...,We had our farewell feast here recently enjoye...,The place is very good.. 5* to the live music....,Sonalin has a great voice.. 😍 must visit the p...,Sonalin is a very good singer in the city.. be...,"Wonderful place with awesome ambience, best to...",My review of the place comes after multiple vi...,The reason for giving only a 3 star is because...,...,Lust for authentic Chinese food takes me to th...,Recently visited this place. Decor is simple y...,"I have tried prawn pepper chili , Chinese pavi...","Liked the food, the Tom yum soup was okay. The...",It's very over rated on Zomato. Went to check ...,Just went due to the zomato rating. Small plac...,Chinese Cuisine including diverse styles from ...,Had dinner here yesterday. Went here without r...,Writing this right after having a very satisfy...,Madhumathi Mahajan Well to start with nice cou...
Rating,5,5,4,5,5,5,5,5,4,3,...,3.5,3,3,4,3,4,4.5,4,4.5,3


In [352]:
# Inspect the distinct Rating values before converting the column to float;
# any non-numeric entry that shows up here has to be removed first
df.Rating.value_counts()

Rating
4       1306
5       1164
3        639
1        427
2        265
4.5       22
3.5       18
2.5        9
1.5        3
Like       1
Name: count, dtype: int64

In [353]:
# Convert Rating to float and drop rows whose rating is not a number.
# pd.to_numeric(errors='coerce') turns every non-numeric entry (such as the stray
# 'Like' in the value counts) into NaN, so the cell does not depend on one
# hard-coded token and cannot crash on another unexpected string.
# Building a new frame with assign() also avoids assigning into a filtered slice,
# which can trigger pandas' SettingWithCopyWarning.
df = (
    df.assign(Rating=pd.to_numeric(df['Rating'], errors='coerce'))
      .dropna(subset=['Rating'])
      .astype({'Rating': float})
)
df['Rating'].value_counts()

Rating
4.0    1306
5.0    1164
3.0     639
1.0     427
2.0     265
4.5      22
3.5      18
2.5       9
1.5       3
Name: count, dtype: int64

In [354]:
# `display` is imported from the public IPython.display module rather than the
# internal IPython.core.display_functions path.
from IPython.display import display
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the fixed random_state makes the
# split reproducible. X_train / X_test are full review rows (they still contain
# the Rating column); y_train / y_test are the matching ratings.
X_train, X_test, y_train, y_test = train_test_split(df, df.Rating, test_size=0.25, random_state=42)

# Pivot the training reviews into a reviewer x restaurant matrix (NaN = not rated).
# Several ratings for the same (reviewer, restaurant) pair are averaged; this is
# pivot_table's default aggregation, spelled out here for clarity.
ratings = X_train.pivot_table(index=["Reviewer"], columns=["Restaurant"], values="Rating", aggfunc="mean")
# Mean of the ratings each reviewer gave in the training set (NaNs are skipped)
mean_ratings = ratings.mean(axis=1)
display(ratings.shape)
ratings.head()

(1284, 100)

Restaurant,10 Downing Street,13 Dhaba,"3B's - Buddies, Bar & Barbecue",AB's - Absolute Barbecues,Absolute Sizzlers,Al Saba Restaurant,American Wild Wings,Amul,Arena Eleven,Aromas@11SIX,...,The Tilt Bar Republic,Tiki Shack,Triptify,Udipi's Upahar,Ulavacharu,Urban Asia - Kitchen & Bar,Yum Yum Tree - The Arabian Food Court,Zega - Sheraton Hyderabad Hotel,Zing's Northeast Kitchen,eat.fit
Reviewer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABDUL MAJEED,,,,,,,,,,,...,,,,,,,,,,
AJ,,,,,,,,,,,...,,,,,,,,5.0,,
ASH&B2,,,,,,,,,,,...,,,,,,,,,,4.0
Aamir Nawaz,,,,,,,,,,,...,,,,,,4.0,,,,
Aanchal Khemka,,,,,,,,,,,...,,,,,,,,,,


In [355]:
# How often each per-reviewer mean rating occurs, rarest first
mean_ratings.value_counts().sort_values()

1.333333      1
3.125000      1
4.071429      1
4.125000      1
4.428571      1
2.400000      1
3.571429      1
2.375000      1
3.333333      1
3.777778      1
3.875000      1
3.300000      1
4.625000      1
3.312500      1
4.285714      1
3.428571      1
4.400000      1
3.900000      1
2.600000      1
3.166667      1
3.583333      2
2.833333      2
3.800000      2
3.888889      2
2.333333      2
4.166667      2
4.200000      3
4.600000      3
3.625000      3
3.833333      3
2.800000      3
1.666667      3
1.750000      3
2.250000      4
3.200000      4
3.400000      7
2.750000      7
4.750000      9
4.250000     10
3.750000     11
2.666667     11
3.250000     13
4.666667     14
1.500000     15
3.333333     22
4.333333     28
3.666667     33
2.000000     52
2.500000     52
1.000000     67
3.500000    105
4.500000    117
3.000000    157
5.000000    214
4.000000    279
Name: count, dtype: int64

In [356]:
# Round each reviewer's mean rating to the nearest whole star.
# Ties are rounded half-up (2.5 -> 3, 4.5 -> 5). The built-in round() rounds
# halves to the nearest even integer (2.5 -> 2, 3.5 -> 4, 4.5 -> 4), which treats
# the frequent x.5 means inconsistently.
mean_ratings = np.floor(mean_ratings + 0.5).astype('int64')
mean_ratings

Reviewer
ABDUL MAJEED              5
AJ                        3
ASH&B2                    3
Aamir Nawaz               3
Aanchal Khemka            4
                         ..
_ Gluttonous Hedonist     4
|| Viharika Rathode ||    3
अभिषेक कुमार              5
✔️ Sonu                   4
🍛🍲🥗                       3
Length: 1284, dtype: int64

In [357]:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
# Reviewer-to-reviewer cosine similarity on the training ratings matrix.
# Unrated restaurants are filled with 0 so every reviewer has a complete vector.
df_ratings_dummy = ratings.fillna(0)
# With a single argument, cosine_similarity compares the rows with each other
similarity_matrix = cosine_similarity(df_ratings_dummy)
# Label rows and columns with reviewer names so similarities can be looked up by name
similarity_matrix_df = pd.DataFrame(
    data=similarity_matrix,
    index=ratings.index,
    columns=ratings.index,
)


def get_all_recommendations(user_id, use_means=True, ratings_matrix=None,
                            similarity_df=None, default_rating=2.5):
    """Predict a rating for every restaurant for one reviewer.

    Each prediction is the similarity-weighted mean of the ratings given to a
    restaurant by the reviewers who rated it, where the weights are those
    reviewers' similarity scores to ``user_id``. The target reviewer's own row
    is not excluded from the average.

    Parameters
    ----------
    user_id : hashable
        Reviewer label to predict for.
    use_means : bool, default True
        Only ``True`` is implemented.
    ratings_matrix : pandas.DataFrame, optional
        Reviewer x restaurant matrix with NaN for "not rated". Defaults to the
        notebook-level ``ratings``.
    similarity_df : pandas.DataFrame, optional
        Reviewer x reviewer similarity matrix. Defaults to the notebook-level
        ``similarity_matrix_df``.
    default_rating : float, default 2.5
        Value used when the similarity weights of a restaurant's raters sum to
        zero (including the case where nobody rated it).

    Returns
    -------
    pandas.Series
        Series named ``'recommendation'`` indexed by restaurant. Empty when
        ``user_id`` is not present in the similarity matrix.

    Raises
    ------
    NotImplementedError
        If ``use_means`` is false for a known reviewer. Previously this branch
        silently returned ``None``.
    """
    # Fall back to the notebook-level objects only when no explicit input is given
    if ratings_matrix is None:
        ratings_matrix = ratings
    if similarity_df is None:
        similarity_df = similarity_matrix_df

    if user_id not in similarity_df.index:
        # Explicit dtype avoids pandas' warning about dtype-less empty Series
        return pd.Series(name='recommendation', dtype=float)

    if not use_means:
        raise NotImplementedError(
            "get_all_recommendations only implements use_means=True"
        )

    # Similarity of user_id to every reviewer, aligned to the rows of the ratings matrix
    weights = (
        similarity_df.loc[user_id]
        .reindex(ratings_matrix.index, fill_value=0.0)
        .to_numpy(dtype=float)
    )
    values = ratings_matrix.to_numpy(dtype=float)
    rated = ~np.isnan(values)

    # Per restaurant: total weight of its raters, and the weight-scaled sum of their ratings
    weight_sums = rated.T @ weights
    weighted_ratings = np.where(rated, values, 0.0).T @ weights

    # 0/0 occurs for restaurants without usable raters; those get the default instead
    with np.errstate(divide='ignore', invalid='ignore'):
        predictions = np.where(
            weight_sums != 0,
            weighted_ratings / weight_sums,
            default_rating,
        )

    return pd.Series(predictions, index=list(ratings_matrix.columns), name='recommendation')
    
def get_recommendation(user_id, movie_id, use_means=True):
    """Return the predicted rating of one restaurant for one reviewer.

    Parameters
    ----------
    user_id : hashable
        Reviewer label, passed through to ``get_all_recommendations``.
    movie_id : hashable
        Restaurant label to look up in that reviewer's predictions.
    use_means : bool, default True
        Passed through to ``get_all_recommendations``.

    Returns
    -------
    float
        The predicted rating, or 2.5 when ``movie_id`` is not among the
        predictions (for example when the reviewer is unknown and the
        predictions are empty).

    Raises
    ------
    NotImplementedError
        If ``get_all_recommendations`` produced no predictions object for the
        requested ``use_means`` value.
    """
    recommendations = get_all_recommendations(user_id, use_means=use_means)
    if recommendations is None:
        # Fail with a clear message instead of an AttributeError on None.get(...)
        raise NotImplementedError(
            "get_all_recommendations returned no predictions for use_means=%r" % (use_means,)
        )
    return recommendations.get(movie_id, 2.5)  # 2.5 is the fallback for restaurants without a prediction

def score_on_test_set():
    """Compute the RMSE of the predicted ratings on the held-out reviews.

    For every (Reviewer, Restaurant) pair in ``X_test`` the predicted rating is
    looked up and compared with the true ``Rating``. The side-by-side table of
    predicted and true ratings is printed.

    Returns
    -------
    float
        Square root of the mean squared error between true and predicted ratings.
    """
    # All restaurant predictions for a reviewer come from one call, so compute them
    # once per reviewer instead of once per test row.
    predictions_by_user = {}
    predicted = []
    for user, restaurant in zip(X_test['Reviewer'], X_test['Restaurant']):
        if user not in predictions_by_user:
            predictions_by_user[user] = get_all_recommendations(user)
        # Same fallback as get_recommendation: 2.5 when no prediction exists
        predicted.append(predictions_by_user[user].get(restaurant, 2.5))

    predicted_ratings = np.array(predicted)
    true_ratings = np.array(X_test['Rating'])
    result = pd.DataFrame({'predicted': predicted_ratings, 'true': true_ratings})
    print(result)
    score = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    return score

# Evaluate the recommender on the held-out reviews and show the resulting RMSE
test_set_score = score_on_test_set()
print(test_set_score)
# The RMSE is about 1.60 rating points. RMSE is the square root of the mean squared
# error, so large misses weigh more heavily than they would in a plain average of
# the absolute differences between predicted and true ratings.


     predicted  true
0     2.500000   5.0
1     4.000000   4.0
2     5.000000   1.0
3     2.882642   4.0
4     3.524260   5.0
..         ...   ...
959   2.500000   3.0
960   3.756887   3.0
961   4.000000   5.0
962   2.500000   5.0
963   2.000000   1.0

[964 rows x 2 columns]
1.598553365347546
