# Import Required Packages

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Step 1: Data Collection

#### Load and inspect the data

In [22]:
import pandas as pd

# Load the dataset
url = 'booking_reviews_new2.csv'
data = pd.read_csv(url)  # Load

print("Columns in the dataset:")
print(data.columns)

# Display the first few rows of the DataFrame
print("\nFirst few rows of the dataset:")
print(data.head())

Columns in the dataset:
Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng'],
      dtype='object')

First few rows of the dataset:
                                       Hotel_Address  \
0   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
1   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
2   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
3   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
4   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   

   Additional_Number_of_Scoring Review_Date  Average_Score   Hotel_Name  \
0                           194    8/3/2017            7.7  Hotel Arena   
1                   

# Step 2: Data Cleaning and Preprocessing

In [23]:
# Remove duplicates
data.drop_duplicates(inplace=True)

# Handle missing values
data.fillna(method='ffill', inplace=True)

# Verify the data after cleaning
print("\nData after cleaning:")
print(data.info())


Data after cleaning:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Hotel_Address                               20000 non-null  object 
 1   Additional_Number_of_Scoring                20000 non-null  int64  
 2   Review_Date                                 20000 non-null  object 
 3   Average_Score                               20000 non-null  float64
 4   Hotel_Name                                  20000 non-null  object 
 5   Reviewer_Nationality                        20000 non-null  object 
 6   Negative_Review                             20000 non-null  object 
 7   Review_Total_Negative_Word_Counts           20000 non-null  int64  
 8   Total_Number_of_Reviews                     20000 non-null  int64  
 9   Positive_Review                             20000 non-null  o

In [24]:
data.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


In [25]:
data.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


# Step 3: Exploratory Data Analysis (EDA)

In [None]:
rating_column = 'Reviewer_Score'  # Using the Reviewer_Score column for ratings
if rating_column in data.columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(data[rating_column])
    plt.title('Distribution of Ratings')
    plt.show()
else:
    print(f"Column '{rating_column}' not found. Please check the column names.")

# Step 4: Split the Dataset into Training and Testing Sets

In [26]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (14000, 17)
Testing data shape: (6000, 17)


# Step 5: Create User-Item Matrix for Training Data

In [27]:
# Create user-item matrix
rating_column = 'Reviewer_Score'
user_item_matrix_train = train_data.pivot_table(index='Reviewer_Nationality', columns='Hotel_Name', values=rating_column).fillna(0)

# Step 6: Model Building

## 6.1 - Collaborative Filtering
### User-Based Collaborative Filtering

In [28]:
user_similarity_train = cosine_similarity(user_item_matrix_train)
user_similarity_df_train = pd.DataFrame(user_similarity_train, index=user_item_matrix_train.index, columns=user_item_matrix_train.index)

### Item-Based Collaborative Filtering

In [29]:
# Compute cosine similarity between hotels
item_similarity_train = cosine_similarity(user_item_matrix_train.T)
item_similarity_df_train = pd.DataFrame(item_similarity_train, index=user_item_matrix_train.columns, columns=user_item_matrix_train.columns)

## 6.2 - Content-Based Filtering
Explanation

TF-IDF Vectorizer: We use TfidfVectorizer to transform the combined features into a TF-IDF matrix.
Content-Based Filtering using sparse matrices

In [30]:
train_data['combined_features'] = train_data['Hotel_Name'] + ' ' + train_data['Positive_Review'] + ' ' + train_data['Negative_Review']
tfidf = TfidfVectorizer()
tfidf_matrix_train = tfidf.fit_transform(train_data['combined_features'])

# Apply Truncated SVD to reduce dimensionality
n_components = 100  # Number of components to keep
svd = TruncatedSVD(n_components=n_components, random_state=42)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix_train)

# Compute cosine similarity on the reduced matrix
content_similarity_train = cosine_similarity(tfidf_matrix_reduced)
content_similarity_df_train = pd.DataFrame(content_similarity_train, index=train_data['Hotel_Name'], columns=train_data['Hotel_Name'])

# Step 8: Define Recommendation Functions

In [31]:
content_similarity_df_train.head()
content_similarity_df_train['Intercontinental London The O2']

Hotel_Name,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,...,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2
Hotel_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Intercontinental London The O2,1.000000,0.223162,0.273673,0.283434,0.194094,0.214896,0.310097,0.342260,0.301569,0.373092,...,0.325280,0.451244,0.391160,0.474303,0.452510,0.442565,0.296178,0.245012,0.486070,0.344119
Intercontinental London The O2,0.223162,1.000000,0.238252,0.324616,0.386446,0.314691,0.421146,0.391855,0.568694,0.347597,...,0.322071,0.402310,0.260237,0.213123,0.241798,0.466113,0.229437,0.213489,0.317070,0.344068
Milestone Hotel Kensington,0.207167,-0.028621,-0.037751,0.010865,-0.024517,0.012095,0.007294,-0.030223,0.041602,0.023554,...,-0.003229,0.194613,0.193976,0.230604,0.151039,-0.001954,0.055486,0.000576,0.237939,0.040933
Grange St Paul s Hotel,0.124650,0.089079,0.206826,0.069226,0.042931,0.072920,0.040560,0.162481,0.222063,0.030465,...,0.085228,0.109301,0.059157,-0.000323,0.154511,-0.016934,0.083089,0.009593,0.020244,0.123525
Gardette Park Hotel,0.162838,0.312764,0.188353,0.211648,0.198372,0.125621,0.183054,0.494565,0.327444,0.102393,...,0.217069,0.289707,0.149130,0.064451,0.322729,0.150532,0.189450,0.148821,0.118372,0.189903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Milestone Hotel Kensington,0.293510,0.228885,0.041526,0.194361,0.164565,0.072158,0.209647,0.059746,0.295504,0.301167,...,0.250944,0.162077,0.286754,0.015590,0.162969,0.088938,0.259886,0.144299,0.079480,0.451335
Crowne Plaza London Kings Cross,0.090341,0.239292,0.052454,0.097429,0.394511,0.172782,0.275486,0.129339,0.279025,0.083590,...,0.143187,0.086577,0.208051,0.069809,0.072411,0.107001,0.051405,0.276198,0.091051,0.138898
Park Plaza County Hall London,0.199586,0.398566,0.141324,0.309574,0.302122,0.296612,0.177892,0.286142,0.497448,0.229993,...,0.420209,0.280803,0.237532,0.184225,0.179023,0.258079,0.248810,0.111180,0.140618,0.376091
K K Hotel George,0.120855,0.052994,0.076056,0.078357,0.111070,0.070371,0.049932,0.225757,0.160272,0.052168,...,0.154204,0.095004,0.087811,0.121557,0.307807,0.084296,0.120290,0.073407,0.118237,0.077292


In [32]:
content_similarity_df_train.loc[:,'Intercontinental London The O2']

Hotel_Name,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,...,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2,Intercontinental London The O2
Hotel_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Intercontinental London The O2,1.000000,0.223162,0.273673,0.283434,0.194094,0.214896,0.310097,0.342260,0.301569,0.373092,...,0.325280,0.451244,0.391160,0.474303,0.452510,0.442565,0.296178,0.245012,0.486070,0.344119
Intercontinental London The O2,0.223162,1.000000,0.238252,0.324616,0.386446,0.314691,0.421146,0.391855,0.568694,0.347597,...,0.322071,0.402310,0.260237,0.213123,0.241798,0.466113,0.229437,0.213489,0.317070,0.344068
Milestone Hotel Kensington,0.207167,-0.028621,-0.037751,0.010865,-0.024517,0.012095,0.007294,-0.030223,0.041602,0.023554,...,-0.003229,0.194613,0.193976,0.230604,0.151039,-0.001954,0.055486,0.000576,0.237939,0.040933
Grange St Paul s Hotel,0.124650,0.089079,0.206826,0.069226,0.042931,0.072920,0.040560,0.162481,0.222063,0.030465,...,0.085228,0.109301,0.059157,-0.000323,0.154511,-0.016934,0.083089,0.009593,0.020244,0.123525
Gardette Park Hotel,0.162838,0.312764,0.188353,0.211648,0.198372,0.125621,0.183054,0.494565,0.327444,0.102393,...,0.217069,0.289707,0.149130,0.064451,0.322729,0.150532,0.189450,0.148821,0.118372,0.189903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Milestone Hotel Kensington,0.293510,0.228885,0.041526,0.194361,0.164565,0.072158,0.209647,0.059746,0.295504,0.301167,...,0.250944,0.162077,0.286754,0.015590,0.162969,0.088938,0.259886,0.144299,0.079480,0.451335
Crowne Plaza London Kings Cross,0.090341,0.239292,0.052454,0.097429,0.394511,0.172782,0.275486,0.129339,0.279025,0.083590,...,0.143187,0.086577,0.208051,0.069809,0.072411,0.107001,0.051405,0.276198,0.091051,0.138898
Park Plaza County Hall London,0.199586,0.398566,0.141324,0.309574,0.302122,0.296612,0.177892,0.286142,0.497448,0.229993,...,0.420209,0.280803,0.237532,0.184225,0.179023,0.258079,0.248810,0.111180,0.140618,0.376091
K K Hotel George,0.120855,0.052994,0.076056,0.078357,0.111070,0.070371,0.049932,0.225757,0.160272,0.052168,...,0.154204,0.095004,0.087811,0.121557,0.307807,0.084296,0.120290,0.073407,0.118237,0.077292


In [33]:
similarity_series = content_similarity_df_train['Intercontinental London The O2']
# Exclude the hotel itself from the list
similarity_series = similarity_series[similarity_series.index != 'Intercontinental London The O2']
# Sort the similarity scores in descending order
similar_items = similarity_series.loc[:,'Intercontinental London The O2'].iloc[:,1].sort_values(ascending=False).head(10).index
similar_items
        

Index(['Haymarket Hotel', 'The Park Grand London Paddington',
       'Haymarket Hotel', 'H tel Juliana Paris',
       'Park Plaza County Hall London', 'The Principal London',
       'The Park Grand London Paddington', 'Park Plaza County Hall London',
       'Park Plaza County Hall London', 'Gardette Park Hotel'],
      dtype='object', name='Hotel_Name')

In [34]:
def get_user_based_recommendations(user_id, top_n=10):
    similar_users = user_similarity_df_train.loc[user_id].sort_values(ascending=False).head(top_n+1).index[1:]
    recommendations = user_item_matrix_train.loc[similar_users].mean().sort_values(ascending=False).head(top_n)
    return list(recommendations.items())

def get_item_based_recommendations(hotel_name, top_n=10):
    similar_items = item_similarity_df_train.loc[hotel_name].sort_values(ascending=False).head(top_n+1).index[1:]
    recommendations = user_item_matrix_train[similar_items].mean().sort_values(ascending=False).head(top_n)
    return list(recommendations.items())
    
def get_content_based_recommendations(hotel_name, top_n=10):
    if hotel_name in content_similarity_df_train.index:
        similarity_series = content_similarity_df_train[hotel_name]
        # Exclude the hotel itself from the list
        similarity_series = similarity_series[similarity_series.index != hotel_name]
        # Sort the similarity scores in descending order
        similar_items = similarity_series.loc[:,hotel_name].iloc[:,1].sort_values(ascending=False).head(10).index
        l = []
        for i in similar_items:
            if i != hotel_name:
                l.append(i)
        
        recommendations = user_item_matrix_train[l].mean().sort_values(ascending=False).head(top_n)
        recommendations.items()
        return list(recommendations.items())
    else:
        return []

# Step 9: Evaluate the Models

#### Define Evaluation Function

In [35]:
def evaluate_model(model_func, id, ground_truth_ratings, top_n=10):
    recommendations = model_func(id, top_n)
    predicted_ratings = [rating for _, rating in recommendations]
    mse = mean_squared_error(ground_truth_ratings, predicted_ratings)
    return mse

#### Define Ground Truth Ratings

In [36]:
# Define ground truth ratings for evaluation (example values)
ground_truth_ratings = [4.5, 4.0, 4.5, 3.0, 5.0, 4.0, 4.0, 4.5, 4.0, 4.5]

### Evaluate Models

In [37]:
get_content_based_recommendations('Intercontinental London The O2',10)

[('Park Plaza County Hall London', 4.654404269072611),
 ('Park Plaza County Hall London', 4.654404269072611),
 ('Park Plaza County Hall London', 4.654404269072611),
 ('The Park Grand London Paddington', 4.482081125087193),
 ('The Park Grand London Paddington', 4.482081125087193),
 ('The Principal London', 3.288508402571927),
 ('H tel Juliana Paris', 2.7591214896214895),
 ('Gardette Park Hotel', 2.3246346153846154),
 ('Haymarket Hotel', 1.4755850340136056),
 ('Haymarket Hotel', 1.4755850340136056)]

In [38]:
actual_user_id = "Kuwait"
actual_hotel_name = "Park Plaza County Hall London"

for user_id in test_data['Reviewer_Nationality']:
    if user_id in user_item_matrix_train.index:
        actual_user_id = user_id
        break

for hotel_name in test_data['Hotel_Name']:
    if hotel_name in user_item_matrix_train.columns:
        actual_hotel_name = hotel_name
        break

if actual_user_id is None or actual_hotel_name is None:
    raise ValueError("Could not find a common user_id and hotel_name in both training and testing sets.")

# Evaluate User-Based Collaborative Filtering
user_based_mse = evaluate_model(get_user_based_recommendations, actual_user_id, ground_truth_ratings)

# Evaluate Item-Based Collaborative Filtering
item_based_mse = evaluate_model(get_item_based_recommendations, actual_hotel_name, ground_truth_ratings)

# Evaluate Content-Based Filtering
content_based_mse = evaluate_model(get_content_based_recommendations, actual_hotel_name, ground_truth_ratings)

#### Print the results

In [39]:
print(f'User-Based Collaborative Filtering MSE: {user_based_mse}')
print(f'Item-Based Collaborative Filtering MSE: {item_based_mse}')
print(f'Content-Based Filtering MSE: {content_based_mse}')

User-Based Collaborative Filtering MSE: 21.68914687911704
Item-Based Collaborative Filtering MSE: 1.3433836425885466
Content-Based Filtering MSE: 2.0168351059445055


In [40]:
import pickle

# Assume these variables have been defined and trained in the notebook
# user_similarity_df_train, item_similarity_df_train, content_similarity_df_train, user_item_matrix_train

# Save models and matrices
with open('user_similarity_df_train.pkl', 'wb') as f:
    pickle.dump(user_similarity_df_train, f)
with open('item_similarity_df_train.pkl', 'wb') as f:
    pickle.dump(item_similarity_df_train, f)
with open('content_similarity_df_train.pkl', 'wb') as f:
    pickle.dump(content_similarity_df_train, f)
with open('user_item_matrix_train.pkl', 'wb') as f:
    pickle.dump(user_item_matrix_train, f)