<a href="https://colab.research.google.com/github/Kushl143/feynee-labs/blob/main/Copy_of_Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: Load and Explore Dataset
# NOTE: "/content/" is an absolute path; adjust it if the CSV lives elsewhere.
df = pd.read_csv("/content/google_hotel_data_clean_v1.csv")

# First rows, to eyeball the column layout.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

# Number of missing values per column.
print(df.isnull().sum())



                                 Hotel_Name  Hotel_Rating   City  \
0                      Trident Hotel Cochin           4.4  kochi   
1  The Gateway Hotel Marine Drive Ernakulam           4.3  kochi   
2                   Ramada by Wyndham Kochi           4.5  kochi   
3                          The Renai cochin           4.2  kochi   
4                 SpringField Billets Hotel           4.2  kochi   

      Feature_1       Feature_2   Feature_3     Feature_4 Feature_5  \
0  5-star hotel  Free breakfast       Wi-Fi  Free parking      Pool   
1  5-star hotel  Free breakfast  Free Wi-Fi  Free parking      Pool   
2  5-star hotel       Breakfast  Free Wi-Fi  Free parking      Pool   
3  4-star hotel  Free breakfast  Free Wi-Fi  Free parking      Pool   
4  3-star hotel       Breakfast  Free Wi-Fi  Free parking   Kitchen   

          Feature_6             Feature_7        Feature_8     Feature_9  \
0  Air conditioning        Fitness center              Spa    Restaurant   
1  Air condi

In [None]:
# Step 2: Data Preprocessing
# The nine amenity columns that are merged into one text field below.
feature_columns = [f'Feature_{i}' for i in range(1, 10)]

# Fill missing values with "" in non-numeric columns only. A blanket
# df.fillna("") would also write empty strings into numeric columns such as
# Hotel_Rating, turning them into object dtype and breaking later numeric steps.
non_numeric_columns = df.select_dtypes(exclude='number').columns
df = df.fillna({column: "" for column in non_numeric_columns})

# Encode categorical variables (City)
le = LabelEncoder()
df['City_Encoded'] = le.fit_transform(df['City'])

# Combine hotel features into a single text column. Missing cells are blanked
# explicitly so that a feature column without any text (read as all-NaN floats)
# contributes nothing instead of the literal token "nan".
feature_values = df[feature_columns]
df['Features'] = (
    feature_values
    .where(feature_values.notna(), "")
    .astype(str)
    .agg(' '.join, axis=1)
)


In [None]:
# Step 3: Content-Based Filtering
# Vectorize each hotel's combined feature text with TF-IDF, dropping English
# stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Features'])

# Hotel-by-hotel cosine similarity; with a single argument the matrix is
# compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

def recommend_hotels(hotel_name, num_recommendations=5):
    """Return the hotels whose feature text is most similar to ``hotel_name``.

    Args:
        hotel_name: Exact value to look up in ``df['Hotel_Name']``. If several
            rows share the name, the first one is used as the query.
        num_recommendations: Maximum number of rows to return; values below
            zero are treated as zero.

    Returns:
        A slice of ``df`` ordered by decreasing cosine similarity to the query
        row (ties keep their original row order). The query row itself is never
        included. An empty ``pd.DataFrame()`` is returned, after printing a
        message, when the name is not present.
    """
    # Work with row *positions*: ``cosine_sim`` is a plain array and the result
    # is taken with ``df.iloc``, so index labels must not be used here.
    matches = np.flatnonzero((df['Hotel_Name'] == hotel_name).to_numpy())
    if matches.size == 0:
        print(f"Hotel '{hotel_name}' not found in dataset.")
        return pd.DataFrame()

    idx = matches[0]
    scores = np.asarray(cosine_sim[idx])

    # Stable sort on the negated scores = descending similarity with ties kept
    # in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly. Simply skipping the first entry is wrong
    # when another hotel has an identical feature set: it ties at the top score
    # and may be sorted ahead of the query row.
    ranked = ranked[ranked != idx]

    top = ranked[:max(num_recommendations, 0)]
    return df.iloc[top]



In [None]:
# Step 4: Collaborative Filtering (Matrix Factorization)
# Normalize Hotel Ratings (stored on df; not used by the factorization below).
scaler = StandardScaler()
df['Hotel_Rating_Scaled'] = scaler.fit_transform(df[['Hotel_Rating']]).ravel()

# Creating a user-hotel rating matrix (Simulated Data): every hotel receives
# exactly one rating from a randomly chosen user. The seed keeps it reproducible.
np.random.seed(42)
num_users = 500
df_ratings = pd.DataFrame({
    'user_id': np.random.randint(1, num_users + 1, df.shape[0]),
    'hotel_id': df.index,
    'rating': np.random.uniform(1, 5, df.shape[0])  # Simulated ratings
})

# Rows are users, columns are hotel ids (labels of df.index); unrated cells
# are filled with 0.
user_hotel_matrix = df_ratings.pivot(index='user_id', columns='hotel_id', values='rating').fillna(0)

# Ensure matrix contains only numeric values and convert to float
user_hotel_matrix = user_hotel_matrix.astype(np.float64)

# Perform Singular Value Decomposition (SVD).
# svds needs 1 <= k < min(n_users, n_hotels), so cap the rank by the smaller
# dimension rather than by the number of hotels alone.
num_factors = min(50, min(user_hotel_matrix.shape) - 1)
if num_factors < 1:
    raise ValueError(
        "Rating matrix of shape "
        f"{user_hotel_matrix.shape} is too small for a truncated SVD."
    )
U, sigma, Vt = svds(user_hotel_matrix.to_numpy(), k=num_factors)
sigma = np.diag(sigma)  # Convert singular values into diagonal matrix

# Compute predicted ratings (low-rank reconstruction of the rating matrix)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

# Convert back to DataFrame, keeping the user-id rows and hotel-id columns
predictions_df = pd.DataFrame(predicted_ratings, index=user_hotel_matrix.index, columns=user_hotel_matrix.columns)

def recommend_hotels_collaborative(user_id, num_recommendations=5):
    """Return the hotels with the highest predicted rating for ``user_id``.

    Args:
        user_id: Row label to look up in ``predictions_df``.
        num_recommendations: Number of top-scoring hotels to return.

    Returns:
        The rows of ``df`` for the best-scoring hotel ids, in decreasing order
        of predicted rating. An empty ``pd.DataFrame()`` is returned, after
        printing a message, when the user has no row in ``predictions_df``.
    """
    if user_id not in predictions_df.index:
        print(f"User ID '{user_id}' not found in dataset.")
        return pd.DataFrame()

    user_ratings = predictions_df.loc[user_id].sort_values(ascending=False)
    top_hotel_ids = user_ratings.index[:num_recommendations]

    # The prediction columns are hotel ids taken from ``df.index``, i.e. index
    # labels, so they must be resolved with .loc; .iloc is only equivalent
    # while df happens to carry a default 0..n-1 index.
    return df.loc[top_hotel_ids]

In [None]:
# Step 5: Hybrid Recommendation
def hybrid_recommendation(user_id, hotel_name, num_recommendations=5):
    """Blend content-based and collaborative recommendations.

    Both recommenders are asked for ``num_recommendations`` rows. Their results
    are interleaved (best content match, best collaborative match, second-best
    content match, ...), de-duplicated, and truncated to ``num_recommendations``
    rows. If only one recommender produces results, those are returned as-is.

    Args:
        user_id: User passed to ``recommend_hotels_collaborative``.
        hotel_name: Hotel passed to ``recommend_hotels``.
        num_recommendations: Maximum number of rows in the result.

    Returns:
        A DataFrame of recommended hotels, or an empty ``pd.DataFrame()`` (after
        printing a message) when neither recommender returns anything.
    """
    content_recs = recommend_hotels(hotel_name, num_recommendations)
    collab_recs = recommend_hotels_collaborative(user_id, num_recommendations)

    # Keep only non-empty results so an empty, column-less frame never takes
    # part in the concat.
    sources = [recs for recs in (content_recs, collab_recs) if not recs.empty]
    if not sources:
        print("No recommendations found for given input.")
        return pd.DataFrame()

    combined = pd.concat(sources)

    # Interleave by rank within each source. Appending the collaborative rows
    # after the content rows and cutting at ``num_recommendations`` would drop
    # every collaborative row whenever the content list is already full.
    within_source_rank = np.concatenate([np.arange(len(recs)) for recs in sources])
    combined = combined.iloc[np.argsort(within_source_rank, kind='stable')]

    # The same hotel can be suggested by both sources: keep its first
    # occurrence, then also drop rows that are identical in every column.
    combined = combined[~combined.index.duplicated(keep='first')]
    hybrid_recs = combined.drop_duplicates().head(num_recommendations)
    return hybrid_recs

# Example usage: "Example Hotel" is a placeholder name; substitute a value from
# df['Hotel_Name'] to get content-based matches as well.
example_recs = hybrid_recommendation(user_id=1, hotel_name="Example Hotel")
print(example_recs)


Hotel 'Example Hotel' not found in dataset.
                                             Hotel_Name  Hotel_Rating  \
891   Hotel apple grand and grand darshan restaurant...           4.4   
609                              Hotel Ruchi The Prince           4.2   
930                                      Hotel Sai Rama           4.4   
1040                        Cambay Sapphire Gandhinagar           3.4   
122                                KTDC Grand Chaithram           4.1   

             City       Feature_1       Feature_2     Feature_3     Feature_4  \
891      vadodara  Free breakfast      Free Wi-Fi  Free parking          Pool   
609        mysore    4-star hotel  Free breakfast         Wi-Fi  Free parking   
930    vijayawada  Free breakfast      Free Wi-Fi  Paid parking          Pool   
1040  gandhinagar    4-star hotel  Free breakfast    Free Wi-Fi  Free parking   
122    trivandrum    3-star hotel  Free breakfast    Free Wi-Fi  Free parking   

             Feature_5        