<a href="https://colab.research.google.com/github/Kushl143/feynee-labs/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 1: Load and Explore Dataset
# Read the hotel CSV from the Colab working directory into the notebook-wide frame `df`.
df = pd.read_csv("/content/google_hotel_data_clean_v1.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() only appended a stray "None" line.
df.info()
# Per-column count of missing values.
print(df.isnull().sum())

                                 Hotel_Name  Hotel_Rating   City  \
0                      Trident Hotel Cochin           4.4  kochi   
1  The Gateway Hotel Marine Drive Ernakulam           4.3  kochi   
2                   Ramada by Wyndham Kochi           4.5  kochi   
3                          The Renai cochin           4.2  kochi   
4                 SpringField Billets Hotel           4.2  kochi   

      Feature_1       Feature_2   Feature_3     Feature_4 Feature_5  \
0  5-star hotel  Free breakfast       Wi-Fi  Free parking      Pool   
1  5-star hotel  Free breakfast  Free Wi-Fi  Free parking      Pool   
2  5-star hotel       Breakfast  Free Wi-Fi  Free parking      Pool   
3  4-star hotel  Free breakfast  Free Wi-Fi  Free parking      Pool   
4  3-star hotel       Breakfast  Free Wi-Fi  Free parking   Kitchen   

          Feature_6             Feature_7        Feature_8     Feature_9  \
0  Air conditioning        Fitness center              Spa    Restaurant   
1  Air condi

In [4]:
# Step 2: Data Preprocessing
# The nine amenity columns that are merged into one text field below.
FEATURE_COLUMNS = [f'Feature_{i}' for i in range(1, 10)]

# Fill missing values in the non-numeric columns only. Filling the whole frame
# with "" would also write strings into numeric columns that contain gaps,
# turning them into object dtype; numeric columns therefore keep their NaN.
non_numeric_cols = df.select_dtypes(exclude='number').columns
df[non_numeric_cols] = df[non_numeric_cols].fillna("")

# Encode categorical variables (City)
le = LabelEncoder()
df['City_Encoded'] = le.fit_transform(df['City'])

# Combine hotel features into a single text column.
# The extra fillna("") covers a feature column that is entirely empty (and was
# therefore parsed as numeric), so no literal "nan" token reaches the text.
df['Features'] = df[FEATURE_COLUMNS].fillna("").astype(str).agg(' '.join, axis=1)


In [5]:
# Step 3: Content-Based Filtering
# Vectorise each hotel's combined feature text with TF-IDF, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Features'])

# Pairwise cosine similarity between all hotels; when only one matrix is given,
# its rows are compared against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)

def recommend_hotels(hotel_name, num_recommendations=5):
    """Return the hotels whose feature text is most similar to `hotel_name`.

    Reads the notebook-level `df` (hotel table) and `cosine_sim` (pairwise
    similarity matrix whose rows/columns follow the row order of `df`).

    Parameters
    ----------
    hotel_name : str
        Exact value to look up in `df['Hotel_Name']`. If several rows share
        the name, the first one is used as the query.
    num_recommendations : int, default 5
        Maximum number of hotels to return; values below zero are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Rows of `df` ordered from most to least similar, never including the
        queried row itself. An empty DataFrame (after printing a message) when
        the name is not present.
    """
    # Work with row *positions*, not index labels: both cosine_sim and
    # df.iloc are positional, so this stays correct for any index on df.
    positions = np.flatnonzero((df['Hotel_Name'] == hotel_name).to_numpy())
    if positions.size == 0:
        print(f"Hotel '{hotel_name}' not found in dataset.")
        return pd.DataFrame()

    idx = positions[0]
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending sort: hotels with equal similarity keep their row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried hotel explicitly. Skipping "the first result" is not
    # enough: another hotel with identical features ties at the top score and
    # may sort ahead of it, which would return the queried hotel itself.
    ranked = ranked[ranked != idx]

    return df.iloc[ranked[:max(num_recommendations, 0)]]
