In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import json

# Read the merged hotel sheet and report its size and column labels
df = pd.read_csv('final_merged_sheet.csv')
print("Initial shape:", df.shape)
print("Initial columns:", df.columns.tolist())

# --- 1. DROP UNNECESSARY COLUMNS ---
# Contact/booking metadata, plus the free-text distance column
# (the numeric 'Distance_to_Landmark(in_km)' column is kept).
cols_to_drop = [
    'check_in_time',
    'check_out_time',
    'phone_number',
    'email',
    'website',
    'booking_url',
    'data_source',
    'address',
    'Distance_to_Landmark',
]
# errors='ignore' skips any listed label that is not present in the sheet
df_clean = df.drop(columns=cols_to_drop, errors='ignore')
print(f"After dropping columns: {df_clean.shape}")

# --- 2. CLEAN CITY AND STATE COLUMNS ---
def _blank_out_bad_place(value):
    """Return 'Unknown' for values that look numeric or exceed 20 characters.

    A value counts as numeric when only digits remain after removing every
    '.' and '-' (e.g. a coordinate that ended up in a place-name column).
    Non-string values are passed through unchanged.
    """
    if not isinstance(value, str):
        return value
    stripped = value.replace('.', '').replace('-', '')
    if stripped.isdigit() or len(value) > 20:
        return 'Unknown'
    return value


for place_col in ('city', 'state'):
    # Missing -> 'Unknown', then force text so the check above sees strings only
    place_text = df_clean[place_col].fillna('Unknown').astype(str)
    df_clean[place_col] = place_text.apply(_blank_out_bad_place)

# --- 3. HANDLE MISSING VALUES ---

# A. Numerical columns: replace gaps with each column's own median
numerical_cols = ['rating', 'review_count', 'price_per_night', 'tax', 'guest_rating']
for num_col in (name for name in numerical_cols if name in df_clean.columns):
    col_median = df_clean[num_col].median()
    df_clean[num_col] = df_clean[num_col].fillna(col_median)


# B. Star rating: -1 marks a missing rating; a text label is derived alongside it
def _star_label(stars):
    """Map the -1 sentinel to 'Unknown', anything else to its integer part as text."""
    if stars == -1:
        return 'Unknown'
    return str(int(stars))


df_clean['star_rating'] = df_clean['star_rating'].fillna(-1)
df_clean['star_rating_clean'] = df_clean['star_rating'].apply(_star_label)

# C. Categorical columns: missing becomes the literal 'Unknown'
categorical_cols = ['rating_description', 'nearest_landmark', 'price_range',
                    'location', 'hotel_type', 'room_types']
for cat_col in (name for name in categorical_cols if name in df_clean.columns):
    df_clean[cat_col] = df_clean[cat_col].fillna('Unknown')

# D. Numeric distance column: -1 marks an unknown distance
distance_col = 'Distance_to_Landmark(in_km)'
if distance_col in df_clean.columns:
    df_clean[distance_col] = df_clean[distance_col].fillna(-1)

# E. Coordinates - flag rows that have both values, then either drop the
#    columns (more than 50% missing in either one) or fill gaps with the median
if {'latitude', 'longitude'}.issubset(df_clean.columns):
    lat_is_missing = df_clean['latitude'].isna()
    lon_is_missing = df_clean['longitude'].isna()

    # 1 only when neither coordinate is missing
    df_clean['has_coordinates'] = (~(lat_is_missing | lon_is_missing)).astype(int)

    # Share of rows with a missing value, per coordinate
    lat_missing_pct = lat_is_missing.mean()
    lon_missing_pct = lon_is_missing.mean()

    too_sparse = lat_missing_pct > 0.5 or lon_missing_pct > 0.5
    if too_sparse:
        print(f"Dropping coordinates due to high missingness: lat={lat_missing_pct:.1%}, lon={lon_missing_pct:.1%}")
        df_clean = df_clean.drop(columns=['latitude', 'longitude'])
    else:
        # Keep the columns and fill each gap with that column's median
        for coord_col in ('latitude', 'longitude'):
            df_clean[coord_col] = df_clean[coord_col].fillna(df_clean[coord_col].median())

# F. Handle amenities - automated extraction
def extract_all_amenities(amenities_str):
    """Split a comma-separated amenities value into a list of cleaned names.

    Args:
        amenities_str: Raw cell value. A missing value (as judged by
            ``pd.isna``) yields an empty list; anything else is converted
            with ``str`` before being split on commas.

    Returns:
        list[str]: Amenity names in their original order with surrounding
        whitespace removed. Blank tokens (from an empty or whitespace-only
        value, a trailing comma, or consecutive commas) are skipped so they
        are never counted as an amenity.
    """
    if pd.isna(amenities_str):
        return []

    tokens = (token.strip() for token in str(amenities_str).split(','))
    # Dropping empty tokens also covers the '' input case: ''.split(',') == ['']
    return [token for token in tokens if token]

if 'amenities' in df_clean.columns:
    from collections import Counter

    df_clean['amenities'] = df_clean['amenities'].fillna('')

    # One list of amenity names per row
    all_amenities_lists = df_clean['amenities'].apply(extract_all_amenities)

    # Flatten every row's list into one sequence and tally each name
    all_individual_amenities = [
        name for row_amenities in all_amenities_lists for name in row_amenities
    ]
    amenity_counter = Counter(all_individual_amenities)

    # "Common" means the amenity was seen at least 10 times overall
    common_amenities = [
        name for name, seen in amenity_counter.items() if seen >= 10
    ]

    print(f"Found {len(common_amenities)} common amenities (appearing ≥10 times):")
    for amenity in common_amenities:
        print(f"  {amenity}: {amenity_counter[amenity]} occurrences")

    # One 0/1 indicator column per common amenity
    for amenity in common_amenities:
        # Lower-case the name and turn spaces, slashes and hyphens into underscores
        col_name = f"amenity_{amenity.lower().replace(' ', '_').replace('/', '_').replace('-', '_')}"
        col_name = col_name.replace('__', '_').strip('_')

        df_clean[col_name] = all_amenities_lists.apply(
            lambda row_amenities: int(amenity in row_amenities)
        )

    # Number of amenities listed for each row
    df_clean['total_amenities_count'] = all_amenities_lists.map(len)

    # The raw text column is no longer needed
    df_clean = df_clean.drop(columns=['amenities'])

# G. Description - removed entirely (TF-IDF on it gave unwanted results)
df_clean = df_clean.drop(columns=['description'], errors='ignore')

# H. Data quality score - when present, fill its gaps with the column median
if 'data_quality_score' in df_clean.columns:
    quality_median = df_clean['data_quality_score'].median()
    df_clean['data_quality_score'] = df_clean['data_quality_score'].fillna(quality_median)

# --- 4. ENCODE CATEGORICAL VARIABLES (SMART APPROACH) ---

# For high cardinality categoricals like city/state, use label encoding
# For low cardinality ones, use one-hot encoding
#
# NOTE: everything below, up to the closing triple quote, is a single string
# literal used as an expression statement, so none of it runs: no *_encoded or
# one-hot columns are created and `encoding_mappings` is never defined here.
# The opening delimiter is written with four quote characters, which makes the
# first character of the string a literal double quote.

""""high_cardinality_cols = ['city', 'state', 'hotel_name', 'location', 'nearest_landmark']
low_cardinality_cols = ['rating_description', 'price_range','hotel_type', 'room_types', 'star_rating_clean']

# Store all encoding mappings
encoding_mappings = {}

# Label encoding for high cardinality
label_encoders = {}
for col in high_cardinality_cols:
    if col in df_clean.columns:
        le = LabelEncoder()
        df_clean[f'{col}_encoded'] = le.fit_transform(df_clean[col].astype(str))
        label_encoders[col] = le
        
        # Store the mapping
        encoding_mappings[col] = {
            'encoding_type': 'label_encoding',
            'mapping': {int(i): str(value) for i, value in enumerate(le.classes_)}
        }

# One-hot encoding for low cardinality (limit categories)
one_hot_mappings = {}
for col in low_cardinality_cols:
    if col in df_clean.columns:
        # Limit to top categories to avoid explosion
        value_counts = df_clean[col].value_counts()
        top_categories = value_counts[value_counts > 10].index  # Only keep categories with more than 10 occurrences
        
        if len(top_categories) > 0:
            df_clean[col] = df_clean[col].apply(
                lambda x: x if x in top_categories else 'Other'
            )
            
            # Store the one-hot mapping
            one_hot_mappings[col] = {
                'encoding_type': 'one_hot_encoding',
                'categories': list(df_clean[col].unique())
            }
            
            # One-hot encode
            dummies = pd.get_dummies(df_clean[col], prefix=col)
            df_clean = pd.concat([df_clean, dummies], axis=1)

# Add one-hot mappings to the main encoding mappings
encoding_mappings.update(one_hot_mappings)

# Drop original categorical columns (keep the encoded versions)
categorical_to_drop = high_cardinality_cols + low_cardinality_cols + ['star_rating']
df_clean = df_clean.drop(columns=[col for col in categorical_to_drop if col in df_clean.columns])"""

# --- 5. FINAL CLEANUP (NO SCALING - as requested) ---
# Keep only the columns whose nunique() is above 1, i.e. drop constant columns
df_clean = df_clean.loc[:, df_clean.nunique() > 1]

print(f"Final dataset shape: {df_clean.shape}")
print(f"Final columns: {len(df_clean.columns)}")

# Summary of the cleaned dataset
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_clean.info()

# Only columns that still contain missing values are listed
print("\nMissing values per column:")
missing_counts = df_clean.isnull().sum()
print(missing_counts[missing_counts > 0])

# How many columns there are of each dtype
print("\nData types:")
print(df_clean.dtypes.value_counts())


  from scipy.sparse import csr_matrix, issparse


Initial shape: (4632, 29)
Initial columns: ['hotel_name', 'rating', 'rating_description', 'review_count', 'star_rating', 'location', 'nearest_landmark', 'Distance_to_Landmark', 'Distance_to_Landmark(in_km)', 'price_per_night', 'tax', 'city', 'state', 'address', 'price_range', 'latitude', 'longitude', 'amenities', 'hotel_type', 'phone_number', 'email', 'website', 'booking_url', 'description', 'check_in_time', 'check_out_time', 'room_types', 'data_source', 'data_quality_score']
After dropping columns: (4632, 20)
Found 19 common amenities (appearing ≥10 times):
  Free WiFi: 3943 occurrences
  Air Conditioning: 3924 occurrences
  24/7 Front Desk: 3794 occurrences
  Spa: 498 occurrences
  Pool: 453 occurrences
  Gym: 660 occurrences
  Restaurant: 673 occurrences
  Room Service: 404 occurrences
  Business Center: 141 occurrences
  Meeting Rooms: 145 occurrences
  Bar: 65 occurrences
  Swimming Pool: 53 occurrences
  Parking: 58 occurrences
  Travel Desk: 56 occurrences
  Airport Shuttle: 110

In [3]:
# Give every row a unique hotel ID: a seeded random permutation of 1..len(df_clean)
np.random.seed(42)  # For reproducibility
random_ids = np.random.permutation(len(df_clean)) + 1

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Remove any previous hotel_id first so the
# cell can be re-executed; the fixed seed regenerates the same IDs.
if 'hotel_id' in df_clean.columns:
    df_clean = df_clean.drop(columns=['hotel_id'])

# Place hotel_id as the second column
df_clean.insert(1, 'hotel_id', random_ids)

In [4]:
# Display the column labels of df_clean as this cell's output
df_clean.columns

Index(['hotel_name', 'hotel_id', 'rating', 'rating_description',
       'review_count', 'star_rating', 'location', 'nearest_landmark',
       'Distance_to_Landmark(in_km)', 'price_per_night', 'tax', 'city',
       'state', 'price_range', 'latitude', 'longitude', 'hotel_type',
       'room_types', 'data_quality_score', 'star_rating_clean',
       'has_coordinates', 'amenity_free_wifi', 'amenity_air_conditioning',
       'amenity_24_7_front_desk', 'amenity_spa', 'amenity_pool', 'amenity_gym',
       'amenity_restaurant', 'amenity_room_service', 'amenity_business_center',
       'amenity_meeting_rooms', 'amenity_bar', 'amenity_swimming_pool',
       'amenity_parking', 'amenity_travel_desk', 'amenity_airport_shuttle',
       'amenity_conference_hall', 'amenity_laundry_service',
       'amenity_valet_parking', 'amenity_concierge', 'total_amenities_count'],
      dtype='object')

In [5]:
# Remove two of the generated amenity indicator columns.
# errors="ignore" keeps the cell re-runnable: without it a second run (or a run
# where these columns were never created) raises KeyError.
df_clean = df_clean.drop(
    columns=["amenity_laundry_service", "amenity_concierge"],
    errors="ignore",
)

In [6]:
# Write the cleaned frame to CSV (row index omitted), then confirm on stdout
df_clean.to_csv(
    path_or_buf='preprocessed_hotel_data_final_new.csv',
    index=False,
)
print("\nPreprocessed data saved as 'preprocessed_hotel_data_final_new.csv'")


Preprocessed data saved as 'preprocessed_hotel_data_final_new.csv'


In [6]:
""""
# Save encoding mappings to a separate JSON file
with open('encoding_mappings.json', 'w') as f:
    json.dump(encoding_mappings, f, indent=2)
print("Encoding mappings saved as 'encoding_mappings.json'")

# Show sample of the processed data
print(f"\nSample of processed data:")
print(df_clean.head())

# Show encoding mappings for reference
print("\nEncoding mappings:")
for col, mapping_info in encoding_mappings.items():
    print(f"\n{col} encoding ({mapping_info['encoding_type']}):")
    if mapping_info['encoding_type'] == 'label_encoding':
        for code, value in list(mapping_info['mapping'].items())[:10]:  # Show first 10
            print(f"  {code}: {value}")
        if len(mapping_info['mapping']) > 10:
            print(f"  ... and {len(mapping_info['mapping']) - 10} more")
    else:  # one_hot_encoding
        print(f"  Categories: {', '.join(mapping_info['categories'][:10])}")
        if len(mapping_info['categories']) > 10:
            print(f"  ... and {len(mapping_info['categories']) - 10} more")

# Also create a CSV version of the mappings for easier reading
mappings_df_data = []
for col, mapping_info in encoding_mappings.items():
    if mapping_info['encoding_type'] == 'label_encoding':
        for code, value in mapping_info['mapping'].items():
            mappings_df_data.append({
                'column': col,
                'encoding_type': 'label_encoding',
                'encoded_value': code,
                'original_value': value
            })
    else:  # one_hot_encoding
        for category in mapping_info['categories']:
            mappings_df_data.append({
                'column': col,
                'encoding_type': 'one_hot_encoding',
                'encoded_value': f"{col}_{category}",
                'original_value': category
            })

mappings_df = pd.DataFrame(mappings_df_data)
mappings_df.to_csv('encoding_mappings_detailed.csv', index=False)
print("Detailed encoding mappings saved as 'encoding_mappings_detailed.csv'")
""""

SyntaxError: unterminated string literal (detected at line 48) (1893385018.py, line 48)

In [7]:
# Display the column labels of df_clean as this cell's output
df_clean.columns

Index(['hotel_name', 'hotel_id', 'rating', 'rating_description',
       'review_count', 'star_rating', 'location', 'nearest_landmark',
       'Distance_to_Landmark(in_km)', 'price_per_night', 'tax', 'city',
       'state', 'price_range', 'latitude', 'longitude', 'hotel_type',
       'room_types', 'data_quality_score', 'star_rating_clean',
       'has_coordinates', 'amenity_free_wifi', 'amenity_air_conditioning',
       'amenity_24_7_front_desk', 'amenity_spa', 'amenity_pool', 'amenity_gym',
       'amenity_restaurant', 'amenity_room_service', 'amenity_business_center',
       'amenity_meeting_rooms', 'amenity_bar', 'amenity_swimming_pool',
       'amenity_parking', 'amenity_travel_desk', 'amenity_airport_shuttle',
       'amenity_conference_hall', 'amenity_valet_parking',
       'total_amenities_count'],
      dtype='object')