# ===   EDA. Predict rating hotels  ===

## Import Libraries and Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.feature_selection import chi2, f_classif
import category_encoders as ce
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon required by the sentiment analyzer
nltk.download('vader_lexicon')

# Notebook display configuration: show all columns, use seaborn's whitegrid style
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

# Print the full path of every file found under /kaggle/input
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Load and Combine Train and Test Data

In [None]:
# Directory that holds the competition CSV files
DATA_DIR = '/kaggle/input/sf-booking/'

# os.path.join is used instead of string concatenation: DATA_DIR already ends
# with '/', so DATA_DIR + '/hotels_train.csv' produced a doubled separator.
df_train = pd.read_csv(os.path.join(DATA_DIR, 'hotels_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'hotels_test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'submission.csv'))

# Add sample flags so the two parts can be told apart after concatenation
df_train['sample'] = 1  # training data
df_test['sample'] = 0   # test data
df_test['reviewer_score'] = 0  # placeholder for target

# Combine train and test for consistent preprocessing (test rows come first)
data = pd.concat([df_test, df_train], axis=0).reset_index(drop=True)
print(f"Combined dataset shape: {data.shape}")

In [None]:
# Display the combined train + test dataframe
data

# Initial Exploration & Cleaning

In [None]:
# Work on a copy so the combined `data` frame stays untouched during feature engineering
n_hotels = data.copy()

In [None]:
#data['hotel_name'].nunique()
# Check for duplicate rows and filter them out.
# Only training rows (sample == 1) are de-duplicated; test rows are all kept.
# NOTE(review): presumably the submission needs one prediction per test row —
# confirm against the submission file.

L0 = len(n_hotels)
duplicate_train_rows = n_hotels.duplicated() & (n_hotels['sample'] == 1)
n_hotels = n_hotels[~duplicate_train_rows].copy()
L1 = len(n_hotels)

print('Обнаружено {} дубликатов'.format(L0-L1))


In [None]:
# Report how many distinct hotel names the dataset contains
hotel_count = n_hotels['hotel_name'].nunique()
print("Number of unique hotels:", hotel_count)

# Parse the review date strings into datetime values
n_hotels['review_date'] = pd.to_datetime(n_hotels['review_date'])



In [None]:
# Draw a heat map of missing values (yellow = missing, blue = present)
colors = ['blue', 'yellow']
cbar_kws = {'ticks': [0, 1]}
# Transposed null mask: one heat-map row per dataframe column
missing_mask = n_hotels.T.isnull()
fig = plt.figure(figsize=(10, 4))
ax = sns.heatmap(
    missing_mask,
    cmap=sns.color_palette(colors),
    xticklabels=False,
    cbar_kws=cbar_kws,
)
ax.set_title('Пропущенные данные в базе');

**Observation:** Latitude and longitude (`lat`, `lng`) are missing for some rows in our dataset.

## Filling in the gaps in geographic coordinates. (Feature generation)

In [None]:
# List the available column names
n_hotels.columns

In [None]:
# Show name and address of the rows whose latitude is missing
n_hotels[n_hotels['lat'].isnull()][['hotel_name', 'hotel_address']]


In [None]:
# Preview the first five rows of the location-related columns
n_hotels[['hotel_name', 'hotel_address', 'lat', 'lng']].head(5)

In [None]:
# First, import the library and module needed to look up geographic coordinates
# #!pip install geopy

#from geopy.geocoders import Nominatim
#geolocator = Nominatim(user_agent="AzureMaps")
#location = geolocator.geocode( n_hotels.iloc[0]['hotel_address'])

#  Check
#print(location.address)
#print((location.latitude, location.longitude))

In [None]:
# =====================================================
#  Step: Filling Missing Geographic Coordinates
# =====================================================

# Install geopy if not available
# !pip install geopy

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Initialize geocoder (use a descriptive user agent to avoid blocking)
geolocator = Nominatim(user_agent="HotelGeoFinder")

# Add a rate limiter to avoid "Too Many Requests" errors
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Addresses for which at least one of the two coordinates is missing
missing_coords = n_hotels[
    n_hotels['lat'].isnull() | n_hotels['lng'].isnull()
]['hotel_address'].unique()
print(f"Found {len(missing_coords)} addresses with missing coordinates.")

# address -> (lat, lng); filled lazily so every address is geocoded only once
# instead of once for the latitude and once more for the longitude.
_geocode_cache = {}


def _get_lat_lng(address):
    """Return (latitude, longitude) for an address, or (nan, nan) on failure.

    Only the last three whitespace-separated tokens of the address are sent
    to the geocoder. Results, including failures, are cached per address.
    """
    if address not in _geocode_cache:
        try:
            shortaddress = " ".join(address.split()[-3:])
            location = geocode(shortaddress)
            coords = (location.latitude, location.longitude) if location else (np.nan, np.nan)
        except Exception as e:
            # Best effort: a failed lookup leaves the coordinates missing
            print(f"Error getting coordinates for {address}: {e}")
            coords = (np.nan, np.nan)
        _geocode_cache[address] = coords
    return _geocode_cache[address]


def get_lat(address):
    """Return the geocoded latitude of an address, or NaN if it cannot be resolved."""
    return _get_lat_lng(address)[0]


def get_lng(address):
    """Return the geocoded longitude of an address, or NaN if it cannot be resolved."""
    return _get_lat_lng(address)[1]


# Build dictionaries to map addresses → coordinates
lat_dict = {addr: get_lat(addr) for addr in missing_coords}
lng_dict = {addr: get_lng(addr) for addr in missing_coords}

# Fill missing values in main dataset (existing coordinates are left untouched)
n_hotels['lat'] = n_hotels['lat'].fillna(n_hotels['hotel_address'].map(lat_dict))
n_hotels['lng'] = n_hotels['lng'].fillna(n_hotels['hotel_address'].map(lng_dict))



In [None]:

# Re-check which rows still have no latitude after the geocoding fill
n_hotels[n_hotels['lat'].isnull()][['hotel_name', 'hotel_address']]

In [None]:
def extract_country(address):
    """Return the trailing country token of a hotel address.

    The text after the last comma is taken (or the whole address when there
    is no comma). If that text contains a space, only its last word is
    returned, so a multi-word country is reduced to its final word.
    Missing or blank addresses give None.
    """
    if pd.isna(address) or not address.strip():
        return None

    tail = address.split(',')[-1].strip()
    if ' ' not in tail:
        return tail
    # Several words remain: keep only the last one
    return tail.split()[-1].strip()

# Derive the hotel country from each address
hotel_countries = n_hotels['hotel_address'].apply(extract_country)
n_hotels['hotel_country'] = hotel_countries

# Sanity check: the 20 most frequent extracted values
print(hotel_countries.value_counts().head(20))


In [None]:
# Normalize the extracted country names. extract_country keeps only the last
# word of the address, so United Kingdom addresses arrive here as 'Kingdom'.
# The remaining entries map a value to itself, so replace() leaves them unchanged.
country_fix = {
    'Kingdom': 'United Kingdom',
    'Spain': 'Spain',
    'France': 'France',
    'Netherlands': 'Netherlands',
    'Austria': 'Austria',
    'Italy': 'Italy'
}

n_hotels['hotel_country'] = n_hotels['hotel_country'].replace(country_fix)


In [None]:
# Compare hotel country and reviewer nationality after trimming whitespace
# and lower-casing both sides; 1 means they match.
hotel_country_norm = n_hotels['hotel_country'].str.strip().str.lower()
reviewer_country_norm = n_hotels['reviewer_nationality'].str.strip().str.lower()
n_hotels['domestic_traveler_flag'] = (hotel_country_norm == reviewer_country_norm).astype(int)

print(n_hotels['domestic_traveler_flag'].value_counts())


In [None]:
# Region name -> lower-case country names belonging to that region
continent_map = {
    'europe': ['united kingdom', 'france', 'germany', 'spain', 'italy', 'netherlands', 'belgium', 'sweden', 'norway', 'switzerland', 'portugal', 'poland', 'austria', 'greece', 'ireland', 'finland', 'czech republic', 'denmark', 'hungary', 'slovakia', 'romania', 'bulgaria', 'croatia', 'slovenia', 'iceland'],
    'asia': ['china', 'india', 'japan', 'malaysia', 'singapore', 'thailand', 'south korea', 'indonesia', 'philippines', 'vietnam', 'pakistan', 'bangladesh'],
    'americas': ['united states of america', 'canada', 'mexico', 'brazil', 'argentina', 'chile', 'colombia', 'peru'],
    'middle_east': ['united arab emirates', 'saudi arabia', 'qatar', 'kuwait', 'bahrain', 'oman', 'lebanon'],
    'africa': ['south africa', 'morocco', 'egypt', 'kenya', 'nigeria', 'ghana'],
    'oceania': ['australia', 'new zealand']
}


def get_continent(country):
    """Map a country name to its region key in ``continent_map``.

    The name is trimmed and lower-cased before the lookup. Missing or blank
    input gives 'unknown'; a name not listed in any region gives 'other'.
    """
    if pd.isna(country) or not country.strip():
        return 'unknown'
    key = country.strip().lower()
    # First region (in dictionary order) whose list contains the country
    return next(
        (region for region, members in continent_map.items() if key in members),
        'other',
    )

# Region of the hotel and of the reviewer
for source_col, target_col in (
    ('hotel_country', 'hotel_continent'),
    ('reviewer_nationality', 'reviewer_continent'),
):
    n_hotels[target_col] = n_hotels[source_col].apply(get_continent)

# 1 when the reviewer's region differs from the hotel's region
regions_differ = n_hotels['hotel_continent'] != n_hotels['reviewer_continent']
n_hotels['cross_region_flag'] = regions_differ.astype(int)

print(n_hotels[['hotel_country', 'reviewer_nationality', 'domestic_traveler_flag']].sample(10))
print(n_hotels['cross_region_flag'].value_counts(normalize=True))

In [None]:
# The hotel name and address columns are no longer needed; remove them in place
n_hotels.drop(columns=['hotel_name', 'hotel_address'], inplace=True)

# Final check: count of coordinates that are still missing
remaining_missing = n_hotels[['lat', 'lng']].isnull().sum()
print("Remaining missing coordinates:")
print(remaining_missing)

## Nationality into Travel distance

In [None]:
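# Dictionary mapping reviewer nationalities to approximate (lat, lng) coordinates.
# Keys keep a leading and a trailing space because get_coordinates looks names up
# without stripping them; the single-space key maps a blank nationality to (None, None).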
reviewer_nationality_coords = {
    ' United Kingdom ': (55.3781, -3.4360),
    ' Belgium ': (50.5039, 4.4699),
    ' Sweden ': (60.1282, 18.6435),
    ' United States of America ': (37.0902, -95.7129),
    ' Ecuador ': (-1.8312, -78.1834),
    ' Netherlands ': (52.1326, 5.2913),
    ' Ireland ': (53.1424, -7.6921),
    ' Canada ': (56.1304, -106.3468),
    ' Norway ': (60.4720, 8.4689),
    ' Bulgaria ': (42.7339, 25.4858),
    ' Italy ': (41.8719, 12.5674),
    ' Australia ': (-25.2744, 133.7751),
    ' Seychelles ': (-4.6796, 55.4920),
    ' Kuwait ': (29.3759, 47.9774),
    ' Saudi Arabia ': (23.8859, 45.0792),
    ' Czech Republic ': (49.8175, 15.4730),
    ' France ': (46.2276, 2.2137),
    ' Germany ': (51.1657, 10.4515),
    ' South Africa ': (-30.5595, 22.9375),
    ' United Arab Emirates ': (23.4241, 53.8478),
    ' Greece ': (39.0742, 21.8243),
    ' Spain ': (40.4637, -3.7492),
    ' Switzerland ': (46.8182, 8.2275),
    ' Macedonia ': (41.6086, 21.7453),
    ' Poland ': (51.9194, 19.1451),
    ' Bahrain ': (26.0667, 50.5577),
    ' Qatar ': (25.3548, 51.1839),
    ' India ': (20.5937, 78.9629),
    ' Singapore ': (1.3521, 103.8198),
    ' Malaysia ': (4.2105, 101.9758),
    ' Thailand ': (15.8700, 100.9925),
    ' Brazil ': (-14.2350, -51.9253),
    ' Crimea ': (45.0, 34.0),
    ' Turkey ': (38.9637, 35.2433),
    ' Israel ': (31.0461, 34.8516),
    ' ': (None, None),
    ' Lebanon ': (33.8547, 35.8623),
    ' Romania ': (45.9432, 24.9668),
    ' Cyprus ': (35.1264, 33.4299),
    ' Portugal ': (39.3999, -8.2245),
    ' Slovakia ': (48.6690, 19.6990),
    ' Jersey ': (49.2144, -2.1313),
    ' Gibraltar ': (36.1408, -5.3536),
    ' Austria ': (47.5162, 14.5501),
    ' Kenya ': (-0.0236, 37.9062),
    ' Isle of Man ': (54.2361, -4.5481),
    ' Costa Rica ': (9.7489, -83.7534),
    ' Oman ': (21.5126, 55.9233),
    ' Hungary ': (47.1625, 19.5033),
    ' Iceland ': (64.9631, -19.0208),
    ' Estonia ': (58.5953, 25.0136),
    ' Hong Kong ': (22.3964, 114.1095),
    ' China ': (35.8617, 104.1954),
    ' Malta ': (35.9375, 14.3754),
    ' Pakistan ': (30.3753, 69.3451),
    ' Montenegro ': (42.7087, 19.3744),
    ' Slovenia ': (46.1512, 14.9955),
    ' South Korea ': (35.9078, 127.7669),
    ' Ukraine ': (48.3794, 31.1656),
    ' Japan ': (36.2048, 138.2529),
    ' Azerbaijan ': (40.1431, 47.5769),
    ' Russia ': (61.5240, 105.3188),
    ' Brunei ': (4.5353, 114.7277),
    ' Cayman Islands ': (19.3133, -81.2546),
    ' Serbia ': (44.0165, 21.0059),
    ' Argentina ': (-38.4161, -63.6167),
    ' Denmark ': (56.2639, 9.5018),
    ' Egypt ': (26.8206, 30.8025),
    ' Finland ': (61.9241, 25.7482),
    ' Mexico ': (23.6345, -102.5528),
    ' Taiwan ': (23.6978, 120.9605),
    ' Peru ': (-9.1899, -75.0152),
    ' Philippines ': (12.8797, 121.7740),
    ' New Zealand ': (-40.9006, 174.8860),
    ' Luxembourg ': (49.8153, 6.1296),
    ' Morocco ': (31.7917, -7.0926),
    ' Latvia ': (56.8796, 24.6032),
    ' Armenia ': (40.0691, 45.0382),
    ' Indonesia ': (-0.7893, 113.9213),
    ' Mauritius ': (-20.3484, 57.5522),
    ' Croatia ': (45.1, 15.2),
    ' Iraq ': (33.2232, 43.6793),
    ' Namibia ': (-22.9576, 18.4904),
    ' Iran ': (32.4279, 53.6880),
    ' Bangladesh ': (23.6850, 90.3563),
    ' Kosovo ': (42.6026, 20.9026),
    ' Tunisia ': (33.8869, 9.5375),
    ' Kazakhstan ': (48.0196, 66.9237),
    ' Sri Lanka ': (7.8731, 80.7718),
    ' Senegal ': (14.4974, -14.4524),
    ' Guernsey ': (49.4657, -2.5857),
    ' Bosnia and Herzegovina ': (43.9159, 17.6791),
    ' Chile ': (-35.6751, -71.5430),
    ' Jordan ': (30.5852, 36.2384),
    ' Lithuania ': (55.1694, 23.8813),
    ' Trinidad and Tobago ': (10.6918, -61.2225),
    ' Albania ': (41.1533, 20.1683),
    ' Yemen ': (15.5527, 48.5164),
    ' Vietnam ': (14.0583, 108.2772),
    ' Macau ': (22.1987, 113.5439),
    ' Abkhazia Georgia ': (43.0, 41.0),
    ' Puerto Rico ': (18.2208, -66.5901),
    ' Nigeria ': (9.0820, 8.6753),
    ' Georgia ': (42.3154, 43.3569),
    ' Guatemala ': (15.7835, -90.2308),
    ' Syria ': (34.8021, 38.9968),
    ' Cura ao ': (12.1696, -68.9900),
    ' El Salvador ': (13.7942, -88.8965),
    ' Monaco ': (43.7336, 7.4170),
    ' Algeria ': (28.0339, 1.6596),
    ' Belarus ': (53.7098, 27.9534),
    ' Maldives ': (3.2028, 73.2207),
    ' Colombia ': (4.5709, -74.2973),
    ' Mauritania ': (21.0079, -10.9408),
    ' Venezuela ': (6.4238, -66.5897),
    ' Kyrgyzstan ': (41.2044, 74.7661),
    ' Libya ': (26.3351, 17.2283),
    ' Saint Lucia ': (13.9094, -60.9789),
    ' Tanzania ': (-6.3690, 34.8888),
    ' Andorra ': (42.5462, 1.6016),
    ' Fiji ': (-17.7134, 178.0650),
    ' Moldova ': (47.4116, 28.3699),
    ' Panama ': (8.5380, -80.7821),
    ' Grenada ': (12.1165, -61.6790),
    ' Angola ': (-11.2027, 17.8739),
    ' Ghana ': (7.9465, -1.0232),
    ' Sudan ': (12.8628, 30.2176),
    ' Ivory Coast ': (7.539989, -5.5471),
    ' Myanmar ': (21.9162, 95.9560),
    ' Aruba ': (12.5211, -69.9683),
    ' Uruguay ': (-32.5228, -55.7658),
    ' U S Virgin Islands ': (18.3358, -64.8963),
    ' Mongolia ': (46.8625, 103.8467),
    ' Haiti ': (18.9712, -72.2852),
    ' Tajikistan ': (38.8610, 71.2761),
    ' Cambodia ': (12.5657, 104.9910),
    ' Uzbekistan ': (41.3775, 64.5853),
    ' Dominican Republic ': (18.7357, -70.1627),
    ' Bermuda ': (32.3078, -64.7505),
    ' United States Minor Outlying Islands ': (28.5, -177.0),
    ' Uganda ': (1.3733, 32.2903),
    ' Bahamas ': (25.0343, -77.3963),
    ' Guyana ': (4.8604, -58.9302),
    ' Barbados ': (13.1939, -59.5432),
    ' Zimbabwe ': (-19.0154, 29.1549),
    ' Saint Martin ': (18.0708, -63.0501),
    ' Palestinian Territory ': (31.9522, 35.2332),
    # ... continue for the remaining territories as needed
}



In [None]:
#import json

# Save to JSON file
#with open("reviewer_nationality_coords.json", "w") as f:
#    json.dump(reviewer_nationality_coords, f)

# Load dictionary from JSON file
#with open("/kaggle/working/reviewer_nationality_coords.json", "r") as f:
#    reviewer_nationality_coords = json.load(f)


In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="HotelReviewer", timeout=2)  # request timeout is 2 seconds (raise it, e.g. to 10, if lookups time out)


import time

def get_coordinates(country):
    """Look up approximate (lat, lng) for a reviewer nationality.

    The name is looked up as-is (no stripping) in
    ``reviewer_nationality_coords``. Names that are not in the dictionary,
    and any lookup error, give ``(None, None)``. The online geocoding
    fallback is disabled.
    """
    not_found = (None, None)
    try:
        coords = reviewer_nationality_coords.get(country)
        if coords is None:
            return not_found
        return tuple(coords)  # convert list back to tuple
    except Exception as e:
        print(f"Error for country {country}: {e}")
        return not_found

# Resolve each distinct nationality once, then broadcast to all rows
reviewer_nationality_list = n_hotels['reviewer_nationality'].unique()
reviewer_nationality_dict = dict(
    zip(reviewer_nationality_list, map(get_coordinates, reviewer_nationality_list))
)

# Split the (lat, lng) pairs into two columns
coordinate_pairs = n_hotels['reviewer_nationality'].map(reviewer_nationality_dict)
n_hotels['lat_nationality'], n_hotels['lng_nationality'] = zip(*coordinate_pairs)

# Unknown nationalities receive the column median as their coordinate
median_fill = {
    column: n_hotels[column].median()
    for column in ('lat_nationality', 'lng_nationality')
}

n_hotels = n_hotels.fillna(median_fill)
'''
# Replace (0,0) with NaN
n_hotels['lat_nationality'] = n_hotels['lat_nationality'].replace(0, np.nan)
n_hotels['lng_nationality'] = n_hotels['lng_nationality'].replace(0, np.nan)

# Create a flag for missing coordinates
n_hotels['nationality_missing'] = n_hotels['lat_nationality'].isna().astype(int)
'''

In [None]:
# Rows for which at least one nationality coordinate is still NaN
lat_missing = n_hotels['lat_nationality'].isna()
lng_missing = n_hotels['lng_nationality'].isna()
missing_coords = n_hotels[lat_missing | lng_missing]

# How many such rows each nationality contributes
missing_by_country = missing_coords['reviewer_nationality'].value_counts()
print(missing_by_country)


Local visitors often have different expectations → tend to rate differently.

Far travelers might visit more expensive or tourist-oriented hotels → correlation with higher/lower reviewer_score.

It also serves as a proxy for tourism type or international popularity.

In [None]:
from geopy.distance import geodesic
import numpy as np
import pandas as pd

def compute_distance(row):
    """Return the geodesic distance in km between the reviewer's home and the hotel.

    Reads 'lat_nationality', 'lng_nationality', 'lat' and 'lng' from ``row``.
    NaN is returned when any of them is missing, when the home coordinate is
    the (0, 0) placeholder, or when the computation raises (the error is printed).
    """
    needed = ('lat_nationality', 'lng_nationality', 'lat', 'lng')
    try:
        # Generator keeps the checks lazy and in the listed order
        if not all(pd.notnull(row[name]) for name in needed):
            return np.nan

        # Skip placeholder (0,0)
        if row['lat_nationality'] == 0 and row['lng_nationality'] == 0:
            return np.nan

        home = (row['lat_nationality'], row['lng_nationality'])
        hotel = (row['lat'], row['lng'])
        return geodesic(home, hotel).km
    except Exception as e:
        print(f"Error computing distance for row: {e}")
        return np.nan

# Row-wise distance between reviewer home and hotel
distances = n_hotels.apply(compute_distance, axis=1)
n_hotels['travel_distance_km'] = distances

# Rows without a computable distance get the median distance
median_distance = n_hotels['travel_distance_km'].median()
n_hotels['travel_distance_km'] = n_hotels['travel_distance_km'].fillna(median_distance)

print(" Travel distance feature added!")
print(n_hotels[['reviewer_nationality', 'travel_distance_km']].head())


In [None]:
# Filter for reviewer_nationality = 'Russia'
#hotels_russia = n_hotels[n_hotels['reviewer_nationality'] == 'Russia']

# Display address (if exists), latitude, and longitude
#display(hotels_russia[['reviewer_nationality', 'lat_nationality', 'lng_nationality']].head())

# The nationality and its helper coordinates are no longer needed
n_hotels.drop(
    columns=['reviewer_nationality', 'lat_nationality', 'lng_nationality'],
    inplace=True,
)

In [None]:
# -------------------------------
# Step: Create Travel Zone feature
# -------------------------------

def assign_travel_zone(distance_km):
    """Bucket a travel distance (km) into a zone code.

    0 = missing, 1 = below 150 km, 2 = below 1000 km, 3 = 1000 km or more.
    """
    if pd.isna(distance_km):
        return 0   # Unknown / missing
    if distance_km < 150:
        return 1   # Local trip
    # Regional below 1000 km, long-distance otherwise
    return 2 if distance_km < 1000 else 3

# Zone code for every row
travel_zones = n_hotels['travel_distance_km'].apply(assign_travel_zone)
n_hotels['travel_zone'] = travel_zones

# Distribution of zones, ordered by zone code
print(n_hotels['travel_zone'].value_counts().sort_index())

In [None]:
# Absolute and relative frequency of each travel zone
zone_counts = n_hotels['travel_zone'].value_counts().sort_index()
zone_percent = zone_counts.div(len(n_hotels)).mul(100).round(2)

zone_summary = pd.DataFrame(
    {
        'Zone': zone_counts.index,
        'Count': zone_counts.values,
        'Percent': zone_percent.values,
    }
)

print(zone_summary)

In [None]:
# Print column dtypes and non-null counts
n_hotels.info()

In [None]:
# Checkpoint: keep a copy of the dataframe as it is at this point
nn_hotels = n_hotels.copy()

## Tag Processing

Approach:

- Avoids repeated column insertions → no PerformanceWarning.

- Handles missing values gracefully.

- Easy to maintain and scalable for large numbers of tags.

- Compatible directly with ML models.

In [None]:
# Restore the working dataframe from the nn_hotels checkpoint copy
n_hotels = nn_hotels.copy()

In [None]:
# Preview the raw tag strings
n_hotels['tags'].head()

In [None]:
# ============================================
# TAGS CLEANING AND FEATURE EXTRACTION
# ============================================

from collections import Counter
import category_encoders as ce


# -------------------------------
# Step 1: Extract number of nights stayed
# -------------------------------
import re

# Extract the number of nights from tags, e.g., "Stayed 3 nights"
regex_nights = r'Stayed (\d+) night[s]?'
n_hotels['nights'] = n_hotels['tags'].str.extract(regex_nights)
# Rows without a "Stayed N nights" tag get 0 nights
n_hotels['nights'] = n_hotels['nights'].fillna(0).astype(int)

# Remove nights-related tags from the 'tags' column
n_hotels['tags'] = n_hotels['tags'].str.replace(regex_nights, '', regex=True)

# -------------------------------
# Step 2: Pet ownership
# -------------------------------
n_hotels['pet'] = n_hotels['tags'].str.contains('With a pet', regex=False).astype(int)
n_hotels['tags'] = n_hotels['tags'].str.replace('With a pet', '', regex=False)

# -------------------------------
# Step 3: Booking from mobile device
# -------------------------------
n_hotels['from_mobile_device'] = n_hotels['tags'].str.contains('Submitted from a mobile device', regex=False).astype(int)
n_hotels['tags'] = n_hotels['tags'].str.replace('Submitted from a mobile device', '', regex=False)

# -------------------------------
# Step 4: Trip type
# -------------------------------
# Extract trip type tags, e.g., "Leisure trip", "Business trip".
# The pattern matches a single word followed by " trip". The previous
# alternative ".*? trip" matched from the very start of the string, so the
# captured value (and the text removed below) also contained the leading
# "['" and anything else that preceded the trip tag.
regex_trip = r"(\w+ trip)"
n_hotels['trip'] = n_hotels['tags'].str.extract(regex_trip)
# Encode trip type as ordinal
trip_encoder = ce.OrdinalEncoder()
n_hotels['trip'] = trip_encoder.fit_transform(n_hotels[['trip']])
# Remove trip tags from 'tags' column
n_hotels['tags'] = n_hotels['tags'].str.replace(regex_trip, '', regex=True)




In [None]:
# -------------------------------
# Step 5: Simplified Traveler type
# -------------------------------

def categorize_traveler(tag_str):
    """
    Extract traveler type from tag string and simplify to:
    1 = solo, 2 = couple, 3 = group/with others, 0 = unknown

    Every quote-delimited piece of the string is checked, not only the first
    one: once earlier steps have blanked the leading tags, the traveler tag
    is no longer first. Pieces are matched against the known traveler labels
    exactly (case-insensitive, surrounding whitespace ignored), so a room
    name that merely contains "with" is not taken for a group.
    """
    if pd.isna(tag_str) or tag_str.strip() == '':
        return 0

    traveler_codes = {
        'solo traveler': 1,
        'couple': 2,
        'group': 3,
        'family with young children': 3,
        'family with older children': 3,
        'travelers with friends': 3,
    }

    # Splitting on the quote character yields the tags and the separators
    # between them; separators never match a label, so checking every piece
    # works even if an opening quote has been removed upstream.
    for piece in tag_str.split("'"):
        code = traveler_codes.get(piece.strip().lower())
        if code is not None:
            return code
    return 0

# Encode the traveler type of every review
n_hotels['traveler_type'] = n_hotels['tags'].apply(categorize_traveler)

# Distribution of the encoded traveler types
traveler_type_counts = (
    n_hotels['traveler_type']
    .value_counts()
    .sort_index()
    .reset_index()
)
traveler_type_counts.columns = ['Traveler_Type', 'Count']
print("Simplified traveler types:")
print(traveler_type_counts)

# Strip the original traveler labels from the tag strings (optional cleanup)
traveler_type_original = ['Couple','Solo traveler','Family with young children','Group','Family with older children','Travelers with friends']
tags_without_traveler = n_hotels['tags']
for label in traveler_type_original:
    tags_without_traveler = tags_without_traveler.str.replace(str(label), '', regex=False)
n_hotels['tags'] = tags_without_traveler


In [None]:
# Remove traveler_type tag from 'tags' column
#for key in n_hotels['traveler_type'].unique():
#    n_hotels['tags'] = n_hotels['tags'].str.replace(str(key), '', regex=False)

In [None]:
# -------------------------------
# Step 6: Non-smoking feature
# -------------------------------
n_hotels['NonSmoking'] = n_hotels['tags'].str.contains('Non Smoking', regex=False).astype(int)
n_hotels['tags'] = n_hotels['tags'].str.replace('Non Smoking', '', regex=False)

# -------------------------------
# Step 7: Clean leftover tags string
# -------------------------------
# Remove brackets, quotes and commas. A raw string is used so the regex
# escapes are not treated as (invalid) Python string escape sequences;
# the pattern itself is unchanged.
n_hotels['tags'] = n_hotels['tags'].str.replace(r"[\[\]',]", '', regex=True)
n_hotels['tags'] = n_hotels['tags'].str.strip()

# -------------------------------
# Final backup
# -------------------------------
hotels01 = n_hotels.copy()

# Check processed features
print(n_hotels[['nights', 'pet', 'from_mobile_device', 'trip', 'traveler_type', 'NonSmoking']].head(5))

### 1. Clean and Split Tags Properly

Each tag string might contain multiple supplies separated by commas, semicolons, or runs of two or more spaces.
Let’s normalize that first.

In [None]:
import pandas as pd
from collections import Counter
import re

# Make sure the column is a string column with no missing values.
# fillna must run before astype(str): casting first turns NaN into the
# literal text 'nan', which fillna('') can no longer replace.
n_hotels['tags'] = n_hotels['tags'].fillna('').astype(str)

# Tag splitter that tolerates repeated spaces, commas and semicolons
def split_tags(tag_string):
    """Split a cleaned tag string into a list of non-empty tags.

    The text "None" is removed and surrounding quotes are stripped first;
    the remainder is split on commas, semicolons, or runs of two or more
    whitespace characters. An empty remainder gives an empty list.
    """
    cleaned = tag_string.strip().replace("None", "").strip("'\"")
    if not cleaned:
        return []
    trimmed = (piece.strip() for piece in re.split(r'\s{2,}|,|;', cleaned))
    # Drop pieces that were only whitespace
    return [piece for piece in trimmed if piece]

# Build the per-row list of tags
n_hotels['tag_list'] = n_hotels['tags'].map(split_tags)


In [None]:
#  Count Frequency of Each Tag

from collections import Counter

# Collect the tags of all rows into one flat list
all_tags = []
for row_tags in n_hotels['tag_list']:
    all_tags.extend(row_tags)

# Occurrences of each distinct tag
supplies_counter = Counter(all_tags)

# Report the number of distinct tags and the 20 most frequent ones
print(f"Number of unique supplies: {len(supplies_counter)}")
print(supplies_counter.most_common(20))


In [None]:
#  Tags that occur more than 500 times

popular_supplies = [
    tag for tag, occurrences in supplies_counter.items() if occurrences > 500
]
print(f"Popular supplies (occurrences > 500): {len(popular_supplies)}")
print(popular_supplies)

In [None]:
import matplotlib.pyplot as plt

# Convert the most frequent tags to a DataFrame for plotting.
# top_n drives both the selection and the title, so the two cannot drift
# apart (the title previously said "Top 20" while 40 tags were plotted).
top_n = 40
supplies_df = pd.DataFrame(supplies_counter.most_common(top_n), columns=['Supply', 'Count'])

plt.figure(figsize=(10,12))
plt.barh(supplies_df['Supply'], supplies_df['Count'])
plt.gca().invert_yaxis()  # most frequent tag at the top
plt.title(f"Top {top_n} Most Common Hotel Tags")
plt.xlabel("Count")
plt.show()


In [None]:
# ============================================
# ROOM FEATURES EXTRACTION
# ============================================

from collections import Counter


# -------------------------------
# Step 1: Split 'tags' into 'room' and 'supplies'
# -------------------------------
def extract_room_supplies(tag_str):
    """Split a tag string at the first 'with' into (room, supplies).

    Returns ``(room, None)`` when the string contains no 'with'. Otherwise
    the supplies part is everything after the first 'with', with 'view'
    capitalised to 'View'. Text after a second 'with' is kept rather than
    discarded.
    """
    room, separator, extras = tag_str.partition('with')
    if not separator:
        return room.strip(), None
    return room.strip(), extras.replace('view', 'View').strip()

# One (room, supplies) pair per row, unpacked into two columns
room_supply_pairs = n_hotels['tags'].apply(extract_room_supplies)
n_hotels['room'], n_hotels['supplies'] = zip(*room_supply_pairs)

# -------------------------------
# Step 2: Room type (Single, Double/Twin, Triple, Studio, Suite)
# -------------------------------
def categorize_room_type(room_str):
    """Return a numeric room-type code based on keywords found in ``room_str``.

    Rules are checked in order and the first match wins:
    'Single' -> 1; 'Double or Twin' / 'Double' / 'Twin' / '2 rooms' -> 2;
    '3 rooms' / 'Triple Room' -> 3; 'Studio' -> 1; 'Suite' -> 1; otherwise 0.
    """
    rules = (
        (('Single',), 1),
        (('Double or Twin', 'Double', 'Twin', '2 rooms'), 2),
        (('3 rooms', 'Triple Room'), 3),
        (('Studio',), 1),
        (('Suite',), 1),
    )
    for keywords, code in rules:
        if any(keyword in room_str for keyword in keywords):
            return code
    return 0

# Encode the room type and show how often each code occurs
room_type_codes = n_hotels['room'].apply(categorize_room_type)
n_hotels['room_type'] = room_type_codes
print(Counter(n_hotels['room_type']))

# -------------------------------
# Step 3: Room description (Standard, Family, Queen, King, Luxury, Deluxe, Executive, Superior)
# -------------------------------
def categorize_room_description(room_str):
    """Return a numeric room-description code based on keywords in ``room_str``.

    Rules are checked in order and the first match wins:
    'Standard' / 'Ordinary' / 'Classic' -> 1; 'Family' -> 2;
    'Queen', 'King', 'Luxury', 'Deluxe', 'Executive', 'Superior' -> 3;
    otherwise 0.
    """
    rules = (
        (('Standard', 'Ordinary', 'Classic'), 1),
        (('Family',), 2),
        (('Queen',), 3),
        (('King',), 3),
        (('Luxury', 'Deluxe', 'Executive', 'Superior'), 3),
    )
    for keywords, code in rules:
        if any(keyword in room_str for keyword in keywords):
            return code
    return 0

# Encode the room description and show how often each code occurs
room_description_codes = n_hotels['room'].apply(categorize_room_description)
n_hotels['room_description'] = room_description_codes
print(Counter(n_hotels['room_description']))

# -------------------------------
# Step 4: Room view (binary)
# -------------------------------
# Rows without a supplies string count as "no view"
has_view = n_hotels['supplies'].str.contains('View', regex=False)
n_hotels['view'] = has_view.fillna(0).astype(int)
print(Counter(n_hotels['view']))

# -------------------------------
# Step 5: Analyze extra supplies (optional)
# -------------------------------
supplies_counter = Counter(n_hotels['supplies'])
popular_supplies = [
    supply for supply, occurrences in supplies_counter.items() if occurrences > 500
]
print(f"Number of unique supplies: {len(supplies_counter)}")
print(f"Popular supplies (occurrences >500): {len(popular_supplies)}")
print(popular_supplies)

# -------------------------------
# Step 6: Drop unneeded columns
# -------------------------------
#n_hotels.drop(['tags', 'supplies', 'room'], axis=1, inplace=True)
n_hotels.info()


In [None]:
import numpy as np
import pandas as pd



# ==========================================
#  Long stay flag
# ==========================================
# 1 when the stay lasted more than five nights
stayed_long = n_hotels['nights'] > 5
n_hotels['long_stay_flag'] = stayed_long.astype(int)

# ==========================================
#  Reviewer experience (log transform)
# ==========================================
reviews_given = n_hotels['total_number_of_reviews_reviewer_has_given']
n_hotels['reviewer_experience'] = np.log1p(reviews_given)

# ==========================================
#  Family flag (from tags)
# ==========================================
# NOTE(review): the traveler labels ('Family with young children', ...) were
# stripped from 'tags' earlier in the notebook, so this presumably only
# catches room names mentioning family/children — confirm this is intended.
mentions_family = n_hotels['tags'].str.contains('family|children', case=False, na=False)
n_hotels['family_flag'] = mentions_family.astype(int)

# ==========================================
#  Room quality flag (from tags)
# ==========================================
mentions_quality = n_hotels['tags'].str.contains(
    'deluxe|suite|superior|executive', case=False, na=False
)
n_hotels['room_quality_flag'] = mentions_quality.astype(int)

# ==========================================
#  Quick sanity check
# ==========================================
engineered_columns = [
    'domestic_traveler_flag',
    'cross_region_flag',
    'long_stay_flag',
    'reviewer_experience',
    'family_flag',
    'room_quality_flag',
]
feature_summary = n_hotels[engineered_columns].head()

print(feature_summary)


## Encoding of Reviews

In [None]:
# ============================================
# SENTIMENT ENCODING OF REVIEWS
# ============================================

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download VADER lexicon if not already present
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer (shared by get_compound_score)
sent_analyzer = SentimentIntensityAnalyzer()

# Helper returning VADER's overall ("compound") sentiment value
def get_compound_score(text):
    """Return the 'compound' entry of the VADER polarity scores for ``text``.

    NOTE(review): placeholder review texts, if the data contains any, are
    scored like ordinary text — confirm whether they need special handling.
    """
    scores = sent_analyzer.polarity_scores(text)
    return scores['compound']

# Score the positive and the negative review text of every row
for text_column, score_column in (
    ('positive_review', 'positive_score'),
    ('negative_review', 'negative_score'),
):
    n_hotels[score_column] = n_hotels[text_column].apply(get_compound_score)

# The raw text columns are now encoded and can be removed
n_hotels.drop(columns=['positive_review', 'negative_review'], inplace=True)

#=========
# Difference between the positive and negative word counts
positive_words = n_hotels['review_total_positive_word_counts']
negative_words = n_hotels['review_total_negative_word_counts']
n_hotels['pos_vs_neg_words'] = positive_words - negative_words

# Optional: explore the distribution
print(n_hotels['pos_vs_neg_words'].describe())

# Optional: visualize
import matplotlib.pyplot as plt
plt.figure(figsize=(8,4))
n_hotels['pos_vs_neg_words'].hist(bins=50)
plt.title('Distribution of Positive vs Negative Word Difference')
plt.xlabel('Positive - Negative Word Count')
plt.ylabel('Frequency')
plt.show()

#========
# Ratio of positive to negative word counts; +1 on both sides avoids division by zero
n_hotels['pos_to_neg_ratio'] = (positive_words + 1) / (negative_words + 1)

# Check updated dataframe
n_hotels.info()


In [None]:
'''
# ============================================
# TAGS PREPROCESSING
# ============================================

# Convert the string representation of tags to a list
def parse_tags(tag_str):
    """Convert tag string to list of individual tags"""
    if pd.isnull(tag_str):
        return []
    tag_str = tag_str[2:-2]  # remove the leading/trailing brackets
    return tag_str.strip().split("', '")

n_hotels['tags_n'] = n_hotels['tags'].apply(parse_tags)

# Explode the lists to analyze tag frequencies
explode = n_hotels.explode('tags_n')

# Count frequency of each tag
tag_counts = explode['tags_n'].value_counts()

# Select only popular tags (appearing more than 900 times)
popular_tags = tag_counts[tag_counts > 900].index.tolist()

print(f"Total unique tags: {explode['tags_n'].nunique()}")
print(f"Selected top frequent tags: {len(popular_tags)}")

# --------------------------------------------
# Efficient binary encoding of top tags
# --------------------------------------------
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=popular_tags)
tags_encoded = pd.DataFrame(
    mlb.fit_transform(n_hotels['tags_n']),
    columns=mlb.classes_,
    index=n_hotels.index
)

# Merge the encoded tags into the main dataframe
n_hotels = pd.concat([n_hotels, tags_encoded], axis=1)

# Optional: drop the original columns if not needed
n_hotels = n_hotels.drop(['tags', 'tags_n'], axis=1)

# Check the new dataframe
n_hotels.head(3)
'''

In [None]:

# =====================================================
# 9. Transform Days Since Review
# =====================================================
def extract_days(x):
    """Return the leading integer of a ``days_since_review`` value.

    The value is converted with ``str`` and its first whitespace-separated
    token is parsed as an integer, so ``"3 days"``, ``"45 day"`` and
    ``"1234 days"`` give 3, 45 and 1234. Parsing the whole token (instead of
    a fixed-width slice) keeps counts with four or more digits intact.

    Args:
        x: Value whose string form starts with an integer day count.

    Returns:
        int: The parsed day count.

    Raises:
        ValueError: If the string form is empty or does not start with an
            integer token.
    """
    tokens = str(x).split()
    if not tokens:
        raise ValueError(f"cannot extract a day count from {x!r}")
    return int(tokens[0])
# Numeric day count parsed from the raw 'days_since_review' values
n_hotels['days_since_review_n'] = n_hotels['days_since_review'].map(extract_days)

# ==========================================
#  Recency weight
# ==========================================
# log1p(days + 1) equals log(days + 2), so the denominator stays positive at days == 0
log_review_age = np.log1p(n_hotels['days_since_review_n'] + 1)
n_hotels['recency_weight'] = 1 / log_review_age

# The raw column is dropped once its numeric counterpart exists
n_hotels.drop('days_since_review', axis=1, inplace=True)


In [None]:
# Print column names, dtypes and non-null counts of the working frame
n_hotels.info()

In [None]:
'''
# =====================================================
#  Step: Correlation Between Tags and Ratings
# =====================================================
# Goal:
# To find which review tags (like "Business trip", "Solo traveler", etc.)
# are most positively or negatively associated with hotel ratings.

# Select only tag-related columns (they are binary: 0/1)
tag_cols = popular_tags  # list of top tags created earlier
tag_data = n_hotels[tag_cols + ['reviewer_score', 'average_score']]

# Compute correlation matrix
corr_matrix = tag_data.corr()

# Extract correlations of tags with reviewer_score and average_score
corr_with_reviewer = corr_matrix['reviewer_score'].sort_values(ascending=False)
corr_with_average = corr_matrix['average_score'].sort_values(ascending=False)

# Combine results into one DataFrame for easy comparison
tag_corr = pd.DataFrame({
    'Corr_with_ReviewerScore': corr_with_reviewer[tag_cols],
    'Corr_with_AverageScore': corr_with_average[tag_cols]
}).sort_values(by='Corr_with_ReviewerScore', ascending=False)

display(tag_corr.head(10))

# =====================================================
#  Visualization: Tag Correlation with Ratings
# =====================================================

plt.figure(figsize=(10,20))
sns.barplot(
    x='Corr_with_ReviewerScore', 
    y=tag_corr.index, 
    data=tag_corr, 
    palette='viridis'
)
plt.title('Correlation Between Tags and Reviewer Scores')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Tag')
plt.tight_layout()
plt.show()
'''

## Date-Based Features

In [None]:
#n_hotels['year'] = n_hotels['review_date'].dt.year

def get_season(date):
    """Return the season label for a date-like object.

    Only ``date.month`` is read. December-February map to 'winter',
    March-May to 'spring', June-August to 'summer'; any other month value
    falls through to 'autumn'.
    """
    month_to_season = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
    }
    return month_to_season.get(date.month, 'autumn')

# Season label for every review date
n_hotels['season'] = n_hotels['review_date'].apply(get_season)

# ==========================================
#  Cyclical season encoding
#  Each season gets an index 0..3, which is projected onto a circle so that
#  index 3 sits next to index 0 in the sin/cos features.
# ==========================================
season_map = {
    name: idx
    for idx, name in enumerate(('spring', 'summer', 'autumn', 'winter'))
}
n_hotels['season_num'] = n_hotels['season'].str.lower().map(season_map)

season_angle = 2 * np.pi * n_hotels['season_num'] / 4
n_hotels['season_sin'] = np.sin(season_angle)
n_hotels['season_cos'] = np.cos(season_angle)

# The raw date column is removed after the season features are derived
n_hotels.drop('review_date', axis=1, inplace=True)

In [None]:
# One-hot encode the 'season' column.
# dtype=int is explicit: pandas >= 2.0 returns bool dummy columns by default, and a
# numeric-only filter such as select_dtypes(include=['number']) discards bool columns,
# which would remove the season_* features from the frame.
season_dummies = pd.get_dummies(n_hotels['season'], prefix='season', dtype=int)

# Join encoded columns back to the dataset
n_hotels = pd.concat([n_hotels, season_dummies], axis=1)

# The original 'season' column is kept for now (explicit drop left disabled)
#n_hotels.drop('season', axis=1, inplace=True)

# Check result
n_hotels.head()

In [None]:
# Print column names, dtypes and non-null counts after the season columns were added
n_hotels.info()

In [None]:
# Restrict the working frame to numeric columns; every non-numeric column is discarded
n_hotels = n_hotels.select_dtypes(include='number')

# Exploratory data analysis (EDA)
Before feeding our cleaned dataset into a machine learning model, it’s crucial to analyze it and understand the distributions, potential outliers, and correlations. This step helps avoid garbage-in, garbage-out scenarios.

In [None]:
# Per-column NaN counts, reduced to the columns that contain at least one NaN
null_counts = n_hotels.isna().sum()
missing = null_counts[null_counts > 0]
print("Columns with missing values:\n", missing)


## Basic statistics

In [None]:
# Descriptive statistics, transposed so that each column of the frame becomes one row
summary_table = n_hotels.describe().transpose()
print(summary_table)

##  Log-transform skewed numerical features

In [None]:
import numpy as np

# Columns treated as skewed; each one gets a '<name>_log' companion column
skewed_cols = [
    'days_since_review_n',
    'total_number_of_reviews',
    'additional_number_of_scoring',
]

# log1p(x) = log(1 + x), which is defined for x == 0
for source_col in skewed_cols:
    n_hotels[f'{source_col}_log'] = np.log1p(n_hotels[source_col])


In [None]:
# Side-by-side histograms of the raw and the log-transformed review age
review_age_cols = ['days_since_review_n', 'days_since_review_n_log']
n_hotels[review_age_cols].hist(figsize=(12, 4))


## Visualize distributions of numerical features

In [None]:
# Display the column labels currently present in the working frame
n_hotels.columns

In [None]:
#n_hotels.drop('tag_list', axis=1, inplace=True)


In [None]:
# ==========================
# Split the columns into a categorical and a numerical group
# ==========================

# Hand-picked columns treated as categorical / flag features
cat_cols = [
    'sample',
    'pet',
    'from_mobile_device',
    'trip',
    'traveler_type',
    'NonSmoking',
    'room_type',
    'room_description',
    'view',
    'positive_score',
    'negative_score',
    'season_autumn',
    'season_spring',
    'season_summer',
    'season_winter',
]

# Every column that is not listed above is treated as numerical
num_cols = n_hotels.drop(columns=cat_cols).columns.tolist()

num_cols, cat_cols

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#num_cols = n_hotels.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One boxplot per numerical column, laid out on a three-column grid
n_plot_rows = len(num_cols) // 3 + 1
plt.figure(figsize=(16, 20))
for position, column in enumerate(num_cols, start=1):
    plt.subplot(n_plot_rows, 3, position)
    sns.boxplot(x=n_hotels[column])
    plt.title(column)
plt.tight_layout()
plt.show()


## Outliers

**Tree-based models** split on thresholds, so extreme values don’t “pull” the fit the way they do in linear regression.

In [None]:
# Tukey's IQR fence method (k * IQR beyond the quartiles) to identify outliers in numerical features
import pandas as pd

def outliers_iqr(data, feature, k=3.0):
    """Split ``data`` into outlier and non-outlier rows using Tukey's IQR fences.

    A row is an outlier when its ``feature`` value lies below ``Q1 - k * IQR``
    or above ``Q3 + k * IQR``. Every other row, including rows whose value is
    missing, goes to the cleaned frame, so the two returned frames always
    partition ``data`` (a missing value is not evidence of an outlier and must
    not be dropped silently).

    Args:
        data: DataFrame that contains the column ``feature``.
        feature: Name of the column to test.
        k: Fence multiplier applied to the inter-quartile range.

    Returns:
        tuple: ``(outliers, cleaned)`` - two DataFrames holding the flagged
        rows and the remaining rows, in the original row order.
    """
    x = data[feature]
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Comparisons with NaN are False, so NaN rows are never flagged ...
    is_outlier = (x < lower_bound) | (x > upper_bound)
    # ... and the complement keeps them in the cleaned frame.
    return data[is_outlier], data[~is_outlier]

# Work on a copy so that n_hotels itself is left untouched
data_cleaner = n_hotels.copy()

# Filter one numerical column at a time; each pass runs on the rows kept by the previous one
for feature in num_cols:
    dropped_rows, data_cleaner = outliers_iqr(data_cleaner, feature, k=3)
    print(f"{feature}: removed {len(dropped_rows)} outliers")

In [None]:
# Same boxplot grid as for the raw frame, drawn from the outlier-filtered copy
n_plot_rows = len(num_cols) // 3 + 1
plt.figure(figsize=(16, 20))
for position, column in enumerate(num_cols, start=1):
    plt.subplot(n_plot_rows, 3, position)
    sns.boxplot(x=data_cleaner[column])
    plt.title(column)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# One row of axes per numerical column: raw data on the left, filtered data on the right
n_features = len(num_cols)
fig, axes = plt.subplots(n_features, 2, figsize=(12, 4 * n_features))

for row, col in enumerate(num_cols):
    ax_before, ax_after = axes[row, 0], axes[row, 1]

    # Distribution in the unfiltered frame
    ax_before.hist(n_hotels[col], bins=50, color='skyblue', alpha=0.7)
    ax_before.set_title(f'{col} — Before Cleaning')

    # Distribution in the outlier-filtered copy
    ax_after.hist(data_cleaner[col], bins=50, color='lightgreen', alpha=0.7)
    ax_after.set_title(f'{col} — After Cleaning')

plt.tight_layout()
plt.show()

In [None]:
# Work on a copy so that n_hotels itself is left untouched
n_hotels_cleaner = n_hotels.copy()

# Only these features are filtered for outliers
out_cols = [
    'total_number_of_reviews',
    'review_total_negative_word_counts',
    'review_total_positive_word_counts',
    'total_number_of_reviews_reviewer_has_given',
    'nights'
]

# Filter progressively: every pass must start from the already-filtered frame.
# Passing n_hotels here instead would discard the earlier passes and keep only the
# filter of the last feature in out_cols.
# NOTE(review): rows with sample == 0 are filtered as well - confirm this is intended
# before n_hotels_cleaner is used to build a submission.
for feature in out_cols:
    outliers, cleaned = outliers_iqr(n_hotels_cleaner, feature, k=3)
    print(f"{feature}: removed {len(outliers)} outliers")
    n_hotels_cleaner = cleaned  # carry the filtered rows into the next pass

## Scaling

Scaling matters for numeric features that vary widely in magnitude and would otherwise dominate distance-based or linear models.

**Tree-based models** (RandomForest, XGBoost, etc.) are scale-invariant, so scaling is not needed.
However, if you’ll use regression, PCA, or neural networks, scaling is highly recommended.


In [None]:
'''
to_scale = [
    'additional_number_of_scoring',
    'review_total_negative_word_counts',
    'review_total_positive_word_counts',
    'total_number_of_reviews',
    'total_number_of_reviews_reviewer_has_given',
    'nights',
    'days_since_review_n'
]
no_scale = [
    'average_score',
    'reviewer_score',
    'lat', 'lng',
    'lat_nationality', 'lng_nationality',
    'year'
]

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
n_hotels[to_scale] = scaler.fit_transform(n_hotels[to_scale])
'''

In [None]:
'''

import numpy as np

def cap_outliers(df, columns, factor=1.5):
    df = df.copy()
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR
        # cap values
        df[col] = np.where(df[col] < lower, lower,
                           np.where(df[col] > upper, upper, df[col]))
    return df

n_hotels[num_cols] = cap_outliers(n_hotels, num_cols)

'''

## Create interaction features

In [None]:
# Average score multiplied by each of the two volume counts
for count_col, feature_name in (
    ('total_number_of_reviews', 'avg_score_times_total_reviews'),
    ('additional_number_of_scoring', 'avg_score_times_additional_scoring'),
):
    n_hotels[feature_name] = n_hotels['average_score'] * n_hotels[count_col]

# Positive/negative word-count ratio; both counts are shifted by 1 to avoid division by zero
# NOTE(review): same formula as the earlier 'pos_to_neg_ratio' column - confirm both are wanted
smoothed_positive = n_hotels['review_total_positive_word_counts'] + 1
smoothed_negative = n_hotels['review_total_negative_word_counts'] + 1
n_hotels['positive_to_negative_ratio'] = smoothed_positive / smoothed_negative


## Correlation map

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlations are computed on training rows only: test rows (sample == 0) carry a
# placeholder reviewer_score of 0, which would distort every correlation with the target.
# The 'sample' flag is constant within the training rows, so it is left out of the matrix.
train_numeric = n_hotels.loc[n_hotels['sample'] == 1].select_dtypes(include=['number'])
corr = train_numeric.drop(columns='sample').corr()

# Zero-centred diverging colour map so positive and negative correlations are easy to tell apart
plt.figure(figsize=(16,10))
sns.heatmap(
    corr,
    cmap='coolwarm',       # diverging palette
    center=0,              # zero-centered colors
    annot=False,           # can set True for small matrices
    linewidths=0.5,
    cbar_kws={'shrink': .7}
)
plt.title("Correlation Heatmap of Numerical Features", fontsize=16)
plt.show()

In [None]:
# Compute correlations on training rows only: test rows (sample == 0) hold a placeholder
# reviewer_score of 0 and would bias every correlation with the target.
train_rows = n_hotels[n_hotels['sample'] == 1]
corr = train_rows.corr(numeric_only=True)

# Drop 'sample' and the target itself
feature_x = 'reviewer_score'
target_corr = corr[feature_x].drop([feature_x, 'sample'], errors='ignore')

# Sort from the most positive to the most negative correlation
target_corr = target_corr.sort_values(ascending=False)

# Display values
print(target_corr)

# Plot
plt.figure(figsize=(8, 10))
sns.barplot(y=target_corr.index, x=target_corr.values, palette='viridis')
plt.title("Correlation with Reviewer Score (excluding 'sample' and target)")
plt.xlabel("Correlation")
plt.ylabel("Feature")
plt.show()


In [None]:
'''
#Scatter plots for relationships

plt.figure(figsize=(12,6))
sns.scatterplot(x='nights', y='reviewer_score', data=n_hotels, alpha=0.3)
plt.title('Reviewer Score vs Nights Stayed')
plt.show()
'''

# Train/Test Split and Model Training

In [None]:
# Frame used for modelling: a copy of the full feature table.
# The outlier-filtered alternative below is left disabled.
data_chosen = n_hotels.copy()
#data_chosen = n_hotels_cleaner.copy()

In [None]:
# Print column names, dtypes and non-null counts of the modelling frame
data_chosen.info()

In [None]:
# =====================================================
# 10. Train/Test Split, Encoding, and Model Training
# =====================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pandas as pd

# Split the combined frame back by its 'sample' flag (1 = train rows, 0 = test rows);
# the flag itself is removed from both parts
is_train = data_chosen['sample'] == 1
is_test = data_chosen['sample'] == 0
train_data = data_chosen[is_train].drop(columns='sample')
test_data = data_chosen[is_test].drop(columns='sample')

# Feature matrix and target vector for the labelled rows
target_col = 'reviewer_score'
X = train_data.drop(columns=target_col)
y = train_data[target_col]

In [None]:
'''
# ==========================
# Encode categorical columns using One-Hot Encoding
# ==========================
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Apply the same encoding to the test set
X_test_raw = test_data.drop(['reviewer_score'], axis=1)
X_test_encoded = pd.get_dummies(X_test_raw, columns=categorical_cols, drop_first=True)

# Ensure train and test have the same columns (fill missing columns with 0)
X_test_encoded = X_test_encoded.reindex(columns=X_encoded.columns, fill_value=0)
'''

In [None]:
# ==========================
# Hold out 20% of the labelled rows for validation
# ==========================
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = holdout_split

# ==========================
# Optional scaling step, left disabled (not required for Random Forest)
# ==========================
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_valid_scaled = scaler.transform(X_valid)



#  Hyperparameter tuning

In [None]:
# ==========================
# Train Random Forest Model (randomized hyperparameter search)
# ==========================
#rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
#rf.fit(X_train, y_train)

from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# Candidate values per hyperparameter; alternatives that were tried are kept as comments
param_grid = {
    'n_estimators': [100], #, 200, 500
    'max_depth': [30], #None, 10, 20, 30
    'min_samples_split': [2],#, 5, 10
    #'min_samples_leaf': [ 4], #1, 2,
    #'max_features': ['auto']#, 'sqrt'
}

# RandomizedSearchCV warns when n_iter exceeds the number of distinct candidates,
# so the iteration budget is capped at the grid size.
# (ParameterGrid requires every entry of param_grid to be a list of values.)
n_candidates = len(ParameterGrid(param_grid))

rf = RandomForestRegressor(random_state=42)
search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=min(20, n_candidates),
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,  # makes the sampled candidates reproducible across runs
)
search.fit(X_train, y_train)

# Estimator refitted on X_train with the best-scoring parameter combination
best_rf = search.best_estimator_

In [None]:
# Re-bind the best estimator of the finished search (repeats the assignment at the end of the search cell)
best_rf = search.best_estimator_

In [None]:
# ==========================
# Evaluate the tuned model on the hold-out validation split
# ==========================
y_pred = best_rf.predict(X_valid)
mape = metrics.mean_absolute_percentage_error(y_valid, y_pred) * 100
mae = metrics.mean_absolute_error(y_valid, y_pred)
# RMSE through an explicit square root: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on old and new versions alike.
rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred))
r2 = metrics.r2_score(y_valid, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")
print(f"Test MAPE: {mape:.2f}%")
# Recorded results (presumably from an earlier run):
#MAE: 0.86
#RMSE: 1.15
#R²: 0.500
#Test MAPE: 12.39%

In [None]:
# Print the selected value of each hyperparameter of interest, one per line
best_params = best_rf.get_params()
for param_name in (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'max_features',
):
    print(best_params[param_name])

# Visual inspection

In [None]:
import matplotlib.pyplot as plt

# Validation predictions against true scores; the dashed diagonal marks perfect predictions
score_min, score_max = y_valid.min(), y_valid.max()

plt.figure(figsize=(6,6))
plt.scatter(y_valid, y_pred, alpha=0.3)
plt.plot([score_min, score_max], [score_min, score_max], 'r--')  # ideal line
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Predicted vs Actual")
plt.show()

## (Optional) Feature Importance Plot

In [None]:
# =====================================================
# 12. Feature Importance Visualization
# =====================================================
# Importance of every feature in the fitted forest, labelled by column name, largest first
importances = pd.Series(best_rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)

# Horizontal bar chart of the 15 largest importances
top_features = importances.head(15)
plt.figure(figsize=(10,6))
sns.barplot(x=top_features, y=top_features.index)
plt.title('Top 15 Important Features - Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()