In [1]:
import pandas as pd
import numpy as np
import seaborn as sn

from matplotlib import pyplot as plt
from io import StringIO
%matplotlib inline

In [2]:
# Print the version of the `python` executable that the shell resolves.
# NOTE(review): this is the shell's interpreter, which is not necessarily the
# one backing the notebook kernel — confirm they match.
!python --version

Python 3.10.19


In [3]:
import os
import ssl
import requests

from tqdm import tqdm
from PIL import Image
from io import BytesIO
from urllib import request

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True)

In [5]:
# Load the detailed and the summary listings tables (in that order) from the
# project's data folder.
df_listings_details, df_listings = map(
    pd.read_csv,
    ('../data/listings_detailed.csv', '../data/listings.csv'),
)

In [6]:
# Columns kept from the merged listings frame, in the order they appear in the
# cleaned table.
selected_columns = [
    # listing identity / location / availability
    'id',
    'room_type',
    'minimum_nights',
    'neighbourhood',
    'availability_eoy',
    'availability_365',
    'picture_url',
    # host_* columns
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_identity_verified',
    # capacity
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # estimated_* columns
    'estimated_occupancy_l365d',
    'estimated_revenue_l365d',
    # review columns
    'number_of_reviews',
    'number_of_reviews_l30d',
    'reviews_per_month',
    'review_scores_rating',
    'review_scores_value',
    # remaining columns
    'instant_bookable',
    'calculated_host_listings_count',
    'price',
]


In [7]:
def normalize_tf_cols(df, column):
    """Convert a 't'/'f' flag column of ``df`` to booleans, in place.

    Args:
        df: DataFrame that owns ``column``; it is modified and returned.
        column: Name of the column to convert.

    Returns:
        The same ``df`` object, with ``column`` cast to ``bool``.

    Note:
        Only 't' and 'f' are recoded (to 1 and 0); any other value reaches
        ``astype(bool)`` unchanged and is converted by its truthiness.
    """
    flag_codes = {'t': 1, 'f': 0}
    recoded = df[column].replace(flag_codes)
    df[column] = recoded.astype(bool)
    return df

In [8]:
def fix_encoding(df_cleaned):
    """Repair mojibake in the ``neighbourhood`` column, in place.

    Each distinct value is re-encoded as Latin-1 and decoded as UTF-8, which
    undoes the common "UTF-8 text read as Latin-1" corruption (for example
    ``'SÃ©'`` becomes ``'Sé'``). A value is left untouched when the round trip
    is not possible: it is not a string (e.g. NaN), it contains characters
    outside Latin-1, or its Latin-1 bytes are not valid UTF-8, which is the
    case for text that was already decoded correctly.

    Args:
        df_cleaned: DataFrame with a ``neighbourhood`` column; it is modified
            and returned.

    Returns:
        The same DataFrame with the repaired ``neighbourhood`` column.
    """
    def _repair(value):
        try:
            # Strict decoding on purpose: with errors="ignore" a correctly
            # encoded name such as 'Sé' would silently lose its accented
            # character instead of being kept as-is.
            return value.encode("latin1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError, AttributeError):
            return value

    encoding_map = {val: _repair(val) for val in df_cleaned['neighbourhood'].unique()}
    df_cleaned['neighbourhood'] = df_cleaned['neighbourhood'].map(encoding_map)
    return df_cleaned

In [9]:
def data_cleanup(df_1, df_2):
    """Merge two listings tables and return the cleaned modelling frame.

    Steps, in order:
      1. Concatenate ``df_1`` and ``df_2`` column-wise and drop duplicated
         column names, keeping the first occurrence (so ``df_1`` wins).
      2. Keep only ``selected_columns`` and drop rows with any missing value.
      3. Keep rows whose ``availability_eoy``, ``availability_365`` and
         ``estimated_occupancy_l365d`` are all greater than zero.
      4. Strip the ``%`` sign from the host rate columns and cast to float.
      5. Convert the 't'/'f' flag columns to booleans.
      6. Repair the text encoding of ``neighbourhood``.
      7. Normalise column names ('/' and ' ' become '_', lower case).

    Args:
        df_1: First listings table; its columns take precedence on clashes.
        df_2: Second listings table.

    Returns:
        A new cleaned DataFrame. The original row index is preserved, so
        callers that need a contiguous index must reset it.
    """
    # Use the arguments, not the notebook-level frames, so the function works
    # on whatever the caller passes in.
    # NOTE(review): concat(axis=1) aligns rows on the index, not on `id`; this
    # assumes both tables list the same rows in the same order — confirm.
    merged = pd.concat([df_1, df_2], axis=1)
    merged = merged.loc[:, ~merged.columns.duplicated()]

    cleaned = merged[selected_columns].dropna()
    is_active = (
        (cleaned['availability_eoy'] > 0)
        & (cleaned['availability_365'] > 0)
        & (cleaned['estimated_occupancy_l365d'] > 0)
    )
    # Explicit copy: the columns below are reassigned, which must not happen
    # on a view of the filtered frame.
    cleaned = cleaned[is_active].copy()

    for rate_column in ('host_response_rate', 'host_acceptance_rate'):
        cleaned[rate_column] = (
            cleaned[rate_column].str.replace('%', '', regex=False).astype(float)
        )

    for flag_column in ('instant_bookable', 'host_identity_verified', 'host_is_superhost'):
        cleaned = normalize_tf_cols(cleaned, flag_column)

    cleaned = fix_encoding(cleaned)

    cleaned.columns = (
        cleaned.columns.str.replace('/', '_').str.lower().str.replace(' ', '_')
    )
    return cleaned

In [10]:
# Build the cleaned listings table and give it a fresh 0..n-1 index.
df_cleaned = data_cleanup(df_listings, df_listings_details).reset_index(drop=True)

In [11]:
def identify_premium_properties(df, threshold=0.5):
    """Flag listings that are both pricey and well rated within their neighbourhood.

    For every ``neighbourhood`` the ``threshold`` quantile of ``price`` and of
    ``review_scores_value`` is computed. A listing is premium when both of its
    values are greater than or equal to its neighbourhood's quantiles.

    Args:
        df: Listings frame with ``neighbourhood``, ``price`` and
            ``review_scores_value`` columns. It is not modified.
        threshold: Quantile in ``[0, 1]`` used for both cut-offs.

    Returns:
        A new DataFrame with the rows of ``df`` plus ``price_q_threshold``,
        ``rating_q_threshold`` and the integer flag ``is_premium`` (1 or 0).
    """
    quantile_columns = {
        'price': 'price_q_threshold',
        'review_scores_value': 'rating_q_threshold',
    }
    # Work on the `df` argument rather than a notebook-level frame so the
    # function labels whatever table the caller passes in.
    neighborhood_premium_stats = (
        df.groupby('neighbourhood')[list(quantile_columns)]
        .quantile(threshold)
        .rename(columns=quantile_columns)
        .reset_index()
    )

    # Left merge: every input row is kept, in its original order.
    df_premium = df.merge(neighborhood_premium_stats, on='neighbourhood', how='left')

    meets_price = df_premium['price'] >= df_premium['price_q_threshold']
    meets_rating = df_premium['review_scores_value'] >= df_premium['rating_q_threshold']
    df_premium['is_premium'] = (meets_price & meets_rating).astype(int)
    return df_premium

In [12]:
# Label each cleaned listing using the per-neighbourhood median (0.5 quantile).
df_premium = identify_premium_properties(df_cleaned, threshold=0.5)

In [13]:
# Peek at the first two listing ids and their picture URLs.
df_premium.loc[:, ['id', 'picture_url']].head(2)

Unnamed: 0,id,picture_url
0,51287,https://a0.muscache.com/pictures/25163038/1c4e...
1,169672,https://a0.muscache.com/pictures/c1a1e093-66da...


In [14]:
def download_image_from_url(url, timeout=30):
    """Download ``url`` and return it as a fully decoded PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait on the connection before giving up, so one
            unresponsive host cannot stall a whole download loop.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` when the download or the
        decoding fails (a message with the URL and the error is printed).
    """
    # NOTE(security): TLS certificate verification is disabled here, so the
    # peer is not authenticated. Kept as in the original notebook; switch to
    # ssl.create_default_context() once the local certificate store works.
    context = ssl._create_unverified_context()
    try:
        with request.urlopen(url, context=context, timeout=timeout) as resp:
            buffer = resp.read()
        img = Image.open(BytesIO(buffer))
        # Image.open is lazy: decode now so a truncated or corrupt payload is
        # reported here instead of failing later during convert()/resize().
        img.load()
        return img
    except Exception as exc:
        # `Exception` rather than a bare `except`, so Ctrl-C / kernel interrupt
        # still stops a long download loop.
        print(f"Image Not found exception: {url} ({exc})")
        return None

In [15]:
def prepare_image(img, target_size):
    """Return ``img`` as an RGB image resized to ``target_size``.

    Args:
        img: Image object exposing ``mode``, ``convert`` and ``resize``.
        target_size: Size passed straight to ``img.resize``.

    Returns:
        The resized image; resampling uses ``Image.NEAREST``.
    """
    rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
    return rgb_img.resize(target_size, Image.NEAREST)

In [None]:
# Download a reproducible sample of 100 listing pictures into
# <images_dir>/premium and <images_dir>/non_premium.

# Same value as the dataset-loading cell further down; set here as well so
# this cell also runs on a fresh kernel executed top to bottom.
images_dir = '../images'

sample_df = df_premium.sample(100, random_state=42)

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    url = row['picture_url']
    listing_id = row['id']

    if pd.isna(url):
        continue

    class_folder = "premium" if row['is_premium'] == 1 else "non_premium"
    folder = images_dir + "/" + class_folder
    # Image.save does not create missing directories.
    os.makedirs(folder, exist_ok=True)

    save_path = f"{folder}/{listing_id}.jpg"
    print(save_path)
    img = download_image_from_url(url)

    if img is None:
        # The download helper has already reported the failure.
        continue

    resized_img = prepare_image(img, (300, 400))
    resized_img.save(save_path)

In [None]:
!pip install tensorflow-macos tensorflow-metal

In [16]:
# List the installed packages through the shell's `pip` and keep only the
# lines that mention numpy (shows the installed numpy version).
!pip list | grep numpy

numpy                        1.26.4


In [17]:
import tensorflow as tf
from tensorflow.keras import layers

  if not hasattr(np, "object"):


In [19]:
# Root folder of the downloaded listing pictures. The download cells write the
# `premium` / `non_premium` (and `test/...`) sub-folders beneath it, and the
# dataset-loading cell reads its class sub-folders from it.
images_dir = '../images'

In [20]:
img_size = (224, 224)
batch_size = 10
DATASET_SEED = 42  # fixes the shuffling so reruns see the same order

# Load the labelled pictures from the class sub-folders of `images_dir`.
# `class_names` pins the label order (non_premium -> 0, premium -> 1) and names
# the only folders that are meant to be classes; the notebook later writes a
# `test` folder under the same root, which would otherwise be inferred as a
# third class and break `label_mode="binary"` on a re-run.
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    images_dir,
    class_names=["non_premium", "premium"],
    image_size=img_size,
    batch_size=batch_size,
    label_mode="binary",
    seed=DATASET_SEED,
)

Found 99 files belonging to 2 classes.


2026-01-11 16:11:34.330672: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2026-01-11 16:11:34.330852: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2026-01-11 16:11:34.330872: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2026-01-11 16:11:34.331130: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-01-11 16:11:34.331157: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
# Split 80/20 at file level. `dataset` reshuffles its elements on every
# iteration, so `dataset.take(...)` / `dataset.skip(...)` would hand different
# images to "train" and "validation" in each epoch and leak validation samples
# into training. `validation_split` assigns every file to exactly one subset.
SPLIT_SEED = 42  # must be identical for both subsets so they do not overlap
split_options = dict(
    class_names=["non_premium", "premium"],
    image_size=img_size,
    batch_size=batch_size,
    label_mode="binary",
    validation_split=0.2,
    seed=SPLIT_SEED,
)
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    images_dir, subset="training", **split_options
)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    images_dir, subset="validation", **split_options
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)
# Number of batches in the full, training and validation datasets.
len(dataset), len(train_ds), len(val_ds)

(10, 8, 2)

In [22]:
# Augmentation pipeline applied to the model input: horizontal flip, then a
# random rotation (factor 0.05) and a random zoom (factor 0.1).
augmentation_layers = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.05),
    layers.RandomZoom(0.1),
]
augment = tf.keras.Sequential(augmentation_layers)

In [23]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import models

# Headless EfficientNetB0 backbone with ImageNet weights. The weights are
# fetched over HTTPS when they are not available locally, so this cell needs a
# working TLS certificate setup.
base = EfficientNetB0(
    weights="imagenet",
    include_top=False,
    input_shape=(*img_size, 3),
)

# Freeze the backbone; only the layers added on top of it are trained.
base.trainable = False


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5


Exception: URL fetch failure on https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5: None -- [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)

In [None]:
# Assemble the classifier: augmentation -> EfficientNet preprocessing ->
# frozen backbone -> pooled features -> dense head -> sigmoid output.
inputs = layers.Input(shape=(*img_size, 3))

x = tf.keras.applications.efficientnet.preprocess_input(augment(inputs))
x = base(x, training=False)  # backbone always runs in inference mode
x = layers.GlobalAveragePooling2D()(x)

# Head: one hidden layer followed by dropout.
for head_layer in (layers.Dense(256, activation="relu"), layers.Dropout(0.4)):
    x = head_layer(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inputs=inputs, outputs=outputs)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights, derived from the labels that are actually in
# `dataset` instead of hard-coded counts, so they stay correct when the set of
# downloaded images changes.
all_labels = (
    np.concatenate([batch_labels.numpy() for _, batch_labels in dataset])
    .ravel()
    .astype(int)
)

balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=all_labels,
)

# Keras expects a {class_index: weight} mapping.
class_weights = {0: float(balanced_weights[0]), 1: float(balanced_weights[1])}
class_weights

In [None]:
# Binary classifier set-up: Adam optimiser, binary cross-entropy loss, accuracy
# reported as the metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for 8 epochs with the class weights, validating on `val_ds`.
history = model.fit(
    train_ds,
    epochs=8,
    class_weight=class_weights,
    validation_data=val_ds,
)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Probability cut-off used to turn the sigmoid output into a class label.
DECISION_THRESHOLD = 0.46

# Collect labels and predictions in ONE pass over `val_ds`. The dataset is
# created with shuffling left at its default, so two separate passes (one for
# the labels, one inside `model.predict(val_ds)`) may yield the batches in a
# different order and pair predictions with the wrong labels.
true_batches, prob_batches = [], []
for batch_images, batch_labels in val_ds:
    true_batches.append(batch_labels.numpy().ravel())
    prob_batches.append(model.predict(batch_images, verbose=0).ravel())

y_true = np.concatenate(true_batches, axis=0).astype(int)
y_pred = np.concatenate(prob_batches, axis=0)
y_class = (y_pred >= DECISION_THRESHOLD).astype(int)
print(confusion_matrix(y_true, y_class))
print(classification_report(y_true, y_class))

In [None]:
# Persist the trained model in the native Keras format.
# NOTE(review): `output_dir` is not defined in any visible cell of this
# notebook; it must be set beforehand and, because the path is built by plain
# string concatenation, it has to end with a path separator — confirm.
model.save(output_dir + "cnn_premium_detector.keras")

In [None]:
# Download a held-out sample of listing pictures into
# <images_dir>/test/premium and <images_dir>/test/non_premium.

# The training cell drew `df_premium.sample(100, random_state=42)`; drop those
# rows first so no listing ends up in both the training and the test images.
training_index = df_premium.sample(100, random_state=42).index
sample_df = df_premium.drop(training_index).sample(100, random_state=30)

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    url = row['picture_url']
    listing_id = row['id']

    if pd.isna(url):
        continue

    class_folder = "premium" if row['is_premium'] == 1 else "non_premium"
    folder = images_dir + "/test/" + class_folder
    # Image.save does not create missing directories.
    os.makedirs(folder, exist_ok=True)

    save_path = f"{folder}/{listing_id}.jpg"
    print(save_path)
    img = download_image_from_url(url)

    if img is None:
        # The download helper has already reported the failure.
        continue

    resized_img = prepare_image(img, (300, 400))
    resized_img.save(save_path)

In [None]:
def predict_image(path, threshold=0.6):
    """Classify one image file as premium or non-premium with ``model``.

    Args:
        path: Location of the image file.
        threshold: Minimum predicted probability for the PREMIUM label.

    Returns:
        ``True`` when the predicted probability is at least ``threshold``,
        otherwise ``False``. The path, probability and label are printed.
    """
    # Close the file handle as soon as the pixels have been converted.
    with Image.open(path) as source:
        img = source.convert("RGB")
    # PIL expects (width, height) while `img_size` follows the Keras
    # (height, width) convention.
    img = img.resize((img_size[1], img_size[0]))

    # Feed raw 0-255 pixel values, exactly like the training pipeline does:
    # the training dataset is not rescaled and EfficientNet normalises its
    # input internally, so dividing by 255 here would present the model with
    # inputs it has never seen.
    arr = np.asarray(img, dtype=np.float32)
    arr = np.expand_dims(arr, axis=0)

    prob = model.predict(arr)[0][0]

    print("Path:", path)
    print("Premium probability:", prob)

    # Return plain Python booleans; callers test the result with `is True`.
    if prob >= threshold:
        print("Prediction → PREMIUM")
        return True
    else:
        print("Prediction → NON-PREMIUM")
        return False

In [None]:
# Run the classifier over every file in the test folder and tally the labels.
# NOTE(review): absolute Colab/Drive path, unlike the relative `images_dir`
# used by the download cells — confirm which location holds the test images.
test_dir = '/content/drive/MyDrive/colab_nbs/airbnb/test'
test_folder = test_dir + "/images"
non_premium_count = 0
premium_count = 0
# Sorted so the printed predictions come out in a stable order.
for filename in sorted(os.listdir(test_folder)):
    full_path = os.path.join(test_folder, filename)
    # Skip hidden files (e.g. .DS_Store) and sub-directories, which cannot be
    # opened as images and would abort the whole loop.
    if filename.startswith('.') or not os.path.isfile(full_path):
        continue
    if predict_image(full_path):
        premium_count += 1
    else:
        non_premium_count += 1
print("Total Premium", premium_count )
print("Total Non-Premium", non_premium_count )