In [1]:
import pandas as pd
import json
import numpy as np
import seaborn as sn
import pickle

from matplotlib import pyplot as plt
from io import StringIO
%matplotlib inline

In [2]:
import os
import ssl
import requests

from tqdm import tqdm
from PIL import Image
from io import BytesIO
from urllib import request

In [3]:
# Read the two listing exports that the cleaning step below combines.
df_listings = pd.read_csv('../data/listings.csv')
df_listings_details = pd.read_csv('../data/listings_detailed.csv')

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True)

In [5]:
# Columns kept from the merged listings frame, grouped by theme.
selected_columns = [
    # listing basics
    'id', 'room_type', 'minimum_nights', 'neighbourhood',
    # availability and photo
    'availability_eoy', 'availability_365', 'picture_url',
    # host attributes
    'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_identity_verified',
    # capacity
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    # estimated demand
    'estimated_occupancy_l365d', 'estimated_revenue_l365d',
    # reviews
    'number_of_reviews', 'number_of_reviews_l30d', 'reviews_per_month',
    'review_scores_rating', 'review_scores_value',
    # booking, host portfolio size and price
    'instant_bookable', 'calculated_host_listings_count', 'price',
]


In [6]:
def normalize_tf_cols(df, column):
    """Turn a 't'/'f' flag column of ``df`` into a boolean column.

    't' is replaced by 1 and 'f' by 0 before the cast to ``bool``; any other
    value is left as-is and follows ``astype(bool)`` truthiness. The column
    is overwritten on ``df`` itself and the same frame is returned.
    """
    flag_codes = {'t': 1, 'f': 0}
    as_numbers = df[column].replace(flag_codes)
    df[column] = as_numbers.astype(bool)
    return df

In [7]:
def fix_encoding(df_cleaned, column='neighbourhood'):
    """Repair UTF-8 text that was mis-decoded as Latin-1 in one column.

    Each distinct string is re-encoded as Latin-1 and decoded as UTF-8. The
    repaired text is used only when that round trip succeeds cleanly;
    otherwise the original value is kept unchanged. Decoding is strict on
    purpose: with ``errors="ignore"`` an already-correct name containing a
    non-ASCII character would silently lose that character.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Frame holding the column to repair. The column is overwritten on
        this frame.
    column : str, default 'neighbourhood'
        Name of the text column to repair.

    Returns
    -------
    pandas.DataFrame
        The same ``df_cleaned`` object, for chaining.
    """
    def _repair(value):
        # Non-string entries (e.g. missing values) are passed through.
        if not isinstance(value, str):
            return value
        try:
            return value.encode("latin1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not Latin-1 mojibake of valid UTF-8: keep the text as it is.
            return value

    # Repair each distinct value once instead of once per row.
    repaired = {val: _repair(val) for val in df_cleaned[column].dropna().unique()}
    df_cleaned[column] = df_cleaned[column].map(lambda val: repaired.get(val, val))
    return df_cleaned

In [8]:
def data_cleanup(df_1, df_2):
    """Combine two listing frames and return the cleaned analysis frame.

    Steps, in order:
      1. Concatenate ``df_1`` and ``df_2`` column-wise and drop repeated
         column names (the first occurrence, i.e. ``df_1``'s, is kept).
      2. Keep only ``selected_columns`` (module-level list) and drop rows
         with a missing value in any of them.
      3. Keep rows whose ``availability_eoy``, ``availability_365`` and
         ``estimated_occupancy_l365d`` are all greater than zero.
      4. Strip '%' from the host rate columns and cast them to float.
      5. Convert the 't'/'f' flag columns with ``normalize_tf_cols``.
      6. Repair neighbourhood names with ``fix_encoding``.
      7. Normalise column names: '/' and ' ' become '_', all lower case.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        The two listing frames to combine.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; its index still carries the surviving original
        row labels (it is not reset here).
    """
    # Use the arguments rather than the notebook-level frames, so the
    # function cleans whatever the caller passes in.
    # NOTE(review): axis=1 concat aligns rows by index label, so both frames
    # are assumed to describe the same listings in the same order - confirm.
    merged = pd.concat([df_1, df_2], axis=1)
    merged = merged.loc[:, ~merged.columns.duplicated()]

    cleaned = merged[selected_columns].dropna()
    keep_rows = (
        (cleaned['availability_eoy'] > 0)
        & (cleaned['availability_365'] > 0)
        & (cleaned['estimated_occupancy_l365d'] > 0)
    )
    # Explicit copy: the column assignments below must not target a slice.
    cleaned = cleaned[keep_rows].copy()

    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        cleaned[rate_col] = cleaned[rate_col].str.replace('%', '', regex=False).astype(float)

    for flag_col in ('instant_bookable', 'host_identity_verified', 'host_is_superhost'):
        cleaned = normalize_tf_cols(cleaned, flag_col)

    cleaned = fix_encoding(cleaned)

    cleaned.columns = cleaned.columns.str.replace('/', '_').str.lower().str.replace(' ', '_')
    return cleaned

In [9]:
# Build the cleaned frame from the two loaded listing exports.
df_cleaned = data_cleanup(df_1=df_listings, df_2=df_listings_details)

In [10]:
# Discard the old row labels, renumber from 0, then preview the first five rows.
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned.head()

Unnamed: 0,id,room_type,minimum_nights,neighbourhood,availability_eoy,availability_365,picture_url,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,accommodates,bathrooms,bedrooms,beds,estimated_occupancy_l365d,estimated_revenue_l365d,number_of_reviews,number_of_reviews_l30d,reviews_per_month,review_scores_rating,review_scores_value,instant_bookable,calculated_host_listings_count,price
0,51287,Entire home/apt,30,Leopoldstadt,46,207,https://a0.muscache.com/pictures/25163038/1c4e...,100.0,100.0,True,True,2,1.0,0.0,2.0,180,12780.0,383,0,2.15,4.67,4.59,False,2,71.0
1,169672,Private room,2,Wieden,8,255,https://a0.muscache.com/pictures/c1a1e093-66da...,100.0,69.0,True,True,2,1.0,1.0,1.0,120,6960.0,445,3,2.59,4.77,4.77,False,1,58.0
2,169752,Entire home/apt,1,Margareten,26,251,https://a0.muscache.com/pictures/97156e49-9dbd...,100.0,99.0,True,True,3,1.0,0.0,1.0,54,3132.0,112,2,0.89,4.85,4.75,True,11,58.0
3,171835,Entire home/apt,3,Wieden,4,4,https://a0.muscache.com/pictures/2077532/45431...,100.0,67.0,True,True,3,1.0,2.0,2.0,72,5832.0,195,1,1.15,4.68,4.73,False,1,81.0
4,172530,Entire home/apt,1,Margareten,9,171,https://a0.muscache.com/pictures/1134938/f9bec...,100.0,99.0,True,True,7,1.5,3.0,4.0,30,4140.0,243,0,1.42,4.78,4.63,True,11,138.0


In [11]:
def identify_premium_properties(df, threshold=0.5):
    """Flag listings that are both pricey and well rated for their neighbourhood.

    For every neighbourhood the ``threshold`` quantile of ``price`` and of
    ``review_scores_value`` is computed. A listing is premium when its price
    AND its value score are each at or above their neighbourhood quantile.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with at least the columns ``neighbourhood``, ``price`` and
        ``review_scores_value``. It is not modified.
    threshold : float, default 0.5
        Quantile in [0, 1] used for both cut-offs.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` in their original order plus the
        columns ``price_q_threshold``, ``rating_q_threshold`` and
        ``is_premium`` (1 for premium, 0 otherwise).
    """
    threshold_names = {
        'price': 'price_q_threshold',
        'review_scores_value': 'rating_q_threshold',
    }
    # Work from the frame that was passed in, not from a notebook global,
    # so the result always describes the caller's data.
    neighbourhood_thresholds = (
        df.groupby('neighbourhood')[list(threshold_names)]
        .quantile(threshold)
        .rename(columns=threshold_names)
        .reset_index()
    )
    # A left merge keeps every listing, including any without a neighbourhood
    # match; those get NaN thresholds and therefore is_premium == 0.
    df_premium = df.merge(neighbourhood_thresholds, on='neighbourhood', how='left')
    is_premium = (
        (df_premium['price'] >= df_premium['price_q_threshold'])
        & (df_premium['review_scores_value'] >= df_premium['rating_q_threshold'])
    )
    df_premium['is_premium'] = is_premium.astype(int)
    return df_premium

In [12]:
# Label listings using the per-neighbourhood median (0.5 quantile) cut-offs.
df_premium = identify_premium_properties(df_cleaned, threshold=0.5)

In [13]:
# Row counts: cleaned frame, labelled frame, premium rows, non-premium rows.
(
    len(df_cleaned),
    len(df_premium),
    len(df_premium.loc[df_premium['is_premium'] == 1]),
    len(df_premium.loc[df_premium['is_premium'] == 0]),
)

(6794, 6794, 1907, 4887)

In [14]:
# Peek at the listing id / photo URL pairs used for the image download.
df_premium.loc[:, ['id', 'picture_url']].head(2)

Unnamed: 0,id,picture_url
0,51287,https://a0.muscache.com/pictures/25163038/1c4e...
1,169672,https://a0.muscache.com/pictures/c1a1e093-66da...


In [15]:
def download_image(url, save_path):
    """Download ``url`` with ``requests`` and write the response body to ``save_path``.

    Parameters
    ----------
    url : str
        Address of the image to fetch (10 second timeout).
    save_path : str
        File path the raw response bytes are written to.

    Returns
    -------
    bool
        True when the server answered with status 200 and the file was
        written; False for any other status, a network error, or a failure
        to write the file.
    """
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            with open(save_path, "wb") as f:
                f.write(r.content)
            print("Image found")
            return True
    except (requests.RequestException, OSError):
        # Catch only network and file-system failures. A bare ``except``
        # would also swallow KeyboardInterrupt, making a long download run
        # impossible to stop.
        print("Image Not found exception")
    print("Image Not found")
    return False

In [16]:
def download_image_from_url(url, timeout=10):
    """Fetch ``url`` and return it as a fully decoded PIL image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float, default 10
        Seconds passed to ``urlopen`` so a stalled server cannot hang the
        caller indefinitely.

    Returns
    -------
    PIL.Image.Image or None
        The decoded image, or None when the download or the decoding fails
        (a message is printed in that case).
    """
    # NOTE(review): TLS certificate verification is disabled here, as it was
    # in the original code. That is tolerable for scraping public thumbnails
    # but should not be copied to code that handles sensitive data.
    context = ssl.create_default_context()
    context.check_hostname = False  # must be cleared before verify_mode
    context.verify_mode = ssl.CERT_NONE
    try:
        with request.urlopen(url, timeout=timeout, context=context) as resp:
            buffer = resp.read()
        img = Image.open(BytesIO(buffer))
        # Image.open is lazy; decode now so a truncated or corrupt file is
        # reported here instead of failing later during resize/convert.
        img.load()
        return img
    except Exception:
        # ``Exception`` (not a bare except) keeps the best-effort behaviour
        # while still letting KeyboardInterrupt stop a long download loop.
        print("Image Not found exception")
        return None

In [17]:
def prepare_image(img, target_size):
    """Return ``img`` in RGB mode, resized to ``target_size``.

    Images that are not already RGB are converted first. Resizing uses
    nearest-neighbour resampling. ``target_size`` is handed to
    ``img.resize`` unchanged.
    """
    rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
    return rgb_img.resize(target_size, Image.NEAREST)

In [41]:
# Download a reproducible sample of listing photos into one folder per label.
sample_df = df_premium.sample(100, random_state=42)

# Create both label folders up front; without this, saving fails on a fresh
# checkout where ../images does not exist yet.
for label_dir in ("../images/premium", "../images/non_premium"):
    os.makedirs(label_dir, exist_ok=True)

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    url = row['picture_url']
    listing_id = row['id']

    if pd.isna(url):
        continue

    folder = "../images/premium" if row['is_premium'] == 1 else "../images/non_premium"
    save_path = f"{folder}/{listing_id}.jpg"
    print(save_path)

    img = download_image_from_url(url)
    if img is None:
        # The download helper has already printed the failure message.
        continue

    # Every saved image gets the same size; the tuple is passed to
    # prepare_image as target_size.
    resized_img = prepare_image(img, (300, 400))
    resized_img.save(save_path)

  0%|                                                                                                     | 0/100 [00:00<?, ?it/s]

../images/non_premium/964206565905955937.jpg


  1%|▉                                                                                            | 1/100 [00:00<00:25,  3.95it/s]

../images/non_premium/1102298634083944389.jpg
../images/non_premium/1412226864397489130.jpg


  3%|██▊                                                                                          | 3/100 [00:01<00:35,  2.73it/s]

../images/premium/977893173421197851.jpg
../images/non_premium/1468832232918817916.jpg


  5%|████▋                                                                                        | 5/100 [00:01<00:22,  4.14it/s]

../images/non_premium/1370861391980058217.jpg


  6%|█████▌                                                                                       | 6/100 [00:01<00:21,  4.29it/s]

../images/non_premium/956488003103130677.jpg
../images/non_premium/33427950.jpg


 10%|█████████▏                                                                                  | 10/100 [00:01<00:13,  6.71it/s]

../images/premium/1236285666417455881.jpg
../images/non_premium/2023633.jpg
../images/non_premium/1589958.jpg


 13%|███████████▉                                                                                | 13/100 [00:02<00:09,  9.35it/s]

../images/premium/913409543080789024.jpg
../images/non_premium/1435794393346098115.jpg
../images/premium/42038043.jpg
../images/non_premium/15871850.jpg


 15%|█████████████▊                                                                              | 15/100 [00:02<00:10,  8.30it/s]

../images/premium/881993291204129160.jpg
../images/premium/1192676874504918556.jpg


 17%|███████████████▋                                                                            | 17/100 [00:03<00:14,  5.58it/s]

../images/non_premium/558627407177666539.jpg


 20%|██████████████████▍                                                                         | 20/100 [00:03<00:12,  6.24it/s]

../images/non_premium/956602313599110587.jpg
../images/non_premium/1434686995627172832.jpg
../images/non_premium/1175375794383673365.jpg


 21%|███████████████████▎                                                                        | 21/100 [00:03<00:11,  6.63it/s]

../images/non_premium/1486832333517933332.jpg
../images/premium/1321780728981744999.jpg


 24%|██████████████████████                                                                      | 24/100 [00:04<00:11,  6.57it/s]

../images/non_premium/999771241757947525.jpg
../images/premium/13465011.jpg


 27%|████████████████████████▊                                                                   | 27/100 [00:04<00:12,  5.96it/s]

../images/premium/16901546.jpg
../images/non_premium/911320357783783296.jpg
../images/non_premium/1051980956818299849.jpg
../images/non_premium/1221691566248375820.jpg


 29%|██████████████████████████▋                                                                 | 29/100 [00:04<00:10,  6.59it/s]

../images/non_premium/20968137.jpg


 32%|█████████████████████████████▍                                                              | 32/100 [00:05<00:08,  7.57it/s]

../images/premium/1429498467503360018.jpg
../images/premium/731729250804895177.jpg
../images/non_premium/1445553358216373740.jpg


 34%|███████████████████████████████▎                                                            | 34/100 [00:05<00:07,  9.33it/s]

../images/non_premium/1276766742662315375.jpg
../images/non_premium/1475303776582160471.jpg
../images/premium/984685022149395272.jpg


 36%|█████████████████████████████████                                                           | 36/100 [00:05<00:07,  8.67it/s]

../images/non_premium/39502609.jpg


 38%|██████████████████████████████████▉                                                         | 38/100 [00:06<00:09,  6.75it/s]

../images/non_premium/1365898285951274265.jpg
../images/premium/1477512720964383197.jpg
../images/non_premium/927656533300967570.jpg


 40%|████████████████████████████████████▊                                                       | 40/100 [00:07<00:15,  3.95it/s]

../images/non_premium/1305125389268437369.jpg
../images/non_premium/1257354199535563423.jpg


 43%|███████████████████████████████████████▌                                                    | 43/100 [00:07<00:11,  4.87it/s]

../images/non_premium/1269430943941687172.jpg
../images/non_premium/52429687.jpg


 44%|████████████████████████████████████████▍                                                   | 44/100 [00:08<00:16,  3.39it/s]

../images/non_premium/891432288580421628.jpg


 45%|█████████████████████████████████████████▍                                                  | 45/100 [00:08<00:16,  3.41it/s]

../images/non_premium/588963352504958123.jpg


 47%|███████████████████████████████████████████▏                                                | 47/100 [00:09<00:15,  3.47it/s]

../images/non_premium/42187036.jpg
../images/non_premium/23162465.jpg


 49%|█████████████████████████████████████████████                                               | 49/100 [00:09<00:12,  4.00it/s]

../images/non_premium/1399940204075102187.jpg
../images/non_premium/53804731.jpg


 50%|██████████████████████████████████████████████                                              | 50/100 [00:10<00:20,  2.48it/s]

../images/premium/41005680.jpg


 52%|███████████████████████████████████████████████▊                                            | 52/100 [00:11<00:18,  2.64it/s]

../images/premium/1377698155278292914.jpg
../images/non_premium/1444796147318067271.jpg
../images/non_premium/1472930121083833371.jpg


 55%|██████████████████████████████████████████████████▌                                         | 55/100 [00:12<00:17,  2.53it/s]

Image Not found exception
None
../images/non_premium/15533110.jpg
../images/non_premium/659327138159218759.jpg


 58%|█████████████████████████████████████████████████████▎                                      | 58/100 [00:12<00:09,  4.52it/s]

../images/non_premium/17707210.jpg
../images/non_premium/39907655.jpg
../images/premium/1249060900412409247.jpg


 59%|██████████████████████████████████████████████████████▎                                     | 59/100 [00:13<00:09,  4.22it/s]

../images/non_premium/1047662301759564004.jpg


 61%|████████████████████████████████████████████████████████                                    | 61/100 [00:13<00:11,  3.46it/s]

../images/non_premium/41933968.jpg
../images/non_premium/1375894675730588394.jpg


 64%|██████████████████████████████████████████████████████████▉                                 | 64/100 [00:14<00:07,  4.69it/s]

../images/non_premium/30721228.jpg
../images/premium/796347626660284330.jpg
../images/non_premium/1263574939309463535.jpg


 66%|████████████████████████████████████████████████████████████▋                               | 66/100 [00:14<00:06,  5.57it/s]

../images/non_premium/681169225057555523.jpg
../images/non_premium/1249051997028257738.jpg


 68%|██████████████████████████████████████████████████████████████▌                             | 68/100 [00:15<00:06,  5.32it/s]

../images/non_premium/50389214.jpg
../images/non_premium/13402108.jpg


 71%|█████████████████████████████████████████████████████████████████▎                          | 71/100 [00:15<00:04,  6.26it/s]

../images/non_premium/1478971935116628091.jpg
../images/non_premium/618653373034224795.jpg
../images/non_premium/3223934.jpg


 73%|███████████████████████████████████████████████████████████████████▏                        | 73/100 [00:15<00:04,  6.43it/s]

../images/non_premium/1443322399354085701.jpg
../images/non_premium/669898.jpg


 74%|████████████████████████████████████████████████████████████████████                        | 74/100 [00:16<00:05,  4.54it/s]

../images/non_premium/923694547962125347.jpg


 75%|█████████████████████████████████████████████████████████████████████                       | 75/100 [00:16<00:05,  4.64it/s]

../images/non_premium/1127540310652873661.jpg


 76%|█████████████████████████████████████████████████████████████████████▉                      | 76/100 [00:16<00:06,  3.84it/s]

../images/non_premium/1032760738873474382.jpg


 79%|████████████████████████████████████████████████████████████████████████▋                   | 79/100 [00:17<00:03,  5.69it/s]

../images/non_premium/1062575976373188289.jpg
../images/non_premium/965282187428738008.jpg
../images/non_premium/1405219418750760436.jpg


 80%|█████████████████████████████████████████████████████████████████████████▌                  | 80/100 [00:17<00:03,  5.36it/s]

../images/premium/1384762468722998035.jpg
../images/non_premium/39250647.jpg


 82%|███████████████████████████████████████████████████████████████████████████▍                | 82/100 [00:17<00:03,  4.88it/s]

../images/premium/1421841945271739538.jpg


 84%|█████████████████████████████████████████████████████████████████████████████▎              | 84/100 [00:18<00:03,  4.19it/s]

../images/premium/918514261372412148.jpg
../images/non_premium/23206608.jpg
../images/premium/1161458653209749621.jpg


 88%|████████████████████████████████████████████████████████████████████████████████▉           | 88/100 [00:19<00:01,  6.14it/s]

../images/non_premium/41903558.jpg
../images/non_premium/1162773461272327022.jpg
../images/non_premium/791418807869273736.jpg
../images/non_premium/1300144347861996852.jpg


 90%|██████████████████████████████████████████████████████████████████████████████████▊         | 90/100 [00:19<00:01,  6.13it/s]

../images/premium/766512766713762470.jpg
../images/non_premium/42189674.jpg


 93%|█████████████████████████████████████████████████████████████████████████████████████▌      | 93/100 [00:19<00:00,  7.19it/s]

../images/non_premium/652677409364288108.jpg
../images/premium/51206431.jpg


 94%|██████████████████████████████████████████████████████████████████████████████████████▍     | 94/100 [00:20<00:01,  5.08it/s]

../images/non_premium/1293449409898570457.jpg
../images/premium/690888483154865051.jpg


 96%|████████████████████████████████████████████████████████████████████████████████████████▎   | 96/100 [00:20<00:00,  5.76it/s]

../images/non_premium/1389831309311365517.jpg


 99%|███████████████████████████████████████████████████████████████████████████████████████████ | 99/100 [00:20<00:00,  6.36it/s]

../images/non_premium/1424555510816756744.jpg
../images/non_premium/44027879.jpg
../images/non_premium/34876910.jpg


100%|███████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00,  4.77it/s]
