In [65]:
from pathlib import Path
import numpy as np
import pandas as pd

In [66]:
print(pd.__version__)

2.3.2


#### Columns and reading data

In [67]:
cols = ['id', 'last_scraped', 'host_id', 'host_since',  'host_is_superhost', 
    'host_listings_count', 'host_total_listings_count', 
    'latitude', 'longitude', 
    'property_type', 'room_type', 'accommodates', 
    'price', 'minimum_nights', 'maximum_nights', 
    'minimum_minimum_nights','maximum_minimum_nights',
    'minimum_maximum_nights','maximum_maximum_nights',
    'minimum_nights_avg_ntm','maximum_nights_avg_ntm',
    'availability_365', 'number_of_reviews', 
    'first_review', 'last_review', 'review_scores_rating', 
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month']

In [68]:
# Set download URL
ymd  = '20250615'
city = 'London'
host = 'https://orca.casa.ucl.ac.uk'
url  = f'{host}/~jreades/data/{ymd}-{city}-listings.csv.gz'

In [69]:
# your code here
df = pd.read_csv(url, compression='gzip', low_memory=False, usecols=cols)
print(f"Data frame is {df.shape[0]:,} x {df.shape[1]}")

Data frame is 96,651 x 31


#### Removing rows with >5 NA values

In [70]:
probs = df.isnull().sum(axis=1)
df.isnull().sum(axis=1).sort_values(ascending=False).head(10)

5155     8
3646     8
4622     8
30641    8
5535     8
6122     8
58822    7
87970    7
86012    7
89683    7
dtype: int64

In [71]:
print(f"df contains {df.shape[0]:,} rows.")
cutoff = 5
df.drop(probs[probs > cutoff].index, inplace=True)
print(f"df contains {df.shape[0]:,} rows.")

df contains 96,651 rows.
df contains 96,563 rows.


In [72]:
df.isnull().sum(axis=0).sort_values(ascending=False) 

price                                           33885
reviews_per_month                               25076
first_review                                    25076
review_scores_rating                            25076
last_review                                     25076
host_is_superhost                                1747
host_since                                         26
host_listings_count                                26
host_total_listings_count                          26
maximum_nights_avg_ntm                              0
availability_365                                    0
number_of_reviews                                   0
calculated_host_listings_count                      0
maximum_maximum_nights                              0
calculated_host_listings_count_entire_homes         0
calculated_host_listings_count_private_rooms        0
calculated_host_listings_count_shared_rooms         0
minimum_nights_avg_ntm                              0
id                          

#### Dealing with column types

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96563 entries, 0 to 96650
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            96563 non-null  int64  
 1   last_scraped                                  96563 non-null  object 
 2   host_id                                       96563 non-null  int64  
 3   host_since                                    96537 non-null  object 
 4   host_is_superhost                             94816 non-null  object 
 5   host_listings_count                           96537 non-null  float64
 6   host_total_listings_count                     96537 non-null  float64
 7   latitude                                      96563 non-null  float64
 8   longitude                                     96563 non-null  float64
 9   property_type                                 96563 non-null  obje

In [74]:
# boolean values
bools = ['host_is_superhost']
for b in bools:
    print(f"Converting {b} to boolean")
    df[b] = df[b].replace({'f':False, 't':True}).astype('bool')

# categories
cats = ['property_type','room_type']
for c in cats:
    print(f"Converting {c} to categort")
    df[c] = df[c].astype('category')

# dates
dates = ['host_since','first_review','last_review']
for d in dates:
    print(f"Converting {d} to date")
    df[d] = pd.to_datetime(df[d])

# strings
money = ['price']
for m in money:
    print(f"Converting {m} to float")
    df[m] = (
        df[m]
        .astype(str)            # convert to string so .str works
        .str.replace('$', '', regex=False)
        .str.replace(',', '')
        .replace('nan', float('nan'))  # optional: fix 'nan' strings
        .astype(float))

# integers
ints  = ['id','host_id','host_listings_count','host_total_listings_count','accommodates', 'minimum_nights','maximum_nights','availability_365']
for i in ints:
    print(f"Converting {i} to integer")
    try:
        df[i] = df[i].astype('float').astype('int')
    except ValueError as e:
        print("  - !!!Converting to unsigned 16-bit integer!!!")
        df[i] = df[i].astype('float').astype(pd.UInt16Dtype())


Converting host_is_superhost to boolean
Converting property_type to categort
Converting room_type to categort
Converting host_since to date
Converting first_review to date
Converting last_review to date
Converting price to float
Converting id to integer
Converting host_id to integer
Converting host_listings_count to integer
  - !!!Converting to unsigned 16-bit integer!!!
Converting host_total_listings_count to integer
  - !!!Converting to unsigned 16-bit integer!!!
Converting accommodates to integer
Converting minimum_nights to integer
Converting maximum_nights to integer
Converting availability_365 to integer


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96563 entries, 0 to 96650
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            96563 non-null  int64         
 1   last_scraped                                  96563 non-null  object        
 2   host_id                                       96563 non-null  int64         
 3   host_since                                    96537 non-null  datetime64[ns]
 4   host_is_superhost                             96563 non-null  bool          
 5   host_listings_count                           96537 non-null  UInt16        
 6   host_total_listings_count                     96537 non-null  UInt16        
 7   latitude                                      96563 non-null  float64       
 8   longitude                                     96563 non-null  float64  