In [1]:
pip install linearmodels

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import RandomEffects
from sklearn.preprocessing import MinMaxScaler

In [3]:
df = pd.read_csv("all_data126.csv", encoding="ISO-8859-1",parse_dates=['last_review'])

In [4]:
df.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,longitude,last_review,neighbourhood,reviews_per_month,year
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,-0.379,2019-11-06,Richmond upon Thames,2.27,2019
1,70374572,23013522,3,"Spacious room with double bed for 2, Twickenham",1,51.45867,Private room,19.0,23,131,-0.34444,2019-11-06,Hounslow,6.52,2019
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,-0.21531,2019-11-06,Hammersmith and Fulham,0.51,2019
3,57607790,29060134,1,Recently renovated fabulous four bedroom house,14,51.52716,Entire home/apt,85.0,346,5,-0.44164,2019-11-06,Hillingdon,0.4,2019
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,-0.1667,2019-11-06,Barnet,1.52,2019


In [5]:
df.dtypes

host_id                                    int64
id                                         int64
calculated_host_listings_count             int64
name                                      object
minimum_nights                             int64
latitude                                 float64
room_type                                 object
price                                    float64
availability_365                           int64
number_of_reviews                          int64
longitude                                float64
last_review                       datetime64[ns]
neighbourhood                             object
reviews_per_month                        float64
year                                       int64
dtype: object

In [6]:
df_filtered = df[df["reviews_per_month"] > 0.5]
#define active offers as those listings which could be booked at least for one night in the following 60 days at the moment of scraping.
#Use the number of reviews to estimate the number of bookings, thereby selecting properties with more than 0.5 reviews per month as effective listings
#The analysis was restricted to active listings throughout the analysis.

In [7]:
top_2_percent = df_filtered["price"].quantile(0.98)
bottom_2_percent = df_filtered["price"].quantile(0.02)
#the average price was calculated for each city and month, based on the nightly price of available listings. 
#To eliminate outlier observations, the top 2% and bottom 2% percentile of offers were not taken into consideration. 

In [8]:
df_filtered1 = df_filtered[
    (df_filtered["price"] >= bottom_2_percent) & (df_filtered["price"] <= top_2_percent)
]

In [9]:
df_filtered1.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,longitude,last_review,neighbourhood,reviews_per_month,year
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,-0.379,2019-11-06,Richmond upon Thames,2.27,2019
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,-0.21531,2019-11-06,Hammersmith and Fulham,0.51,2019
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,-0.1667,2019-11-06,Barnet,1.52,2019
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,-0.27561,2019-11-06,Kingston upon Thames,5.17,2019
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,-0.39746,2019-11-06,Harrow,2.69,2019


In [10]:
df_filtered2 = df_filtered1[df_filtered1["availability_365"] > 0]

In [11]:
df_filtered2.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,longitude,last_review,neighbourhood,reviews_per_month,year
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,-0.379,2019-11-06,Richmond upon Thames,2.27,2019
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,-0.21531,2019-11-06,Hammersmith and Fulham,0.51,2019
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,-0.1667,2019-11-06,Barnet,1.52,2019
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,-0.27561,2019-11-06,Kingston upon Thames,5.17,2019
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,-0.39746,2019-11-06,Harrow,2.69,2019


In [12]:
df_filtered2.dtypes

host_id                                    int64
id                                         int64
calculated_host_listings_count             int64
name                                      object
minimum_nights                             int64
latitude                                 float64
room_type                                 object
price                                    float64
availability_365                           int64
number_of_reviews                          int64
longitude                                float64
last_review                       datetime64[ns]
neighbourhood                             object
reviews_per_month                        float64
year                                       int64
dtype: object

In [13]:
df_filtered3 = df_filtered2[
    (df_filtered2["last_review"].dt.year >= 2019)
    & (df_filtered2["last_review"].dt.year <= 2023)
]
#Properties that received reviews between 2019 and 2023 are determined to be effective listings.

In [14]:
columns_to_normalize = ["availability_365", "number_of_reviews", "reviews_per_month"]

In [15]:
scaler = MinMaxScaler()# Creating a Normaliser Instance

In [16]:
# Normalise selected columns and create new columns

df_filtered3["availability_365" + "_normalized"] = scaler.fit_transform(
    df_filtered3[["availability_365"]]
)

# Show first few rows of data to validate results
df_filtered3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["availability_365" + "_normalized"] = scaler.fit_transform(


Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,longitude,last_review,neighbourhood,reviews_per_month,year,availability_365_normalized
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,-0.379,2019-11-06,Richmond upon Thames,2.27,2019,0.760989
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,-0.21531,2019-11-06,Hammersmith and Fulham,0.51,2019,0.868132
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,-0.1667,2019-11-06,Barnet,1.52,2019,0.379121
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,-0.27561,2019-11-06,Kingston upon Thames,5.17,2019,0.381868
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,-0.39746,2019-11-06,Harrow,2.69,2019,0.428571


In [17]:
df_filtered3["number_of_reviews" + "_normalized"] = scaler.fit_transform(
    df_filtered3[["number_of_reviews"]]
)
df_filtered3["reviews_per_month" + "_normalized"] = scaler.fit_transform(
    df_filtered3[["reviews_per_month"]]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["number_of_reviews" + "_normalized"] = scaler.fit_transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["reviews_per_month" + "_normalized"] = scaler.fit_transform(


In [18]:
df_filtered3["market_activity"] = (
    df_filtered3["availability_365_normalized"]
    + df_filtered3["number_of_reviews_normalized"]
    + df_filtered3["reviews_per_month_normalized"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["market_activity"] = (


In [19]:
df_filtered3.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,longitude,last_review,neighbourhood,reviews_per_month,year,availability_365_normalized,number_of_reviews_normalized,reviews_per_month_normalized,market_activity
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,-0.379,2019-11-06,Richmond upon Thames,2.27,2019,0.760989,0.030599,0.014943,0.806531
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,-0.21531,2019-11-06,Hammersmith and Fulham,0.51,2019,0.868132,0.005208,0.0,0.87334
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,-0.1667,2019-11-06,Barnet,1.52,2019,0.379121,0.010417,0.008575,0.398113
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,-0.27561,2019-11-06,Kingston upon Thames,5.17,2019,0.381868,0.032552,0.039565,0.453986
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,-0.39746,2019-11-06,Harrow,2.69,2019,0.428571,0.009766,0.018509,0.456846


In [20]:
df_filtered3["year_copy"] = df_filtered3["year"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["year_copy"] = df_filtered3["year"]


In [21]:
# Assumes COVID period of 2020 and 2021
df_filtered3["covid_period"] = df_filtered3["year"].isin([2020, 2021]).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["covid_period"] = df_filtered3["year"].isin([2020, 2021]).astype(int)


In [22]:
df_filtered3.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,...,last_review,neighbourhood,reviews_per_month,year,availability_365_normalized,number_of_reviews_normalized,reviews_per_month_normalized,market_activity,year_copy,covid_period
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,...,2019-11-06,Richmond upon Thames,2.27,2019,0.760989,0.030599,0.014943,0.806531,2019,0
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,...,2019-11-06,Hammersmith and Fulham,0.51,2019,0.868132,0.005208,0.0,0.87334,2019,0
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,...,2019-11-06,Barnet,1.52,2019,0.379121,0.010417,0.008575,0.398113,2019,0
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,...,2019-11-06,Kingston upon Thames,5.17,2019,0.381868,0.032552,0.039565,0.453986,2019,0
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,...,2019-11-06,Harrow,2.69,2019,0.428571,0.009766,0.018509,0.456846,2019,0


In [23]:
df_filtered3["year_2019"] = (df_filtered3["year"] == 2019).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["year_2019"] = (df_filtered3["year"] == 2019).astype(int)


In [24]:
df_filtered3["year_2023"] = (df_filtered3["year"] == 2023).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3["year_2023"] = (df_filtered3["year"] == 2023).astype(int)


In [25]:
df_filtered3.head()

Unnamed: 0,host_id,id,calculated_host_listings_count,name,minimum_nights,latitude,room_type,price,availability_365,number_of_reviews,...,reviews_per_month,year,availability_365_normalized,number_of_reviews_normalized,reviews_per_month_normalized,market_activity,year_copy,covid_period,year_2019,year_2023
0,49602995,9582415,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,Private room,35.0,278,47,...,2.27,2019,0.760989,0.030599,0.014943,0.806531,2019,0,1,0
2,25587547,25796711,1,Stunning Central London Apartment Close The River,7,51.47398,Entire home/apt,103.0,317,8,...,0.51,2019,0.868132,0.005208,0.0,0.87334,2019,0,1,0
4,228095684,30374086,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,Entire home/apt,85.0,139,16,...,1.52,2019,0.379121,0.010417,0.008575,0.398113,2019,0,1,0
5,234271729,31300930,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,Private room,32.0,140,50,...,5.17,2019,0.381868,0.032552,0.039565,0.453986,2019,0,1,0
6,246523174,34733139,2,room in the cottage,2,51.59863,Private room,30.0,157,15,...,2.69,2019,0.428571,0.009766,0.018509,0.456846,2019,0,1,0


In [26]:
df_filtered3 = pd.get_dummies(df_filtered3, columns=["room_type"], drop_first=True)

In [27]:
df_filtered3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149746 entries, 0 to 369768
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   host_id                         149746 non-null  int64         
 1   id                              149746 non-null  int64         
 2   calculated_host_listings_count  149746 non-null  int64         
 3   name                            149746 non-null  object        
 4   minimum_nights                  149746 non-null  int64         
 5   latitude                        149746 non-null  float64       
 6   price                           149746 non-null  float64       
 7   availability_365                149746 non-null  int64         
 8   number_of_reviews               149746 non-null  int64         
 9   longitude                       149746 non-null  float64       
 10  last_review                     149746 non-null  datetime64[n

In [28]:
start_date = pd.to_datetime('2020-03-17')
end_date = pd.to_datetime('2021-12-13')

# Create dummy variable rev_covid
df_filtered3['rev_covid'] = ((df_filtered3['last_review'] >= start_date) & (df_filtered3['last_review'] <= end_date)).astype(int)

In [29]:
df_filtered3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149746 entries, 0 to 369768
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   host_id                         149746 non-null  int64         
 1   id                              149746 non-null  int64         
 2   calculated_host_listings_count  149746 non-null  int64         
 3   name                            149746 non-null  object        
 4   minimum_nights                  149746 non-null  int64         
 5   latitude                        149746 non-null  float64       
 6   price                           149746 non-null  float64       
 7   availability_365                149746 non-null  int64         
 8   number_of_reviews               149746 non-null  int64         
 9   longitude                       149746 non-null  float64       
 10  last_review                     149746 non-null  datetime64[n

In [30]:
df_filtered3 = df_filtered3.set_index(["id", "year_copy"])

In [31]:
df_filtered3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,host_id,calculated_host_listings_count,name,minimum_nights,latitude,price,availability_365,number_of_reviews,longitude,last_review,...,number_of_reviews_normalized,reviews_per_month_normalized,market_activity,covid_period,year_2019,year_2023,room_type_Hotel room,room_type_Private room,room_type_Shared room,rev_covid
id,year_copy,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
9582415,2019,49602995,1,Single/Twin/Double Ensuite near Twickenham Sta...,1,51.44473,35.0,278,47,-0.379,2019-11-06,...,0.030599,0.014943,0.806531,0,1,0,False,True,False,0
25796711,2019,25587547,1,Stunning Central London Apartment Close The River,7,51.47398,103.0,317,8,-0.21531,2019-11-06,...,0.005208,0.0,0.87334,0,1,0,False,False,False,0
30374086,2019,228095684,3,Lovely cosy flat for 4 in Heart of North London,3,51.59118,85.0,139,16,-0.1667,2019-11-06,...,0.010417,0.008575,0.398113,0,1,0,False,False,False,0
31300930,2019,234271729,1,ALPINE GUEST HOUSE. Detached & self-contained,1,51.3796,32.0,140,50,-0.27561,2019-11-06,...,0.032552,0.039565,0.453986,0,1,0,False,True,False,0
34733139,2019,246523174,2,room in the cottage,2,51.59863,30.0,157,15,-0.39746,2019-11-06,...,0.009766,0.018509,0.456846,0,1,0,False,True,False,0


In [32]:
df_filtered3["log_price"] = np.log(df_filtered3["price"])

In [33]:
# Selection of independent and dependent variables

independent_vars = [
    "market_activity",
    "covid_period",
    "year_2019",
    "year_2023",
    "room_type_Shared room",
    "room_type_Private room",'rev_covid'
]
dependent_var = "log_price"  # previously calculated composite indices

# Prepare the dependent and independent variables for the model
y = df_filtered3[dependent_var]
X = df_filtered3[independent_vars]

# Add constant term
X = sm.add_constant(X)

# Random effects modelling
model = RandomEffects(y, X)
results = model.fit()

# Output model results
print(results)

                        RandomEffects Estimation Summary                        
Dep. Variable:              log_price   R-squared:                        0.7910
Estimator:              RandomEffects   R-squared (Between):              0.4742
No. Observations:              149746   R-squared (Within):               0.2129
Date:                Sat, Dec 16 2023   R-squared (Overall):              0.4678
Time:                        21:49:57   Log-likelihood                    8848.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   8.094e+04
Entities:                      100468   P-value                           0.0000
Avg Obs:                       1.4905   Distribution:                F(7,149738)
Min Obs:                       1.0000                                           
Max Obs:                       5.0000   F-statistic (robust):          1.462e+04
                            

In [34]:
# Create the Data_clean folder
data_folder = "Data_clean"
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Export DataFrame to CSV file
output_file = os.path.join(data_folder, "cleaned_data_12061010.csv")
df_filtered3.to_csv(output_file, index=False)