In [29]:
import pandas as pd
import json
import numpy as np
import seaborn as sn
import pickle

from matplotlib import pyplot as plt
from io import StringIO
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, f1_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [30]:
df_listings_details = pd.read_csv('../data/listings_detailed.csv')
df_listings = pd.read_csv('../data/listings.csv')

In [31]:
pd.set_option('display.max_columns', None)

In [32]:
df_listings_details.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,40625,https://www.airbnb.com/rooms/40625,20250914152932,2025-09-14,city scrape,"Near Palace Schönbrunn, Apt. 1",Welcome to my Apt. 1!<br /><br />This is a 2be...,The neighbourhood offers plenty of restaurants...,https://a0.muscache.com/pictures/11509144/d55c...,175131,https://www.airbnb.com/users/show/175131,Ingela,2010-07-20,"Vienna, Austria",I´m originally from Sweden but have been livin...,within an hour,100%,95%,,https://a0.muscache.com/im/users/175131/profil...,https://a0.muscache.com/im/users/175131/profil...,Rudolfsheim-Fünfhaus,15.0,18.0,"['email', 'phone']",t,t,"Vienna, Austria",Rudolfsheim-Fnfhaus,,48.18434,16.32701,Entire rental unit,Entire home/apt,6,1.0,1 bath,2.0,3.0,"[""Smoke alarm"", ""Pack \u2019n play/Travel crib...",$140.00,1,365,1.0,30.0,365.0,365.0,16.3,365.0,,t,18,34,61,137,2025-09-14,225,8,1,69,15,48,6720.0,2010-08-04,2025-08-20,4.86,4.91,4.89,4.89,4.94,4.61,4.73,,f,14,13,1,0,1.22
1,51287,https://www.airbnb.com/rooms/51287,20250914152932,2025-09-14,city scrape,little studio- next to citycenter- wifi- nice ...,small studio in new renovated old house and ve...,The neighbourhood has a lot of very nice littl...,https://a0.muscache.com/pictures/25163038/1c4e...,166283,https://www.airbnb.com/users/show/166283,Hannes,2010-07-14,"Vienna, Austria",I am open minded and like travelling myself. I...,within an hour,100%,100%,t,https://a0.muscache.com/im/users/166283/profil...,https://a0.muscache.com/im/users/166283/profil...,Leopoldstadt,2.0,3.0,"['email', 'phone']",t,t,"Vienna, Austria",Leopoldstadt,,48.21778,16.37847,Entire rental unit,Entire home/apt,2,1.0,1 bath,0.0,2.0,"[""Smoke alarm"", ""Stove"", ""Coffee maker"", ""Drye...",$71.00,30,180,30.0,30.0,180.0,180.0,30.0,180.0,,t,0,0,27,207,2025-09-14,383,3,0,46,9,180,12780.0,2011-01-27,2025-07-04,4.67,4.78,4.51,4.92,4.95,4.87,4.59,,f,2,2,0,0,2.15


In [8]:
listings_selected_columns = ['id', 'name_x', 'room_type_x', 'minimum_nights_y', 
   'availability_eoy', 'availability_365_x', 
    'estimated_occupancy_l365d', 'estimated_revenue_l365d',
    'number_of_reviews_y', 'number_of_reviews_l30d', 'reviews_per_month_y', 
    'review_scores_rating', 'review_scores_accuracy','review_scores_checkin', 'review_scores_cleanliness',
    'review_scores_communication', 'review_scores_location', 'review_scores_value', 
    'instant_bookable', 'calculated_host_listings_count_y', 'price_x']


In [10]:
listings_selected_columns = ['id', 'name', 'room_type', 'minimum_nights', 
   'availability_eoy', 'availability_365', 
    'estimated_occupancy_l365d', 'estimated_revenue_l365d',
    'number_of_reviews', 'number_of_reviews_l30d', 'reviews_per_month', 
    'review_scores_rating', 'review_scores_accuracy','review_scores_checkin', 'review_scores_cleanliness',
    'review_scores_communication', 'review_scores_location', 'review_scores_value', 
    'instant_bookable', 'calculated_host_listings_count', 'price']


In [33]:
df_merged_listings = pd.concat([df_listings, df_listings_details], axis=1)

In [34]:
df_regression_master = df_regression_master.loc[:, ~df_regression_master.columns.duplicated()]

In [37]:
listings_selected_columns = ['room_type', 'minimum_nights', 
   'availability_eoy', 'availability_365', 
    'estimated_occupancy_l365d', 'estimated_revenue_l365d',
    'number_of_reviews', 'number_of_reviews_l30d', 'reviews_per_month', 
    'review_scores_rating', 'review_scores_value', 
    'instant_bookable', 'calculated_host_listings_count', 'price']


In [38]:
df_regression_master = df_merged_listings[listings_selected_columns]

In [41]:
df_regression_master.head(1)

Unnamed: 0,room_type,room_type.1,minimum_nights,minimum_nights.1,availability_eoy,availability_365,availability_365.1,estimated_occupancy_l365d,estimated_revenue_l365d,number_of_reviews,number_of_reviews.1,number_of_reviews_l30d,reviews_per_month,reviews_per_month.1,review_scores_rating,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count.1,price,price.1
0,Entire home/apt,Entire home/apt,1,1,69,137,137,48,6720.0,225,225,1,1.22,1.22,4.86,4.73,f,14,14,140.0,$140.00


In [5]:
def data_cleanup(df, new_columns):
    df_regression_master = df[new_columns]
    df_regression_master = df_regression_master.loc[:, ~df_regression_master.columns.duplicated()]
    
    categorical= list(df_regression_master.dtypes[df_regression_master.dtypes=='object'].index)
    numerical = list(df_regression_master.dtypes[df_regression_master.dtypes!='object'].index)
    
    for c in categorical:
        df_regression_master = df_regression_master[df_regression_master[c].notna()]
        
    for c in numerical:
        df_regression_master.loc[:,c] = df_regression_master.loc[:,c].fillna(0)
    
    return df_regression_master

In [6]:
df_merged_listings = pd.concat([df_listings, df_listings_details], axis=1)

In [11]:
df_regression_master = data_cleanup(df_merged_listings, listings_selected_columns)

In [12]:
df_regression_master.head(2)

Unnamed: 0,id,name,room_type,minimum_nights,availability_eoy,availability_365,estimated_occupancy_l365d,estimated_revenue_l365d,number_of_reviews,number_of_reviews_l30d,reviews_per_month,review_scores_rating,review_scores_accuracy,review_scores_checkin,review_scores_cleanliness,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,price
0,40625,"Near Palace Schönbrunn, Apt. 1",Entire home/apt,1,69,137,48,6720.0,225,1,1.22,4.86,4.91,4.89,4.89,4.94,4.61,4.73,f,14,140.0
1,51287,little studio- next to citycenter- wifi- nice ...,Entire home/apt,30,46,207,180,12780.0,383,0,2.15,4.67,4.78,4.92,4.51,4.95,4.87,4.59,f,2,71.0


In [13]:
df_regression_master.columns = df_regression_master.columns.str.lower()
df_regression_master.columns = df_regression_master.columns.str.replace(' ','_')

In [14]:
df_regression_master.head(2)

Unnamed: 0,id,name,room_type,minimum_nights,availability_eoy,availability_365,estimated_occupancy_l365d,estimated_revenue_l365d,number_of_reviews,number_of_reviews_l30d,reviews_per_month,review_scores_rating,review_scores_accuracy,review_scores_checkin,review_scores_cleanliness,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,price
0,40625,"Near Palace Schönbrunn, Apt. 1",Entire home/apt,1,69,137,48,6720.0,225,1,1.22,4.86,4.91,4.89,4.89,4.94,4.61,4.73,f,14,140.0
1,51287,little studio- next to citycenter- wifi- nice ...,Entire home/apt,30,46,207,180,12780.0,383,0,2.15,4.67,4.78,4.92,4.51,4.95,4.87,4.59,f,2,71.0


In [20]:
interested_features = ['minimum_nights', 'availability_eoy','number_of_reviews', 'number_of_reviews_l30d', 'reviews_per_month', 'review_scores_rating', 'instant_bookable', 'calculated_host_listings_count']
target = 'price'

In [21]:
X_full = df_regression_master[interested_features]
y_full = df_regression_master[target]

X_train_val, X_test, y_train_val, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
len(X_train), len(X_val), len(X_test)

(8473, 2825, 2825)

In [22]:
lin_reg = LinearRegression()

In [23]:
lin_reg.fit(X_train, y_train)

ValueError: could not convert string to float: 't'