# Data Preprocessing
---

## Importing Libraries


In [23]:
import pandas as pd
import json
import requests
import tqdm
import time
from requests.exceptions import ConnectionError, HTTPError
import math


## Dataset Description
Description of the available columns in the dataset:
- **realSum:** The total price of the Airbnb listing. (Numeric)
- **room_type:** The type of room being offered (e.g. private, shared, etc.). (Categorical)
- **room_shared:** Whether the room is shared or not. (Boolean)
- **room_private:** Whether the room is private or not. (Boolean)
- **person_capacity:** The maximum number of people that can stay in the room. (Numeric)
- **host_is_superhost:** Whether the host is a superhost or not. (Boolean)
- **multi:** Whether the listing is for multiple rooms or not. (Boolean)
- **biz:** Whether the listing is for business purposes or not. (Boolean)
- **cleanliness_rating:** The cleanliness rating of the listing. (Numeric)
- **guest_satisfaction_overall:** The overall guest satisfaction rating of the listing. (Numeric)
- **bedrooms:** The number of bedrooms in the listing. (Numeric)
- **dist:** The distance from the city center. (Numeric)
- **metro_dist:** The distance from the nearest metro station. (Numeric)
- **attr_index:** attraction index of the listing location (Numeric)
- **attr_index_norm:** 0-100 normalized attraction index (Numeric)
- **rest_index:** restaurant index of the listing location (Numeric)
- **rest_index_norm:** 0-100 normalized restaurant index (Numeric)
- **lng:** The longitude of the listing. (Numeric)
- **lat:** The latitude of the listing. (Numeric)
- **city:** The city of the listing. (Categorical)
- **time_of_week:** Whether the listing is for weekdays or the weekend. (Boolean)

In [24]:
raw_dataset_path = "./Dataset/Raw-Dataset/"
processed_dataset_path = "./Dataset/Processed-Dataset/"

amsterdam_weekday_dataset = raw_dataset_path + "amsterdam_weekdays.csv"

amsterdam_weekday_dataset_pandas = pd.read_csv(amsterdam_weekday_dataset)
amsterdam_weekday_dataset_pandas.describe()

Unnamed: 0.1,Unnamed: 0,realSum,person_capacity,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
count,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0,1103.0
mean,551.0,545.020526,2.792384,0.30825,0.115141,9.461469,94.362647,1.282865,2.841621,1.089439,271.009899,14.350154,341.541187,23.799081,4.891158,52.364858
std,318.552978,416.974314,1.044151,0.46198,0.319336,0.798201,6.089691,0.740178,2.123245,0.836546,197.04689,10.433764,236.611077,16.4874,0.038882,0.019467
min,0.0,128.887118,2.0,0.0,0.0,4.0,20.0,0.0,0.015059,0.03653,40.931415,2.167346,50.877318,3.545205,4.7755,52.2911
25%,275.5,309.797764,2.0,0.0,0.0,9.0,92.0,1.0,1.302058,0.462983,127.909866,6.772912,163.469245,11.390772,4.871,52.35458
50%,551.0,430.248635,2.0,0.0,0.0,10.0,96.0,1.0,2.341366,0.85601,208.180311,11.023286,260.257028,18.135084,4.89001,52.36559
75%,826.5,657.324303,4.0,1.0,0.0,10.0,98.0,2.0,3.648138,1.510629,386.442241,20.462373,469.290623,32.700846,4.907315,52.37526
max,1102.0,7782.907225,6.0,1.0,1.0,10.0,100.0,5.0,11.1871,4.411915,1888.550428,100.0,1435.102401,100.0,5.01077,52.42348


In [25]:
amsterdam_weekday_dataset_pandas.head()

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
0,0,194.033698,Private room,False,True,2.0,False,1,0,10.0,93.0,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772
1,1,344.245776,Private room,False,True,4.0,False,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432
2,2,264.101422,Private room,False,True,2.0,False,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103
3,3,433.529398,Private room,False,True,4.0,False,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663
4,4,485.552926,Private room,False,True,2.0,True,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508


In [26]:
amsterdam_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "amsterdam_weekends.csv")
athens_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "athens_weekdays.csv")
athens_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "athens_weekends.csv")
barcelona_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "barcelona_weekdays.csv")
barcelona_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "barcelona_weekends.csv")
berlin_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "berlin_weekdays.csv")
berlin_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "berlin_weekends.csv")
budapest_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "budapest_weekdays.csv")
budapest_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "budapest_weekends.csv")
lisbon_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "lisbon_weekdays.csv")
lisbon_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "lisbon_weekends.csv")
london_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "london_weekdays.csv")
london_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "london_weekends.csv")
paris_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "paris_weekdays.csv")
paris_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "paris_weekends.csv")
rome_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "rome_weekdays.csv")
rome_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "rome_weekends.csv")
vienna_weekday_dataset_pandas = pd.read_csv(raw_dataset_path + "vienna_weekdays.csv")
vienna_weekend_dataset_pandas = pd.read_csv(raw_dataset_path + "vienna_weekends.csv")

list_of_cities_weekend = [amsterdam_weekend_dataset_pandas, athens_weekend_dataset_pandas, barcelona_weekend_dataset_pandas, berlin_weekend_dataset_pandas, budapest_weekend_dataset_pandas, lisbon_weekend_dataset_pandas, london_weekend_dataset_pandas, paris_weekend_dataset_pandas, rome_weekend_dataset_pandas, vienna_weekend_dataset_pandas]

list_of_cities_weekday = [amsterdam_weekday_dataset_pandas, athens_weekday_dataset_pandas, barcelona_weekday_dataset_pandas, berlin_weekday_dataset_pandas, budapest_weekday_dataset_pandas, lisbon_weekday_dataset_pandas, london_weekday_dataset_pandas, paris_weekday_dataset_pandas, rome_weekday_dataset_pandas, vienna_weekday_dataset_pandas]

city_names = ["amsterdam", "athens", "barcelona", "berlin", "budapest", "lisbon", "london", "paris", "rome", "vienna"]

## Getting Location Information
Getting Information such as
- Nearest Public Transport
- Which Part of City it is in?

In [27]:
# Google API Setup
googleapi = json.load(open('secrets.json'))
googleapi = googleapi["secret"]

In [28]:
# Function to make a request with retries
def make_request_with_retries(url, backoff_factor=0.3):
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response
        except (ConnectionError, HTTPError) as e:
            print(f"Error: {e}. Retrying...")
            time.sleep(backoff_factor)


In [29]:
# Iterate over the rows of the dataframe for Weekends
for i, panda in enumerate(list_of_cities_weekend):
    city = city_names[i]
    print("Processing city: " + city_names[i])
    panda['district'] = "Unknown"
    for index, row in tqdm.tqdm(panda.iterrows(), total=panda.shape[0], desc="Processing rows", leave=True):
        latitute = row['lat']
        longitude = row['lng']
        request_api = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={latitute:.6f},{longitude:.6f}&key={googleapi}"
        
        # Make the request with retries
        response = make_request_with_retries(request_api)
        response_json = response.json()
        
        # Get the District / Neighbourhood
        district = "Unknown"
        if city in ["amsterdam", "barcelona", "berlin", "budapest", "paris", "rome", "vienna"]:
            for result in response_json['results']:
                for address_component in result['address_components']:
                    if 'sublocality' in address_component['types'] or "sublocality_level_1" in address_component['types']:
                        district = address_component['long_name']
                        break
        elif city in ["athens", "lisbon", "london"]:
            for result in response_json['results']:
                for address_component in result['address_components']:
                    if 'administrative_area_level_3' in address_component['types']:
                        district = address_component['long_name']
                        break
        
        # Add the district to the dataframe
        panda.at[index, 'district'] = district
    
    # Save the dataframe
    path = processed_dataset_path + city_names[i] + "_weekends.csv"
    panda.to_csv(path, index=False)
    break
    

Processing city: amsterdam


Processing rows: 100%|██████████| 977/977 [03:28<00:00,  4.69it/s]


In [None]:
amsterdam_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "amsterdam_weekends.csv")
athens_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "athens_weekends.csv")
barcelona_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "barcelona_weekends.csv")
berlin_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "berlin_weekends.csv")
budapest_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "budapest_weekends.csv")
lisbon_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "lisbon_weekends.csv")
london_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "london_weekends.csv")
paris_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "paris_weekends.csv")
rome_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "rome_weekends.csv")
vienna_weekend_dataset_pandas = pd.read_csv(processed_dataset_path + "vienna_weekends.csv")

In [None]:
# Print out all the unique Districts for each city
print("Unique Districts for Amsterdam")
print(amsterdam_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Athens")
print(athens_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Barcelona")
print(barcelona_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Berlin")
print(berlin_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Budapest")
print(budapest_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Lisbon")
print(lisbon_weekend_dataset_pandas['district'].unique())
print("Unique Districts for London")
print(london_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Paris")
print(paris_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Rome")
print(rome_weekend_dataset_pandas['district'].unique())
print("Unique Districts for Vienna")
print(vienna_weekend_dataset_pandas['district'].unique())

Unique Districts for Amsterdam
['Amsterdam-Zuid' 'Amsterdam-Oost' 'Amsterdam-Centrum' 'Westpoort'
 'Amsterdam-West' 'Amsterdam-Noord' 'Amsterdam Nieuw-West'
 'Amsterdam-Zuidoost']


In [58]:
# Add the Districts also for the Weekdays Dataset
raw_dataset_path = "./Dataset/Raw-Dataset/"
processed_dataset_path = "./Dataset/Processed-Dataset/"
for city in city_names:
    weekday_city_dataset = pd.read_csv(raw_dataset_path + city + "_weekdays.csv")
    weekend_city_dataset = pd.read_csv(processed_dataset_path + city + "_weekends.csv")
    print("City: " + city)
    print(raw_dataset_path + city + "_weekdays.csv")
    print(processed_dataset_path + city + "_weekends.csv")
    # print("Weekday Dataset Shape: " + str(weekday_city_dataset.shape))
    # print("Weekend Dataset Shape: " + str(weekend_city_dataset.shape))
    
    weekday_city_dataset['district'] = "Unknown"
    for i, r in tqdm.tqdm(weekday_city_dataset.iterrows(), total=weekday_city_dataset.shape[0], desc="Processing rows", leave=True):
        for index, row in weekend_city_dataset.iterrows():
            if math.isclose(row['lat'], r['lat'], abs_tol=1e-5) and math.isclose(row['lng'], r['lng'], abs_tol=1e-5):
                weekday_city_dataset.at[i, 'district'] = row['district']
                break


    
    # Drop the rows with 
    weekday_city_dataset = weekday_city_dataset[weekday_city_dataset['district'] != "Unknown"]
    
    path = processed_dataset_path + city + "_weekdays.csv"
    weekday_city_dataset.to_csv(path, index=False)
    weekday_city_dataset = pd.read_csv(path)
    print("Weekday Dataset Shape: " + str(weekday_city_dataset.shape))
    print("Weekend Dataset Shape: " + str(weekend_city_dataset.shape))
    print("\n")

City: amsterdam
./Dataset/Raw-Dataset/amsterdam_weekdays.csv
./Dataset/Processed-Dataset/amsterdam_weekends.csv


Processing rows: 100%|██████████| 1103/1103 [00:11<00:00, 96.37it/s]


Weekday Dataset Shape: (712, 21)
Weekend Dataset Shape: (977, 21)


City: athens
./Dataset/Raw-Dataset/athens_weekdays.csv
./Dataset/Processed-Dataset/athens_weekends.csv


Processing rows: 100%|██████████| 2653/2653 [01:05<00:00, 40.26it/s]


Weekday Dataset Shape: (2121, 21)
Weekend Dataset Shape: (2627, 21)


City: barcelona
./Dataset/Raw-Dataset/barcelona_weekdays.csv
./Dataset/Processed-Dataset/barcelona_weekends.csv


Processing rows: 100%|██████████| 1555/1555 [00:22<00:00, 69.02it/s]


Weekday Dataset Shape: (968, 21)
Weekend Dataset Shape: (1278, 21)


City: berlin
./Dataset/Raw-Dataset/berlin_weekdays.csv
./Dataset/Processed-Dataset/berlin_weekends.csv


Processing rows: 100%|██████████| 1284/1284 [00:15<00:00, 82.83it/s]


Weekday Dataset Shape: (957, 21)
Weekend Dataset Shape: (1200, 21)


City: budapest
./Dataset/Raw-Dataset/budapest_weekdays.csv
./Dataset/Processed-Dataset/budapest_weekends.csv


Processing rows: 100%|██████████| 2074/2074 [00:41<00:00, 50.22it/s]


Weekday Dataset Shape: (1395, 21)
Weekend Dataset Shape: (1948, 21)


City: lisbon
./Dataset/Raw-Dataset/lisbon_weekdays.csv
./Dataset/Processed-Dataset/lisbon_weekends.csv


Processing rows: 100%|██████████| 2857/2857 [01:16<00:00, 37.23it/s]


Weekday Dataset Shape: (2244, 21)
Weekend Dataset Shape: (2906, 21)


City: london
./Dataset/Raw-Dataset/london_weekdays.csv
./Dataset/Processed-Dataset/london_weekends.csv


Processing rows: 100%|██████████| 4614/4614 [04:15<00:00, 18.04it/s]


Weekday Dataset Shape: (4014, 21)
Weekend Dataset Shape: (5379, 21)


City: paris
./Dataset/Raw-Dataset/paris_weekdays.csv
./Dataset/Processed-Dataset/paris_weekends.csv


Processing rows: 100%|██████████| 3130/3130 [01:55<00:00, 27.09it/s]


Weekday Dataset Shape: (2610, 21)
Weekend Dataset Shape: (3558, 21)


City: rome
./Dataset/Raw-Dataset/rome_weekdays.csv
./Dataset/Processed-Dataset/rome_weekends.csv


Processing rows: 100%|██████████| 4492/4492 [03:28<00:00, 21.57it/s]


Weekday Dataset Shape: (3506, 21)
Weekend Dataset Shape: (4535, 21)


City: vienna
./Dataset/Raw-Dataset/vienna_weekdays.csv
./Dataset/Processed-Dataset/vienna_weekends.csv


Processing rows: 100%|██████████| 1738/1738 [00:32<00:00, 52.82it/s]

Weekday Dataset Shape: (1380, 21)
Weekend Dataset Shape: (1799, 21)







In [None]:
# Remove the Listings which are not in the Weekdays for Weekends
for city in city_names:
    weekend_city_dataset = pd.read_csv(processed_dataset_path + city + "_weekends.csv")
    weekday_city_dataset = pd.read_csv(processed_dataset_path + city + "_weekdays.csv")
    
    # Remove the listings which are not in the Weekdays and match them with lat ang lng at the same time
    weekend_city_dataset = weekend_city_dataset[weekend_city_dataset.apply(lambda x: (weekday_city_dataset['lat'] == x['lat']) & (weekday_city_dataset['lng'] == x['lng']), axis=1)]
    
    print("\n")
            
            

(712, 21)
(977, 21)


(2121, 21)
(2627, 21)


(968, 21)
(1278, 21)


(957, 21)
(1200, 21)


(1395, 21)
(1948, 21)


(2244, 21)
(2906, 21)


(4014, 21)
(5379, 21)


(2610, 21)
(3558, 21)


(3506, 21)
(4535, 21)


(1380, 21)
(1799, 21)


