<a href="https://colab.research.google.com/github/dernameistegal/airbnb_price/blob/main/data_utils/data_preparation/SavingDataInDrive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This simple introduction to the Airbnb data set(s) gives a short overview of the available data. The city used here is Vienna (see the download cell below), so if you want to run the same notebook for a different city you need to change a few minor details, such as the download URLs. Otherwise, if you have downloaded all necessary data sets and run this notebook in the same directory, it should run smoothly.

### Index
1. Load data set
2. Price analysis
    * (Inspect reviews)
3. Main file (listings.csv.gz)
4. "Analyze" Images
5. "Analyze" Reviews
6. Calendar file
7. Neighbourhoods GeoJSON file

In [2]:
#@title imports
%%capture
!pip install transformers
!pip install geopandas
import json
import os
import math
import pandas as pd
import gzip
from PIL import Image
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
import requests
from io import BytesIO
import matplotlib.image as mpimg
from tqdm import tqdm
from PIL import Image
import numpy as np

from shapely.geometry import Point, Polygon

import seaborn as sns

from transformers import pipeline

import folium
from folium.plugins import FastMarkerCluster
from branca.colormap import LinearColormap

In [3]:
#@title mount drive
# Mount Google Drive at /content/drive; the cells below read and write files
# under /content/drive/MyDrive. Only available inside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# make directories in drive
# !mkdir -p /content/drive/MyDrive/data/data1/
# !mkdir -p  /content/drive/MyDrive/data/hostpics/
# !mkdir -p  /content/drive/MyDrive/data/thumbnails/

In [None]:
# load data to drive
%%capture
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/data/listings.csv.gz
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/data/calendar.csv.gz
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/data/reviews.csv.gz
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/visualisations/listings.csv
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/visualisations/reviews.csv
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/visualisations/neighbourhoods.csv
!wget http://data.insideairbnb.com/austria/vienna/vienna/2021-11-07/visualisations/neighbourhoods.geojson

In [None]:
# Read the downloaded tables from the working directory.

# summary tables (plain csv)
listings = pd.read_csv("listings.csv")
reviews = pd.read_csv("reviews.csv")

# detailed tables (gzipped csv); the detailed listings are keyed by their "id"
# column, so every later `listings_meta.loc[...]` lookup is by listing id
listings_meta = pd.read_csv("listings.csv.gz").set_index("id")
reviews_meta = pd.read_csv("reviews.csv.gz")
calendar = pd.read_csv("calendar.csv.gz")

# Save Images (only run once)

In [None]:

# descriptive statistics for availability of pictures
hostpic_url_missing = listings_meta["host_picture_url"].isnull()
thumbnail_url_missing = listings_meta["picture_url"].isnull()

# number of listings without a host picture url / without a thumbnail url
n_no_hostpic = hostpic_url_missing.sum()
n_no_thumbnail = thumbnail_url_missing.sum()

# despite its name, this counts the distinct hosts that have no host picture url
hosts_without_hostpic = listings_meta["host_id"][hostpic_url_missing]
n_hosts_no_thumbnail = len(np.unique(hosts_without_hostpic))

print(f"{n_no_hostpic} listings have no hostpic. In total, {n_hosts_no_thumbnail} hosts have no hostpic. {n_no_thumbnail} listings have no thumbnail.")

22 listings have no hostpic. In total, 6 hosts have no hostpic. 0 listings have no thumbnail.


In [None]:
# Download every available host picture, resize it to IMAGE_SIZE and store the
# pixel data as "hostpic<listing id>.npy" on Drive.
#
# NOTE(review): IMAGE_SIZE is not defined in the visible part of this notebook.
# It has to be set before this cell runs (PIL's resize expects a
# (width, height) tuple) -- TODO confirm the intended value.

# the loop relies on the listing id being the index (see the loading cell)
assert listings_meta.index.name == "id", "listings_meta must be indexed by listing id"

# looked up once and outside the try block, so that a missing IMAGE_SIZE fails
# loudly instead of flagging every single picture as broken
target_size = IMAGE_SIZE

# instantiate list of all ids where url does not work
pic_malfunction = []

# save hostpics that are available
for listing_id, url in tqdm(listings_meta["host_picture_url"].items(), total=len(listings_meta)):

    # check if url is not available
    if pd.isna(url):
        continue

    # scrape url
    response = requests.get(url)

    # check if url does not work (the response cannot be decoded or resized)
    try:
        img_plot = Image.open(BytesIO(response.content)).resize(target_size)
    except Exception:
        # `Exception` rather than a bare except: interrupting the kernel must
        # still be able to stop this long-running loop
        pic_malfunction.append(int(listing_id))
        continue

    # save rgb data
    rgb_data = np.array(img_plot)
    save_path = "/content/drive/MyDrive/data/hostpics/hostpic" + str(listing_id)
    np.save(save_path, rgb_data)

In [None]:
# save ids where host pics are not available (either no url or non-functioning url) in dictionary

# listings_meta is indexed by listing id, so the ids are taken from the index
no_url_mask = listings_meta["host_picture_url"].isnull().values
nopic = list(np.unique(listings_meta.index[no_url_mask]))

# ids without a url plus ids whose url could not be turned into a picture
indices = nopic + list(pic_malfunction)

# numpy integers are not json serialisable -> cast to plain int
missing_data = {"hostpic": [int(ind) for ind in indices]}

# the context manager closes the file even if json.dump raises
with open("/content/drive/MyDrive/data/missing_data.json", "w") as temp_file:
    json.dump(missing_data, temp_file)

#temp_file = open("missing_data.json", "r")
#output = json.load(temp_file)

In [None]:
# Download every available listing thumbnail, resize it to IMAGE_SIZE and store
# the pixel data as "thumbnail<listing id>.npy" on Drive.
#
# NOTE(review): IMAGE_SIZE is not defined in the visible part of this notebook.
# It has to be set before this cell runs (PIL's resize expects a
# (width, height) tuple) -- TODO confirm the intended value.

# the loop relies on the listing id being the index (see the loading cell);
# positional lookups such as `.loc[0]` raise a KeyError on that index
assert listings_meta.index.name == "id", "listings_meta must be indexed by listing id"

# looked up once and outside the try block, so that a missing IMAGE_SIZE fails
# loudly instead of flagging every single picture as broken
target_size = IMAGE_SIZE

# instantiate list of all ids where url does not work
pic_malfunction = []

# save thumbnails that are available
for listing_id, url in tqdm(listings_meta["picture_url"].items(), total=len(listings_meta)):

    # check if url is not available
    if pd.isna(url):
        continue

    # scrape url
    response = requests.get(url)

    # check if url does not work (the response cannot be decoded or resized)
    try:
        img_plot = Image.open(BytesIO(response.content)).resize(target_size)
    except Exception:
        # `Exception` rather than a bare except: interrupting the kernel must
        # still be able to stop this long-running loop
        pic_malfunction.append(int(listing_id))
        continue

    # save rgb data
    rgb_data = np.array(img_plot)
    save_path = "/content/drive/MyDrive/data/thumbnails/thumbnail" + str(listing_id)
    np.save(save_path, rgb_data)

  " Skipping tag %s" % (size, len(data), tag)


In [None]:
# save ids where thumbnails are not available in dictionary 
# various reasons, e.g. could not load because of corrupt exif data or image size

missing_data_path = "/content/drive/MyDrive/data/missing_data.json"

with open(missing_data_path, "r") as temp_file:
    temp_file_dict = json.load(temp_file)

# the collected ids may be numpy integers, which json cannot serialise
temp_file_dict["thumbnail"] = [int(ind) for ind in pic_malfunction]

# serialise before opening the file for writing: opening with "w" truncates it,
# so a serialisation error afterwards would wipe the existing entries
serialised = json.dumps(temp_file_dict)
with open(missing_data_path, "w") as temp_file:
    temp_file.write(serialised)

#temp_file = open("missing_data.json", "r")
#output = json.load(temp_file)

# try to get urls (not necessary anymore)

In [None]:
#@title save hostpic urls
# does not have to be run again
# Request the host picture url of every listing and keep the returned response
# object (NaN where a listing has no url). The per-listing results are stacked
# into one Series named "temp_hostpics_url" and pickled to Drive.
host_picture_urls = listings_meta["host_picture_url"]
response_parts = []

for listing_id in tqdm(listings_meta.index):
    row_label = pd.Index([listing_id])
    raw_url = host_picture_urls.loc[row_label].values[0]

    # no url given for this listing
    if pd.isna(raw_url):
        response_parts.append(pd.Series([np.nan], index=row_label))
        continue

    response = requests.get(raw_url)
    print(response)
    response_parts.append(pd.Series([response], index=row_label))

temp_urls_series = pd.concat(response_parts, axis=0).rename("temp_hostpics_url")
path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_url.pickle"
temp_urls_series.to_pickle(path)

path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_url.pickle"
temp_urls_series = pd.read_pickle(path)

# does not have to be run again
# Reduce every stored response object to its `.url` attribute (NaN stays NaN),
# put the result next to the stored objects as column "hostpics_url" and
# overwrite the pickle with the resulting two-column frame.
link_parts = []

for listing_id in tqdm(temp_urls_series.index):
    row_label = pd.Index([listing_id])
    stored = temp_urls_series.loc[row_label].values[0]

    resolved = np.nan if pd.isna(stored) else stored.url
    link_parts.append(pd.Series([resolved], index=row_label))

hostpics_url = pd.concat(link_parts, axis=0).rename("hostpics_url")
urls_series = pd.concat([temp_urls_series, hostpics_url], axis=1)
urls_series.to_pickle(path)

# append missing hostpics manually to missing_hostpics_binary (does not have to be run again)

In [None]:
# read missing_data.json from Drive into a dictionary
missing_data_path = "/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json"
with open(missing_data_path, "r") as json_file:
    missing_data = json.load(json_file)

In [None]:
# entries stored under the "hostpic" key, wrapped in a pandas Index so that
# set operations such as `.difference` can be used on them
hostpic_entries = missing_data["hostpic"]
missing_hostpic_index = pd.Index(hostpic_entries)

In [None]:
hostpics_url_path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_url.pickle"
urls_series = pd.read_pickle(hostpics_url_path)

# index labels of the listings whose "hostpics_url" entry is NaN
url_is_missing = urls_series["hostpics_url"].isna()
missing_hostpic_url_index = listings_meta.loc[url_is_missing].index

In [None]:
# here links dont work: listed in missing_data["hostpic"] although a url exists
ids_with_broken_link = missing_hostpic_index.difference(missing_hostpic_url_index)
listings_meta.loc[ids_with_broken_link]

In [None]:
# nopics here where links did not work

In [None]:
# here url is missing entirely
url_missing_mask = urls_series["hostpics_url"].isna()
listings_meta.loc[url_missing_mask]

In [None]:
# https://a0.muscache.com/im/users/14368032/profile_pic/1398641174/original.jpg? yes
# https://a0.muscache.com/im/users/6196667/profile_pic/1387179693/original.jpg?im_w=240 yes
# https://a0.muscache.com/im/pictures/user/61c74b99-2e58-4bc4-9e50-45f8b7d6e269.jpg?im_w=240 yes
# https://a0.muscache.com/im/pictures/user/04a419de-c13a-4511-9750-6133312e101a.jpg?im_w=240 yes
# https://a0.muscache.com/im/pictures/user/07077e9d-adb7-46c2-8d6d-c29e184f9a2b.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no
# https://a0.muscache.com/im/pictures/user/8e2de99c-a731-41f8-89eb-0a3d31608ae9.jpg?im_w=240 no

In [None]:
# create series with hostpic information of missing url or non-working urls

# listings that have a url which did not work: all flagged 0
ids_with_broken_link = missing_hostpic_index.difference(missing_hostpic_url_index)
missing_hostpic_information0 = (
    pd.Series(np.zeros(len(ids_with_broken_link)), index=ids_with_broken_link)
    .astype("int")
    .rename_axis("id")
)

# listings without any url: hard-coded flags, one per entry of
# missing_hostpic_url_index and in the same order
hardcoded_flags = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
missing_hostpic_information1 = pd.Series(hardcoded_flags, index=missing_hostpic_url_index).astype("int")

In [None]:
# stack both flag series and turn them into a one-column frame
# whose column is called "host_picture_binary"
stacked_flags = pd.concat([missing_hostpic_information0, missing_hostpic_information1])
missing_hostpic_information = stacked_flags.rename("host_picture_binary").to_frame()

In [None]:
# Load the stored host picture flags and add the rows built above.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows in
# the same way and also works on older pandas versions.
# NOTE(review): the result is written back to the same pickle further down, so
# running these cells twice would add the same rows twice -- confirm before re-running.
hostpics_binary = pd.read_pickle("/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_binary.pickle")
hostpics_binary = pd.concat([hostpics_binary, missing_hostpic_information])

In [None]:
# persist the host picture flags on Drive (overwrites the existing pickle)
hostpics_binary_path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_binary.pickle"
pd.to_pickle(hostpics_binary, hostpics_binary_path)

# save raw hostpics as jpg (does not have to be run again)

In [None]:
# change datatype of matrices in array format to allow for conversion to jpeg
hostpics_raw_dir = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_raw"

for file_name in tqdm(os.listdir(hostpics_raw_dir)):
    array_path = hostpics_raw_dir + "/" + file_name
    pixels = np.load(array_path)

    # arrays that are already uint8 are left untouched on disk
    if pixels.dtype == "uint8":
        continue

    # cast and overwrite the file in place
    np.save(array_path, pixels.astype("uint8"))

100%|██████████| 11375/11375 [01:34<00:00, 120.35it/s]


In [None]:
# save jpegs of pictures in array format
hostpics_raw_dir = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_raw/"
hostpics_jpeg_dir = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_raw_jpeg/"
name_prefix = "hostpic"

# file names of all arrays that could not be written as jpeg
exceptions = []

for file_name in tqdm(os.listdir(hostpics_raw_dir)):
    pic = np.load(hostpics_raw_dir + file_name)

    # "hostpic<id>.npy" -> "<id>"; stripped by name instead of fixed offsets
    pic_name = os.path.splitext(file_name)[0]
    if pic_name.startswith(name_prefix):
        pic_name = pic_name[len(name_prefix):]

    try:
        Image.fromarray(pic).save(hostpics_jpeg_dir + pic_name + ".jpg")
    except Exception:
        # `Exception` rather than a bare except: interrupting the kernel must
        # still be able to stop this long-running loop
        exceptions.append(file_name)
        print("exception occured with ", pic_name)


100%|██████████| 11375/11375 [1:16:35<00:00,  2.48it/s]


# save raw thumbnails as jpg (does not have to be run again)

In [4]:
# change datatype of matrices in array format to allow for conversion to jpeg
thumbnails_raw_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw"

for file_name in tqdm(os.listdir(thumbnails_raw_dir)):
    array_path = thumbnails_raw_dir + "/" + file_name
    pixels = np.load(array_path)

    # arrays that are already uint8 are left untouched on disk
    if pixels.dtype == "uint8":
        continue

    # cast and overwrite the file in place
    np.save(array_path, pixels.astype("uint8"))

100%|██████████| 11404/11404 [06:01<00:00, 31.54it/s] 


In [5]:
# save jpegs of pictures in array format
thumbnails_raw_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw/"
thumbnails_jpeg_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw_jpeg/"
name_prefix = "thumbnail"

# file names of all arrays that could not be written as jpeg
exceptions = []

for file_name in tqdm(os.listdir(thumbnails_raw_dir)):
    pic = np.load(thumbnails_raw_dir + file_name)

    # "thumbnail<id>.npy" -> "<id>". The prefix has 9 characters, so the fixed
    # slice [7:-4] copied from the hostpic cell left "il" in front of the id.
    # NOTE(review): jpegs written by an earlier run are presumably named
    # "il<id>.jpg" -- check whether other code reads them under that name.
    pic_name = os.path.splitext(file_name)[0]
    if pic_name.startswith(name_prefix):
        pic_name = pic_name[len(name_prefix):]

    try:
        Image.fromarray(pic).save(thumbnails_jpeg_dir + pic_name + ".jpg")
    except Exception:
        # `Exception` rather than a bare except: interrupting the kernel must
        # still be able to stop this long-running loop
        exceptions.append(file_name)
        print("exception occured with ", pic_name)

100%|██████████| 11404/11404 [02:49<00:00, 67.25it/s]
