<a href="https://colab.research.google.com/github/dernameistegal/airbnb_price/blob/main/data_utils/munich/picture_transformations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Preparation

In [None]:
#@title remove repos from disc
%cd /content
!rm -r airbnb_price

/content
rm: cannot remove 'airbnb_price': No such file or directory


In [None]:
#@title Clone repo
# Fetch the project repository; a later cell adds its custom_functions folder
# to the module search path so the project's helper modules can be imported.
!git clone https://github.com/dernameistegal/airbnb_price.git

Cloning into 'airbnb_price'...
remote: Enumerating objects: 435, done.[K
remote: Counting objects: 100% (435/435), done.[K
remote: Compressing objects: 100% (408/408), done.[K
remote: Total 435 (delta 221), reused 117 (delta 21), pack-reused 0[K
Receiving objects: 100% (435/435), 3.27 MiB | 8.57 MiB/s, done.
Resolving deltas: 100% (221/221), done.


In [None]:
#@title add paths to library search path
import sys

# Folder inside the cloned repo that holds the project's helper modules.
custom_functions_dir = "/content/airbnb_price/custom_functions"

# Append only once so that re-running this cell does not pile up duplicate
# entries in sys.path.
if custom_functions_dir not in sys.path:
    sys.path.append(custom_functions_dir)

In [None]:
#@title Imports and drive
import os
import torch
import torchvision
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# own modules
import general_utils as gu
import picture_transformations_utils as pu



from google.colab import drive

#@title Mount drive
# Mount Google Drive under /content/drive/ so the data paths used below resolve;
# force_remount=True asks for a fresh mount when re-running the cell.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
#@title define device

# Ask the project helper for the compute device and report how many CPUs
# the runtime exposes.
device = gu.get_device()
num_cpus = os.cpu_count()
print(f"{num_cpus} CPUs available")

cuda available: False ; cudnn available: True ; num devices: 0
Using device cpu
4 CPUs available


# 1. Data Cleaning Hostpics (does not have to be run again)

In [None]:
# Folder on Google Drive that holds the raw host pictures as .npy files.
hostpics_dir = (
    "/content/drive/MyDrive/Colab/airbnb/data"
    "/hostpics/hostpics_raw"
)

In [None]:
#@title get file_names where pictures only have one channel

# Collect the names of all host pictures that are stored as a plain 2-D
# (224, 224) array, i.e. without a channel axis.
file_names = os.listdir(hostpics_dir)
files_one_channel = []

# tqdm reports progress, like the other scanning cells in this notebook;
# every file has to be loaded once to inspect its shape.
for file_name in tqdm(file_names):
    if np.load(hostpics_dir + "/" + file_name).shape == (224, 224):
        files_one_channel.append(file_name)

100%|██████████| 11375/11375 [03:18<00:00, 57.39it/s] 


In [None]:
#@title convert one channel images to grey scale and overwrite original images for compatibility with neural net
for file_name in files_one_channel:
    # Build the path from hostpics_dir with an explicit "/" separator; the file
    # names were listed from that directory.
    file_path = hostpics_dir + "/" + file_name

    # add a channel axis and repeat it 3 times to get 3 identical channels
    x = np.load(file_path)
    x = x[..., np.newaxis]
    x = np.repeat(x, 3, axis=2)

    # get max pixel value of image
    max_pixel_value = np.max(x)

    # scale pixel values so that the brightest pixel becomes 255; an image that
    # is zero everywhere is left untouched to avoid dividing by zero
    if max_pixel_value != 0:
        x = np.round(x * (255 / max_pixel_value))
    x = x.astype(int)

    # overwrite the original file with the 3-channel version
    np.save(file_path, x)


In [None]:
#@title get file_names where pictures only have two channels

# Names of all host pictures whose stored array has shape (224, 224, 2).
file_names = os.listdir(hostpics_dir)
files_two_channels = [
    name
    for name in file_names
    if np.load(hostpics_dir + "/" + name).shape == (224, 224, 2)
]

In [None]:
#@title convert two channel images to grey scale and overwrite original images for compatibility with neural net

for file_name in tqdm(files_two_channels):
    file_path = hostpics_dir + "/" + file_name

    # keep only the first channel and repeat it 3 times to get 3 identical channels
    temp = np.load(file_path)
    temp = temp[..., 0]
    temp = temp[..., np.newaxis]
    temp = np.repeat(temp, 3, axis=2)

    max_pixel_value = np.max(temp)

    # scale so that the brightest pixel becomes 255; an image that is zero
    # everywhere is left untouched to avoid dividing by zero
    if max_pixel_value != 0:
        temp = np.round(temp * (255 / max_pixel_value))
    temp = temp.astype(int)

    # overwrite the original file with the 3-channel version
    np.save(file_path, temp)

100%|██████████| 13/13 [00:00<00:00, 60.11it/s]


In [None]:
#@title get file_names where pictures have four channels

# Names of all host pictures whose stored array has shape (224, 224, 4).
file_names = os.listdir(hostpics_dir)
files_four_channels = [
    name
    for name in tqdm(file_names)
    if np.load(hostpics_dir + "/" + name).shape == (224, 224, 4)
]

100%|██████████| 11311/11311 [00:28<00:00, 398.50it/s]


In [None]:
#@title convert four channel images to three channel images and overwrite original images for compatibility with neural net

# Keep the first three channels, drop the fourth, and write the result back
# over the original file.
for name in tqdm(files_four_channels):
    path = hostpics_dir + "/" + name
    first_three = np.load(path)[..., :3]
    np.save(path, first_three)

100%|██████████| 51/51 [00:00<00:00, 51.47it/s]


# 2. Data Cleaning Thumbnails (does not have to be run again)

In [None]:
# Folder on Google Drive that holds the raw thumbnails as .npy files.
thumbnails_dir = (
    "/content/drive/MyDrive/Colab/airbnb/data"
    "/thumbnails/thumbnails_raw"
)

In [None]:
#@title get file_names where pictures only have one channel

# Names of all thumbnails stored as a plain 2-D (224, 224) array,
# i.e. without a channel axis.
file_names = os.listdir(thumbnails_dir)
files_one_channel = [
    name
    for name in tqdm(file_names)
    if np.load(f"{thumbnails_dir}/{name}").shape == (224, 224)
]

100%|██████████| 11402/11402 [00:32<00:00, 352.73it/s]


In [None]:
#@title convert one channel images to grey scale and overwrite original images for compatibility with neural net

for file_name in tqdm(files_one_channel):
    file_path = thumbnails_dir + "/" + file_name

    # add a channel axis and repeat it 3 times to get 3 identical channels
    temp = np.load(file_path)
    temp = temp[..., np.newaxis]
    temp = np.repeat(temp, 3, axis=2)

    max_pixel_value = np.max(temp)

    # scale so that the brightest pixel becomes 255; an image that is zero
    # everywhere is left untouched to avoid dividing by zero
    if max_pixel_value != 0:
        temp = np.round(temp * (255 / max_pixel_value))
    temp = temp.astype(int)

    # overwrite the original file with the 3-channel version
    np.save(file_path, temp)

100%|██████████| 2/2 [00:00<00:00, 45.05it/s]


In [None]:
#@title get file_names where pictures have four channels

# Names of all thumbnails whose stored array has shape (224, 224, 4).
file_names = os.listdir(thumbnails_dir)
files_four_channels = [
    name
    for name in tqdm(file_names)
    if np.load(f"{thumbnails_dir}/{name}").shape == (224, 224, 4)
]

100%|██████████| 11402/11402 [00:42<00:00, 267.58it/s]


In [None]:
#@title convert four channel images to three channel images and overwrite original images for compatibility with neural net

# Keep the first three channels, drop the fourth, and write the result back
# over the original file.
for name in tqdm(files_four_channels):
    path = thumbnails_dir + "/" + name
    first_three = np.load(path)[..., :3]
    np.save(path, first_three)

100%|██████████| 226/226 [00:05<00:00, 44.48it/s]


In [None]:
#@title save prices that correspond to all existing thumbnail pictures
listings_meta = pd.read_csv("/content/drive/MyDrive/Colab/airbnb/data/data1/listings.csv.gz")

# NOTE(review): the later cells of this notebook read missing_data.json from
# .../data/data1/ — confirm which of the two locations is the current one.
with open("/content/drive/MyDrive/Colab/airbnb/data/missing_data.json", "r") as f:
    missing_data = json.load(f)

# keep id and price of every listing whose thumbnail is not recorded as missing;
# .copy() yields an independent frame, so the assignment below does not write
# into a slice of listings_meta
has_thumbnail = ~listings_meta["id"].isin(missing_data["thumbnail"])
thumbnails_price = listings_meta.loc[has_thumbnail, ["id", "price"]].copy()

# transform price: strip "$" and "," and convert to float.
# regex=False treats "$" as a literal character rather than a regex end anchor,
# independent of the pandas version's default.
thumbnails_price["price"] = (
    thumbnails_price["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# write one file per listing, named thumbnail<id>; np.save appends ".npy"
response_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_response"
id_price_pairs = zip(thumbnails_price["id"], thumbnails_price["price"])
for listing_id, price in tqdm(id_price_pairs, total=len(thumbnails_price)):
    np.save(response_dir + "/thumbnail" + str(listing_id), price)

# 3. Feature Extraction Hostpics

In [None]:
# Folder on Google Drive that holds the raw host pictures.
hostpics_dir = (
    "/content/drive/MyDrive/Colab/airbnb/data"
    "/hostpics/hostpics_raw"
)

In [None]:
#@title calculate moments of hostpics and save them (does not have to be run again)
# The helper is reached through picture_transformations_utils (imported as pu),
# exactly like in the thumbnails section; no module is imported as `fu`.
means, std = pu.calculate_channelwise_moments("/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_raw")

# put the three means and the three stds side by side: column 0 = means,
# column 1 = stds, giving a (3, 2) array
means = means.reshape(3, 1)
std = std.reshape(3, 1)
hostpics_moments = np.hstack([means, std])

# np.save appends ".npy", so the file ends up as hostpics_moments.npy
np.save("/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_moments", hostpics_moments)

100%|██████████| 11375/11375 [02:43<00:00, 69.66it/s] 


In [None]:
# make dataset and dataloader with hostpics

# load the saved moments (column 0 = means, column 1 = stds) as a tensor
hostpics_moments = np.load("/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_moments.npy")
hostpics_moments = torch.from_numpy(hostpics_moments)

# initialize dataset and dataloader; the moments are passed under the name
# they were loaded with above (hostpics_moments)
dataset = pu.Dataset(filepath=hostpics_dir, channel_moments=hostpics_moments, ndata=10)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)


In [None]:
# extract features from pretrained model: use the first 31 layers of the
# convolutional part of VGG19 as a fixed feature extractor
vgg = torchvision.models.vgg19(pretrained=True)
feature_extractor = vgg.features[0:31]

# compute features for later training
train_features = pu.compute_train_features(device=device, dataloader=dataloader, feature_extractor=feature_extractor)

# convert the result to a NumPy array (assumes the helper returns a torch
# tensor — TODO confirm); detach() is harmless if no gradient is attached
train_features = train_features.detach().cpu().numpy()

In [None]:
# save features if desired: set SAVE_FEATURES to True to write them to Drive
SAVE_FEATURES = False

# NOTE(review): the target path is a suggestion next to the hostpics moments
# file — adjust it before enabling the save.
hostpics_features_path = "/content/drive/MyDrive/Colab/airbnb/data/hostpics/hostpics_features.npy"

if SAVE_FEATURES:
    np.save(hostpics_features_path, train_features)

torch.Size([10, 512, 14, 14])

# 4. Feature Extraction thumbnails

In [None]:
# Folder on Google Drive that holds the raw thumbnails.
thumbnails_dir = (
    "/content/drive/MyDrive/Colab/airbnb/data"
    "/thumbnails/thumbnails_raw"
)

In [None]:
#@title calculate moments of thumbnails and save them (does not have to be run again)
means, std = pu.calculate_channelwise_moments(thumbnails_dir)

# arrange as a (3, 2) array: column 0 = means, column 1 = stds
means, std = means.reshape(3, 1), std.reshape(3, 1)
thumbnails_moments = np.concatenate((means, std), axis=1)

np.save(
    "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_moments.npy",
    thumbnails_moments,
)

100%|██████████| 11402/11402 [01:39<00:00, 114.43it/s]


In [None]:
# make dataset and dataloader with thumbnails

# read the saved moments from Drive and hand them over as a tensor
thumbnails_moments = torch.from_numpy(
    np.load("/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_moments.npy")
)

# dataset over the thumbnail folder, served in fixed order in batches of 16
dataset = pu.Dataset(
    filepath=thumbnails_dir,
    channel_moments=thumbnails_moments,
    ndata=1000,
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False)

In [None]:
# extract features from pretrained model: use the first 31 layers of the
# convolutional part of VGG19 as a fixed feature extractor
vgg = torchvision.models.vgg19(pretrained=True)
feature_extractor = vgg.features[0:31]

# compute features for later training
train_features = pu.compute_train_features(device=device, dataloader=dataloader, feature_extractor=feature_extractor)

# convert the result to a NumPy array (assumes the helper returns a torch
# tensor — TODO confirm); detach() is harmless if no gradient is attached
train_features = train_features.detach().cpu().numpy()

100%|██████████| 63/63 [00:40<00:00,  1.54it/s]


In [None]:
# save features if desired: set SAVE_FEATURES to True to write them to Drive
SAVE_FEATURES = False

# NOTE(review): the target path is a suggestion next to the thumbnails moments
# file — adjust it before enabling the save.
thumbnails_features_path = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_features.npy"

if SAVE_FEATURES:
    np.save(thumbnails_features_path, train_features)

# 5. Remove thumbnails that correspond to price zero (does not have to be run again)

In [None]:
# remove pictures that correspond to missing prices
with open("/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json", "r") as f:
    missing_data = json.load(f)

# a set gives constant-time membership tests inside the loop
missing_price_ids = set(missing_data["price"])

raw_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw"
filenames = os.listdir(raw_dir)
removed = []

for filename in tqdm(filenames):
    # file names look like "thumbnail<id>.npy": drop the 9-character prefix
    # and the 4-character extension to recover the listing id
    if int(filename[9:-4]) in missing_price_ids:
        os.remove(raw_dir + "/" + filename)
        removed.append(filename)

100%|██████████| 11402/11402 [00:00<00:00, 275825.51it/s]


In [None]:
# remove responses that correspond to missing prices
with open("/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json", "r") as f:
    missing_data = json.load(f)

# a set gives constant-time membership tests inside the loop
missing_price_ids = set(missing_data["price"])

response_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_response"
filenames = os.listdir(response_dir)
removed = []

for filename in tqdm(filenames):
    # file names look like "thumbnail<id>.npy": drop the 9-character prefix
    # and the 4-character extension to recover the listing id
    if int(filename[9:-4]) in missing_price_ids:
        os.remove(response_dir + "/" + filename)
        removed.append(filename)

100%|██████████| 11402/11402 [00:00<00:00, 299967.10it/s]


In [None]:
# number of response files left after the removal above
response_files = os.listdir("/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_response")
len(response_files)

11397

# 6. Add thumbnails manually that were not available

In [None]:
# read the JSON once and take both id lists from it
with open("/content/drive/MyDrive/Colab/airbnb/data/data1/missing_data.json", "r") as f:
    missing_data = json.load(f)

missing_thumbnails = missing_data["thumbnail"]
missing_price = missing_data["price"]

missing_thumbnails_index = pd.Index(missing_thumbnails)
missing_price_index = pd.Index(missing_price)

# ids whose thumbnail is missing but whose price is not
missing_thumbnails_index = missing_thumbnails_index.difference(missing_price_index)

# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that were created by this project.
listings_original = pd.read_pickle(
    "/content/drive/MyDrive/Colab/airbnb/data/translations/translated_listings.pickle"
).set_index("id")

# show the listings that still need a thumbnail
listings_original.loc[missing_thumbnails_index, :]

In [None]:
# ids with a missing thumbnail, excluding those whose price is missing too
missing_thumbnails_index

Int64Index([10623784, 13499617, 15279015, 22341692, 29335901, 29337422,
            31159686],
           dtype='int64')

In [None]:
# preview only the first rows instead of rendering the whole frame
listings_original.head()

In [None]:
# the missing images were downloaded and resized manually
from PIL import Image

# open the seven manually prepared JPEGs, one per listing id, in id order
image1, image2, image3, image4, image5, image6, image7 = [
    Image.open(f"/content/{listing_id}.jpg")
    for listing_id in (
        10623784,
        13499617,
        15279015,
        22341692,
        29335901,
        29337422,
        31159686,
    )
]

In [None]:
# store each manually prepared image as thumbnail<id>.npy next to the other
# raw thumbnails
manual_thumbnails = {
    10623784: image1,
    13499617: image2,
    15279015: image3,
    22341692: image4,
    29335901: image5,
    29337422: image6,
    31159686: image7,
}

for listing_id, image in manual_thumbnails.items():
    np.save(
        f"/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw/thumbnail{listing_id}.npy",
        np.array(image),
    )