# Proyecto final: módulo de deep learning

## Descarga de datos e imágenes

In [None]:
# Install the Kaggle command-line client (quiet mode); it is used by the
# following cells to authenticate and download the dataset.
! pip install -q kaggle

In [None]:
from google.colab import files

# Open the Colab upload widget so the user can provide `kaggle.json`.
# `files.upload()` returns the uploaded files (name -> raw contents); binding
# the result to a name prevents the notebook from echoing it as cell output,
# which would otherwise store the API token in the saved notebook.
_kaggle_upload = files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

In [None]:
# Download the Airbnb price-prediction dataset archive with the Kaggle CLI.
! kaggle datasets download stevezhenghp/airbnb-price-prediction

Downloading airbnb-price-prediction.zip to /content
 80% 25.0M/31.3M [00:00<00:00, 49.7MB/s]
100% 31.3M/31.3M [00:00<00:00, 53.8MB/s]


In [None]:
! unzip airbnb-price-prediction

Archive:  airbnb-price-prediction.zip
  inflating: train.csv               


In [None]:
import cv2
import numpy
import pandas
import imageio.v3 as io
import matplotlib.pyplot as plt

from tqdm import tqdm

from typing import Optional, Union

In [None]:
# Read the extracted listings table and preview its first rows.
data = pandas.read_csv("train.csv", sep=",")

data.head()

In [None]:
# Discard rows (axis 0) that have no value in the `log_price` column.
data = data.dropna(axis=0, how="any", subset=["log_price"])

In [None]:
# Convert the log-prices back into prices (inverse of the log transform).
data["Price"] = data["log_price"].pipe(numpy.exp)

In [None]:
# Which variables are available?
data.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

En esta propuesta emplearemos una perspectiva naíf, imaginando que no tenemos demasiado conocimiento acerca de qué variables resultan de mayor interés para la tarea y delegando ese trabajo en nuestros modelos.

In [None]:
def download_images(paths: list,
                    canvas: tuple = (224, 224),
                    nb_channels: int = 3,
                    max_imgs: Optional[int] = None
                    ) -> tuple:
  """ Download a list of images from url addresses, converting them to a
  specific canvas size.

  Downloading is best-effort: an entry that cannot be fetched, decoded or
  converted to the requested layout is skipped, and only images that were
  actually stored are reported back.

  Args:
    paths: Paths or url addresses from which to load images.
    canvas: Size of the first two axes of every returned image, i.e.
      (height, width). For the default square canvas the order is irrelevant.
    nb_channels: Channels in images (1 for B/W, 3 for RGB).
    max_imgs: Upper threshold in the number of entries of `paths` to try.
      `None` (or 0) means no limit; values larger than `len(paths)` are
      clamped to `len(paths)`.

  Return:
    a tuple of:
      - uint8 array of shape (n_ok, canvas[0], canvas[1], nb_channels) with the
        images that were stored successfully.
      - positional indices within `paths` of those images, in the same order.

  """
  n_paths = len(paths)
  # Never allocate (or announce to the progress bar) more slots than paths.
  n_images = min(max_imgs, n_paths) if max_imgs else n_paths
  height, width = canvas[0], canvas[1]
  images = numpy.zeros((n_images, height, width, nb_channels),
                       dtype=numpy.uint8)
  downloaded_idxs = []

  for i_img, url in enumerate(tqdm(paths, total=n_images)):
    if i_img >= n_images:
      break
    try:
      img = _match_channels(io.imread(url), nb_channels)
      # cv2.resize expects dsize as (width, height); the result has shape
      # (height, width[, channels]), matching the buffer allocated above.
      img = cv2.resize(img, (width, height))
      # cv2 drops a singleton channel axis, so restore the full layout.
      images[i_img] = img.reshape(height, width, nb_channels)
    except (IOError, ValueError, cv2.error):
      # Unavailable url / decoding or conversion error: skip this entry.
      continue
    # Record the index only once the image is really stored, so a failed
    # conversion can never be returned as an all-zero "downloaded" image.
    downloaded_idxs.append(i_img)
  return images[downloaded_idxs], downloaded_idxs


def _match_channels(img, nb_channels: int):
  """ Coerce a decoded image array to shape (H, W, nb_channels).

  Raises:
    ValueError: if the array is not a single 2-D/3-D image or its channel
      count cannot be converted to `nb_channels`.
  """
  if img.ndim == 2:  # Single-channel image decoded without a channel axis
    img = img[:, :, numpy.newaxis]
  if img.ndim != 3:  # e.g. multi-frame images
    raise ValueError(f"Unsupported image shape: {img.shape}")
  found = img.shape[2]
  if found == nb_channels:
    return img
  if found == 1:  # Grayscale -> replicate the single channel
    return numpy.repeat(img, nb_channels, axis=2)
  if found == 4 and nb_channels == 3:  # Drop the alpha channel
    return numpy.ascontiguousarray(img[:, :, :3])
  if found in (3, 4) and nb_channels == 1:  # Colour -> luminance
    rgb = numpy.ascontiguousarray(img[:, :, :3])
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)[:, :, numpy.newaxis]
  raise ValueError(
      f"Cannot convert {found}-channel image to {nb_channels} channels")

### Posible carga de imágenes y datos desde GDrive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so previously saved data can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reload the previously saved image array and its matching table from Drive,
# then show both shapes (their first dimension should agree).
images = numpy.load("/content/drive/MyDrive/keepcoding_DL/images_KC.npy")
data = pandas.read_csv("/content/drive/MyDrive/keepcoding_DL/data_KC.csv",
                       sep=';')
data.shape, images.shape

((2135, 30), (2135, 224, 224, 3))

### O descargamos

Vamos a limitar el número de imágenes a un máximo de 3000 intentos de descarga (alrededor de 15 min de proceso). Con ello comenzamos a explorar el espacio de grandes cantidades de datos necesario para aprovechar las propiedades de los sistemas de Deep Learning, a la par que no desaprovechamos horas en la descarga y el tratamiento de los datos.

In [None]:
# Try to download the thumbnails of the first 3000 listings.
images, idxs = download_images(data['thumbnail_url'], max_imgs=3000)
# Rescale the uint8 pixel values to float32 in [0, 1].
images = images.astype(numpy.float32) / 255.0
images.shape

In [None]:
# Keep only the rows whose thumbnail was downloaded and renumber them so that
# row i of the table corresponds to images[i].
data = data.iloc[idxs].reset_index(drop=True)
data.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
3,12422935,4.442651,Apartment,Private room,"{TV,""Wireless Internet"",Heating,""Smoke detecto...",2,1.0,Real Bed,strict,True,...,37.753164,-122.429526,Comfort Suite San Francisco,Noe Valley,3,100.0,https://a0.muscache.com/im/pictures/82509143-4...,94131.0,1.0,1.0
4,11825529,4.418841,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",3,1.0,Real Bed,moderate,True,...,33.980454,-118.462821,Beach Town Studio and Parking!!!11h,,15,97.0,https://a0.muscache.com/im/pictures/4c920c60-4...,90292.0,1.0,1.0


In [None]:
# Preview the first five rows of the table.
data.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds,Price
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0,150.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0,169.0
2,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0,750.0
3,12422935,4.442651,Apartment,Private room,"{TV,""Wireless Internet"",Heating,""Smoke detecto...",2,1.0,Real Bed,strict,True,...,-122.429526,Comfort Suite San Francisco,Noe Valley,3,100.0,https://a0.muscache.com/im/pictures/82509143-4...,94131.0,1.0,1.0,85.0
4,11825529,4.418841,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",3,1.0,Real Bed,moderate,True,...,-118.462821,Beach Town Studio and Parking!!!11h,,15,97.0,https://a0.muscache.com/im/pictures/4c920c60-4...,90292.0,1.0,1.0,83.0


In [None]:
# Write the table and the image array to local files (they are copied to
# Google Drive in a later cell).
data.to_csv('data_KC.csv', sep=';', index=False)
numpy.save('images_KC.npy', images)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the saved files can be copied to it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the locally saved image array and table into the Drive project folder.
!cp images_KC.npy /content/drive/MyDrive/keepcoding_DL/images_KC.npy
!cp data_KC.csv /content/drive/MyDrive/keepcoding_DL/data_KC.csv

# Sanity check: list the local files with their sizes.
!ls -lah images* data*  # Comprobación

-rw-r--r-- 1 root root 2.8M Apr 22 10:23 data_KC.csv
-rw-r--r-- 1 root root 1.2G Apr 22 10:23 images_KC.npy
