# Data Visualization
# *Project 1*

### Group AA
Diana Furtado (m20200590@novaims.unl.pt); 
Hiromi Nakashima (m20201025@novaims.unl.pt); 
Miguel Martins (m20200671@novaims.unl.pt); 
Sofia Simão (m20200639@novaims.unl.pt)

#### This project aims to analyse the *Airbnb* dataset and understand how COVID-19 impacted tourism in Lisbon


*Dataset source*: http://insideairbnb.com/get-the-data.html


In [3]:
#Import libraries 
import glob
import os
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from pandas_profiling import ProfileReport

## AIRBNB Dataset

### Data Exploration

In [4]:
# Import all csv files and concatenate - Listings
# glob returns paths in a filesystem-dependent order, so sort them to keep the
# row order of the concatenated frame reproducible across machines and runs.
path = r'AIRBNB/listings'
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail early with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or holds no csv files.
if not all_files:
    raise FileNotFoundError("No CSV files found in '{}'".format(path))

# Read the files one at a time and stack them into a single frame with a
# fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
listings = pd.concat(df_from_each_file, ignore_index=True)

In [5]:
# Preview the concatenated listings frame (the rendered output shows only the first and last rows).
listings

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,6499,Belém 1 Bedroom Historical Apartment,14455,Bruno,Lisboa,Belm,38.697500,-9.197680,Entire home/apt,79,3,26,2020-01-03,0.36,1,242
1,25659,Heart of Alfama - Coeur d'Alfama - Lisbon Center,107347,Ellie,Lisboa,Santa Maria Maior,38.711670,-9.126960,Entire home/apt,45,3,113,2019-12-08,1.46,1,365
2,29248,Apartamento Alfama com vista para o rio!,125768,Bárbara,Lisboa,Santa Maria Maior,38.712720,-9.126280,Entire home/apt,43,1,322,2020-06-14,2.74,1,329
3,29396,Alfama Hill - Boutique apartment,126415,Mónica,Lisboa,Santa Maria Maior,38.712390,-9.128870,Entire home/apt,44,2,247,2020-08-23,2.45,2,331
4,29915,Modern and Cool Apartment in Lisboa,128890,Sara,Lisboa,Avenidas Novas,38.747120,-9.152860,Entire home/apt,48,5,37,2020-01-21,0.30,1,293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507591,45505941,Stylish & Quiet Room with PrivateWC- Alameda IST,23266121,Joana & Francisco,Lisboa,Arroios,38.736281,-9.131548,Private room,45,30,0,,,6,0
507592,45506121,Stylish & Spacious Room w PrivateWC- Alameda IST,23266121,Joana & Francisco,Lisboa,Arroios,38.735840,-9.133190,Private room,45,30,0,,,6,0
507593,45506284,Queen Bed w Balcony & PrivateWc - Alameda (IST),23266121,Joana & Francisco,Lisboa,Arroios,38.735527,-9.132067,Private room,50,30,0,,,6,0
507594,45506608,Quiet & Comfortable room w PrivateWC - Alameda...,23266121,Joana & Francisco,Lisboa,Arroios,38.735527,-9.132067,Private room,45,29,0,,,6,262


In [52]:
# Import all csv files and concatenate - reviews
# glob returns paths in a filesystem-dependent order, so sort them to keep the
# row order of the concatenated frame reproducible across machines and runs.
path = r'AIRBNB/reviews'
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail early with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or holds no csv files.
if not all_files:
    raise FileNotFoundError("No CSV files found in '{}'".format(path))

# Read the files one at a time and stack them into a single frame with a
# fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
reviews = pd.concat(df_from_each_file, ignore_index=True)

In [53]:
# Preview the concatenated reviews frame (the rendered output shows only the first and last rows).
reviews

Unnamed: 0,listing_id,date
0,6499,2014-09-02
1,6499,2014-10-11
2,6499,2015-01-02
3,6499,2015-07-11
4,6499,2015-08-08
...,...,...
20485546,45325795,2020-09-15
20485547,45328565,2020-09-09
20485548,45328565,2020-09-17
20485549,45400563,2020-09-19


In [54]:
# Write the combined listings and reviews frames to CSV files in the working
# directory, leaving the row index out of the files.
for frame, target in ((listings, 'listings.csv'), (reviews, 'reviews.csv')):
    frame.to_csv(target, index=False)

In [55]:
# Attach to every review row the number of review rows that share its listing_id
reviews_per_listing = reviews.groupby('listing_id')['listing_id']
reviews['freq'] = reviews_per_listing.transform('count')
# Show the frame with the new column
reviews

Unnamed: 0,listing_id,date,freq
0,6499,2014-09-02,421
1,6499,2014-10-11,421
2,6499,2015-01-02,421
3,6499,2015-07-11,421
4,6499,2015-08-08,421
...,...,...,...
20485546,45325795,2020-09-15,17
20485547,45328565,2020-09-09,55
20485548,45328565,2020-09-17,55
20485549,45400563,2020-09-19,5


In [56]:
# Show the first three rows of the listings frame.
listings.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,6499,Belém 1 Bedroom Historical Apartment,14455,Bruno,Lisboa,Belm,38.6975,-9.19768,Entire home/apt,79,3,26,2020-01-03,0.36,1,242
1,25659,Heart of Alfama - Coeur d'Alfama - Lisbon Center,107347,Ellie,Lisboa,Santa Maria Maior,38.71167,-9.12696,Entire home/apt,45,3,113,2019-12-08,1.46,1,365
2,29248,Apartamento Alfama com vista para o rio!,125768,Bárbara,Lisboa,Santa Maria Maior,38.71272,-9.12628,Entire home/apt,43,1,322,2020-06-14,2.74,1,329


In [57]:
# Print the listings summary: row count, column names, non-null counts and dtypes.
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507596 entries, 0 to 507595
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              507596 non-null  int64  
 1   name                            507205 non-null  object 
 2   host_id                         507596 non-null  int64  
 3   host_name                       507410 non-null  object 
 4   neighbourhood_group             507596 non-null  object 
 5   neighbourhood                   507596 non-null  object 
 6   latitude                        507596 non-null  float64
 7   longitude                       507596 non-null  float64
 8   room_type                       507596 non-null  object 
 9   price                           507596 non-null  int64  
 10  minimum_nights                  507596 non-null  int64  
 11  number_of_reviews               507596 non-null  int64  
 12  last_review     

In [58]:
# Count the missing values in each listings column.
listings.isna().sum()

id                                    0
name                                391
host_id                               0
host_name                           186
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       84415
reviews_per_month                 84415
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [59]:
# Count the rows that repeat an earlier row across all columns.
# NOTE(review): duplicates are presumably a side effect of concatenating several csv files — confirm.
listings.duplicated().sum()

41270

In [60]:
# Summary statistics for every column from room_type onwards (categorical and
# numeric alike). Slicing by label rather than by position 8 keeps the selection
# correct if columns are added or reordered upstream.
listings.loc[:, 'room_type':].describe(include="all")

Unnamed: 0,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,507596,507596.0,507596.0,507596.0,423181,423181.0,507596.0,507596.0
unique,4,,,,1784,,,
top,Entire home/apt,,,,2020-01-02,,,
freq,373761,,,,3665,,,
mean,,99.364944,3.089043,40.357984,,1.525256,14.421331,224.607761
std,,247.269182,12.582492,63.683562,,1.51866,39.704689,128.813277
min,,0.0,1.0,0.0,,0.01,1.0,0.0
25%,,45.0,1.0,2.0,,0.35,1.0,115.0
50%,,65.0,2.0,13.0,,1.01,3.0,267.0
75%,,100.0,3.0,51.0,,2.3,10.0,343.0


In [61]:
# Build an automated profiling report for the listings frame and write it to
# 'airbnb_listings.html'. ProfileReport is imported in the notebook's import
# cell instead of here, so every dependency is declared in one place.
prof = ProfileReport(listings)
prof.to_file(output_file='airbnb_listings.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=30.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [62]:
# Show the first three rows of the reviews frame.
reviews.head(3)

Unnamed: 0,listing_id,date,freq
0,6499,2014-09-02,421
1,6499,2014-10-11,421
2,6499,2015-01-02,421


In [63]:
# Print the reviews summary: row count, column names, dtypes and memory usage.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20485551 entries, 0 to 20485550
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   listing_id  int64 
 1   date        object
 2   freq        int64 
dtypes: int64(2), object(1)
memory usage: 468.9+ MB


In [64]:
# Count the missing values in each reviews column.
reviews.isna().sum()

listing_id    0
date          0
freq          0
dtype: int64

In [65]:
# Build a profiling report for the reviews frame and write it to 'airbnb_reviews.html'.
# NOTE(review): ProfileReport must already have been imported by an earlier cell.
prof_ar = ProfileReport(reviews)
prof_ar.to_file(output_file='airbnb_reviews.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=17.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…


