# Necessary libraries and credentials

In [1]:
import os 
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import boto3
import pandas as pd
from sqlalchemy import create_engine, text, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

import psycopg2

In [2]:
# s3 credentials
elo_s3_accessKeys = pd.read_csv("C:/Users/elodi/Documents/Documents/PYTHON/Jedha/data_science_full_stack/elo_s3_accessKeys.csv",encoding='utf-8-sig')
ACCESS_KEY_ID = elo_s3_accessKeys['Access key ID'].values[0]
SECRET_ACCESS_KEY = elo_s3_accessKeys['Secret access key'].values[0]
bucket_name = "bucket-getaround"

In [3]:
# RDS database credentials
elo_rds_accessKeys = pd.read_csv("C:/Users/elodi/Documents/Documents/PYTHON/Jedha/data_science_full_stack/elo_rds_accessKeys.csv",encoding='utf-8-sig')
db_host = elo_rds_accessKeys['db_host'].values[0]
db_port = elo_rds_accessKeys['db_port'].values[0]
db_name = elo_rds_accessKeys['db_name'].values[0]
db_user = elo_rds_accessKeys['db_user'].values[0]
db_pass = elo_rds_accessKeys['db_pass'].values[0]

# Scraping Booking.com

In [None]:
class booking_spider(scrapy.Spider):

    name = "booking"
    start_urls = ['https://www.booking.com']
    best_cities = ['Saintes Maries de la mer', 'Biarritz', 'Bayonne', 'Collioure', 'St Malo']

    # Retrieve the page for each CITY
    def start_requests(self):
        checkin = "2023-07-11"
        checkout = "2023-07-16"
        # warning: url from GoogleChrome (doesnt work for Edge)
        # city_url_template = 'https://www.booking.com/searchresults.fr.html?ss={}&label=gen173nr-1BCAEoggI46AdIM1gEaE2IAQGYAQm4ARfIAQzYAQHoAQGIAgGoAgO4AsHWlqIGwAIB0gIkZTU5NjhmNWEtMTk1Yi00OGEyLThmNWYtYzU2ZDNmMDU5MWZm2AIF4AIB&sid=4044a316139dbb224d418fbb7843881c&aid=304142&lang=fr&sb=1&src_elem=sb&src=index&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure'
        city_url_template = 'https://www.booking.com/searchresults.fr.html?ss={}&label=gen173nr-1BCAEoggI46AdIM1gEaE2IAQGYAQm4ARfIAQzYAQHoAQGIAgGoAgO4AsHWlqIGwAIB0gIkZTU5NjhmNWEtMTk1Yi00OGEyLThmNWYtYzU2ZDNmMDU5MWZm2AIF4AIB&sid=4044a316139dbb224d418fbb7843881c&aid=304142&lang=fr&sb=1&src_elem=sb&src=index&dest_type=city&checkin={}&checkout={}&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        
        for city in self.best_cities:
            city_url = city_url_template.format(city,checkin,checkout)
            yield scrapy.Request(city_url, headers=headers, callback=self.parse_hotels_with_urls, meta={'city':city, 'city_url': city_url})
    
    # Retrieve every HOTEL URL for a city
    def parse_hotels_with_urls(self, response):
        # Box corresponding to the hotel
        hotels = response.xpath("//h3[@class='a4225678b2']")
        # Inside this box, retrieve name and url
        for hotel in hotels:
            hotel_name = hotel.xpath(".//div[contains(@class, 'fcab3ed991 a23c043802')]/text()").get()
            hotel_url = hotel.xpath(".//@href").get()
            yield response.follow(hotel_url, callback=self.parse_hotels_details, meta={'city': response.meta['city'], 'hotel_name': hotel_name, 'hotel_url': hotel_url})
    
        # Loop over pages to retrive more hotels (offset set to 25 hotels max per page)
        hotel_ou_commencer = 25
        while hotel_ou_commencer < 1000:
            city_next_page_url = response.meta['city_url'] + f"&offset={hotel_ou_commencer}"
            yield response.follow(city_next_page_url, callback=self.parse_hotels_with_urls, meta=response.meta)
            hotel_ou_commencer += 25

    # Retrieve HOTEL INFORMATION from each hotel page
    def parse_hotels_details(self, response):
         yield {
              'city':response.meta['city'],
              'hotel_name': response.meta['hotel_name'],
              'hotel_url': response.meta['hotel_url'],
              'hotel_GPS': response.xpath("//@data-atlas-latlng").get(),
              'hotel_score': response.xpath("//div[@class='b5cd09854e d10a6220b4']/text()").get(),
              'hotel_voters':response.xpath("//div[@class='d8eab2cf7f c90c0a70d3 db63693c62']/text()").get(),
              'hotel_description': Selector(response).css('div.hp_desc_main_content').xpath('string()').get().strip()

         }
         
filename = "booking_scrap.json"

# If file already exists, remove it before
if filename in os.listdir('scrap_results/'):
        os.remove('scrap_results/' + filename)

process = CrawlerProcess(settings = {
    'USER_AGENT': 'Chrome/97.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'scrap_results/' + filename : {"format": "json"},
    }
})

process.crawl(booking_spider)
process.start()

# Send to s3

In [5]:
# Merge API scraping of the weather (only the top5 cities) and hotels for those cities
df_booking_scrap = pd.read_json('scrap_results/booking_scrap.json')
df_top5cities_weather = pd.read_csv('scrap_results/forecast_weather_top5cities.csv')
df_merged = pd.merge(df_booking_scrap, df_top5cities_weather, on='city', how='left')

# Send to s3
df_merged.to_csv('scrap_results/forecast_weather_top5cities_and_hotels.csv', index=False)
df_merged.head()

Unnamed: 0,city,hotel_name,hotel_url,hotel_GPS,hotel_score,hotel_voters,hotel_description,feels_like,humidity,wind_speed,city_id,main_weather,main_weather_scores,final_score,lon,lat
0,Bayonne,Baiona Sainte Cath -1-Bat-,https://www.booking.com/hotel/fr/baiona-sainte...,"43.49503363,-1.46775819",56,18 expériences vécues,"Doté d'une connexion Wi-Fi gratuite, le Baiona...",21.218,77.925,2.4935,7.0,clear sky,5.0,624.0,-1.473666,43.494514
1,Bayonne,Péniche DJEBELLE,https://www.booking.com/hotel/fr/peniche-djebe...,"43.49633500,-1.47374900",95,227 expériences vécues,"Dotée d'un service de prêt de vélos, d'un salo...",21.218,77.925,2.4935,7.0,clear sky,5.0,624.0,-1.473666,43.494514
2,Bayonne,115 sqm triplex with panoramic terrace close t...,https://www.booking.com/hotel/fr/large-apartme...,"43.50806460,-1.48278620",70,1 expérience vécue,"Situé à Bayonne, à 25 km de la gare de Saint-J...",21.218,77.925,2.4935,7.0,clear sky,5.0,624.0,-1.473666,43.494514
3,Bayonne,Villa AJNA Bayonne,https://www.booking.com/hotel/fr/chambre-chino...,"43.48392100,-1.49821200",88,75 expériences vécues,"Située à Bayonne, la Villa AJNA Bayonne propos...",21.218,77.925,2.4935,7.0,clear sky,5.0,624.0,-1.473666,43.494514
4,Bayonne,KATALINA LAU T3 au coeur de Saint Esprit,https://www.booking.com/hotel/fr/katalina-lau-...,"43.49549431,-1.46897977",43,8 expériences vécues,Le KATALINA LAU T3 au coeur de Saint Esprit es...,21.218,77.925,2.4935,7.0,clear sky,5.0,624.0,-1.473666,43.494514


In [6]:
# Create an instance of boto3.Session that connects with my aws account
session = boto3.Session(aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=SECRET_ACCESS_KEY)

# Create a variable s3 that connects my session to the s3 ressource
s3 = session.resource("s3")

# Create a variable that will connect to my existing bucket bucket-getaround
bucket_getaround = s3.Bucket(bucket_name) 

# Upload file to the s3 bucket
bucket_getaround.upload_file("scrap_results/forecast_weather_top5cities_and_hotels.csv","forecast_weather_top5cities_and_hotels.csv")

# ETL

### Extract

In [7]:
# Connect to my s3
s3_client = boto3.client('s3' , aws_access_key_id=ACCESS_KEY_ID , aws_secret_access_key=SECRET_ACCESS_KEY)

# Retrieve file from s3
s3_file_path = 'forecast_weather_top5cities_and_hotels.csv'
local_destination = 'data_from_s3/forecast_weather_top5cities_and_hotels.csv'
s3_client.download_file(bucket_name, s3_file_path, local_destination)

### Transform

In [8]:
# Clean data (drop lines etc)
df_from_s3 = pd.read_csv(local_destination)
df_from_s3.describe(include='all')

Unnamed: 0,city,hotel_name,hotel_url,hotel_GPS,hotel_score,hotel_voters,hotel_description,feels_like,humidity,wind_speed,city_id,main_weather,main_weather_scores,final_score,lon,lat
count,205,205,205,205,172.0,176,205,205.0,205.0,205.0,205.0,205,205.0,205.0,205.0,205.0
unique,5,139,205,137,43.0,89,151,,,,,1,,,,
top,St Malo,Hotel Cote Basque,https://www.booking.com/hotel/fr/baiona-sainte...,"43.49618306,-1.46960109",90.0,1 expérience vécue,L'hotelF1 Bayonne propose des hébergements à 7...,,,,,clear sky,,,,
freq,42,5,1,5,12.0,21,4,,,,,205,,,,
mean,,,,,,,,22.629465,72.904512,3.559033,17.921951,,5.0,620.931707,0.487466,44.346014
std,,,,,,,,3.503871,5.1055,1.045719,9.848049,,0.0,5.78836,2.715603,2.220766
min,,,,,,,,17.81475,65.15,2.4935,7.0,,5.0,610.0,-2.026041,42.52505
25%,,,,,,,,21.07775,68.85,2.61225,9.0,,5.0,621.0,-1.559278,43.452277
50%,,,,,,,,21.218,74.775,3.51125,14.0,,5.0,624.0,-1.473666,43.483252
75%,,,,,,,,25.54825,77.9,3.72,29.0,,5.0,624.0,3.083155,43.494514


In [9]:
# Remove lines with missing info
df_from_s3=df_from_s3.dropna()
df_from_s3.info()
# Convert values in hotel_score and hotel_voters in float and integer
df_from_s3['hotel_score'] = df_from_s3['hotel_score'].str.replace(',', '.').astype(float)
df_from_s3['hotel_voters'] = df_from_s3['hotel_voters'].apply(lambda x: x.split()[0]).astype(int)
# Divide hotel_GPS into 2 columns, convert to float
df_from_s3[['hotel_lat', 'hotel_lon']] = df_from_s3['hotel_GPS'].str.split(',', expand=True).astype(float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 0 to 204
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   city                 170 non-null    object 
 1   hotel_name           170 non-null    object 
 2   hotel_url            170 non-null    object 
 3   hotel_GPS            170 non-null    object 
 4   hotel_score          170 non-null    object 
 5   hotel_voters         170 non-null    object 
 6   hotel_description    170 non-null    object 
 7   feels_like           170 non-null    float64
 8   humidity             170 non-null    float64
 9   wind_speed           170 non-null    float64
 10  city_id              170 non-null    float64
 11  main_weather         170 non-null    object 
 12  main_weather_scores  170 non-null    float64
 13  final_score          170 non-null    float64
 14  lon                  170 non-null    float64
 15  lat                  170 non-null    flo

In [10]:
df_from_s3.sort_values(by='hotel_score',ascending=False)
print("We notice that hotel_score of 10 has sometimes only 1 or 2 voters, which might not be sufficient for judgement.")
print("Number of hotels: ",len(df_from_s3))
to_keep = (df_from_s3['hotel_voters'] >= 5)
df_transformed = df_from_s3.loc[to_keep]
print("Number of hotels with more than 5 voters: ",len(df_transformed))

We notice that hotel_score of 10 has sometimes only 1 or 2 voters, which might not be sufficient for judgement.
Number of hotels:  170
Number of hotels with more than 5 voters:  98


In [11]:
df_transformed.columns

Index(['city', 'hotel_name', 'hotel_url', 'hotel_GPS', 'hotel_score',
       'hotel_voters', 'hotel_description', 'feels_like', 'humidity',
       'wind_speed', 'city_id', 'main_weather', 'main_weather_scores',
       'final_score', 'lon', 'lat', 'hotel_lat', 'hotel_lon'],
      dtype='object')

In [12]:
# Define how should look the final table for managers

# Let's instanciate a declarative base to be able to use our python class
Base = declarative_base()

class Scraping(Base):
    # d'abord on doit donner un nom à notre table (obligé d'écrire tablename comme ca)
    __tablename__ = "Hotels"

    # Each parameter corresponds to a column in our DB table
    id = Column(Integer, primary_key=True, autoincrement=True)
    city = Column(String)
    hotel_name = Column(String)
    hotel_url = Column(String)
    hotel_score = Column(Float)
    hotel_voters = Column(Integer)
    hotel_description = Column(String)
    feels_like = Column(Float)
    humidity = Column(Float)
    wind_speed = Column(Float)
    city_id = Column(Float)
    main_weather = Column(String)
    main_weather_scores = Column(Float)
    final_score = Column(Float)
    lon = Column(Float)
    lat = Column(Float)
    hotel_lat = Column(Float)
    hotel_lon = Column(Float)

    def __repr__(self):
    # obligé de l'écrire comme ca exactement
        # return "<User(name='{}', fullname='{}', nickname='{}')>".format(self.name, self.fullname, self.nickname)
        columns = [column.name for column in Scraping.__table__.columns]
        values = [getattr(self, column) for column in columns]
        values_str = ', '.join([f'{column}={value}' for column, value in zip(columns, values)])
        return f"<Scraping({values_str})>"

### Load

In [13]:
# Create a SQLAlchemy engine
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")

In [14]:
Base.metadata.create_all(engine)
# Creates the database schema based on our SQLAlchemy model definition (so the Base instance with the data 
# associated, like the table Hotels)

# In PGadmin, refresh then go to: project-getaround-db/Databases/postgres/Schema/public/Tables/Hotels
# Command QUERY TOOL to code in SQL

In [15]:
# Add all the data from the dataset
df_transformed.to_sql("Hotels", engine, if_exists="replace", index=False)

98

# Pick up best hotels

In [16]:
# Imagine now we are someone else from the company, querying the created SQL database to find out what are
# 20 best hotels for a 5 days trip in one of the 5top cities.

# The person would first create a new engine and a new session
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")
Session = sessionmaker(bind=engine)
session = Session()

In [17]:
# Do we have comparable number of hotels for the 5 cities ?

query = """
SELECT city, COUNT(DISTINCT hotel_name) AS num_hotel_names
FROM public."Hotels"
GROUP BY city
"""

hotels_per_city = pd.read_sql(query, engine)
display(hotels_per_city)

print("There are quite similar number of hotels for each city.")

Unnamed: 0,city,num_hotel_names
0,Bayonne,20
1,Biarritz,19
2,Collioure,11
3,Saintes Maries de la mer,31
4,St Malo,17


There are quite similar number of hotels for each city.


In [18]:
# Find the 20 best hotels

query = """
SELECT city, hotel_name, hotel_score, hotel_voters, hotel_lat, hotel_lon
FROM public."Hotels"
ORDER BY hotel_score DESC
LIMIT 20
"""

# We need quotes for hotel_GPS because of the capital letters creating problem

top20_hotels = pd.read_sql(query, engine)
display(top20_hotels)

Unnamed: 0,city,hotel_name,hotel_score,hotel_voters,hotel_lat,hotel_lon
0,Saintes Maries de la mer,Maison Diderot,9.6,5,43.567637,4.192471
1,Saintes Maries de la mer,Péniche DJEBELLE,9.5,227,43.496335,-1.473749
2,Bayonne,Péniche DJEBELLE,9.5,227,43.496335,-1.473749
3,Saintes Maries de la mer,Appartements 3 étoiles terrasse ou patio intra...,9.4,37,43.565358,4.19275
4,St Malo,Villa Marlotte by Cocoonr - La Belle de Dinard,9.4,8,48.623214,-2.048115
5,Biarritz,Villa la Renaissance,9.2,337,43.482321,-1.468164
6,Bayonne,Villa la Renaissance,9.2,337,43.482321,-1.468164
7,Saintes Maries de la mer,Pumperhouse,9.1,12,43.642877,4.434104
8,Saintes Maries de la mer,Lodge Sainte Helene,9.1,121,43.465756,4.413934
9,Saintes Maries de la mer,Mas le Sauvageon,9.1,219,43.483248,4.390455


In [19]:
fig = px.scatter_mapbox(
    top20_hotels, 
    title="Top 20 hotels for our kite surfing trip", 
    lat="hotel_lat", 
    lon="hotel_lon", 
    zoom=5, 
    mapbox_style="open-street-map",
    size="hotel_score",
    color = 'city',
    hover_name= 'hotel_name',
    hover_data = ['city', 'hotel_score','hotel_voters'],

)


fig.show()

# To show picture in Github (not interactive)
image = Image.open('final pictures/top5_cities_picture.jpg')
image.show()

![alt text](https://raw.githubusercontent.com/elodiesune/PROJECT_Kayak/main/final%20pictures/top20_hotels_picture.jpg)

In [20]:
# Close the session
session.close()