# Necessary libraries and credentials

In [1]:
import os 
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import re

import boto3
import pandas as pd
from sqlalchemy import create_engine, text, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

import psycopg2

In [2]:
# s3 credentials
elo_s3_accessKeys = pd.read_csv("C:/Users/elodi/Documents/Documents/PYTHON/Jedha/data_science_full_stack/elo_s3_accessKeys.csv",encoding='utf-8-sig')
ACCESS_KEY_ID = elo_s3_accessKeys['Access key ID'].values[0]
SECRET_ACCESS_KEY = elo_s3_accessKeys['Secret access key'].values[0]
bucket_name = "bucket-getaround"

In [3]:
# RDS database credentials
elo_rds_accessKeys = pd.read_csv("C:/Users/elodi/Documents/Documents/PYTHON/Jedha/data_science_full_stack/elo_rds_accessKeys.csv",encoding='utf-8-sig')
db_host = elo_rds_accessKeys['db_host'].values[0]
db_port = elo_rds_accessKeys['db_port'].values[0]
db_name = elo_rds_accessKeys['db_name'].values[0]
db_user = elo_rds_accessKeys['db_user'].values[0]
db_pass = elo_rds_accessKeys['db_pass'].values[0]

# Scraping Booking.com

In [None]:
class booking_spider(scrapy.Spider):

    name = "booking"
    start_urls = ['https://www.booking.com']
    best_cities = ['La Rochelle', 'Le Havre', 'Saintes Maries de la mer', 'Biarritz', 'Bayonne']

    # Retrieve the page for each CITY
    def start_requests(self):
        checkin = "2023-07-20"
        checkout = "2023-07-25"
        # warning: url from GoogleChrome (doesnt work for Edge)
        # city_url_template = 'https://www.booking.com/searchresults.fr.html?ss={}&label=gen173nr-1BCAEoggI46AdIM1gEaE2IAQGYAQm4ARfIAQzYAQHoAQGIAgGoAgO4AsHWlqIGwAIB0gIkZTU5NjhmNWEtMTk1Yi00OGEyLThmNWYtYzU2ZDNmMDU5MWZm2AIF4AIB&sid=4044a316139dbb224d418fbb7843881c&aid=304142&lang=fr&sb=1&src_elem=sb&src=index&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure'
        city_url_template = 'https://www.booking.com/searchresults.fr.html?ss={}&label=gen173nr-1BCAEoggI46AdIM1gEaE2IAQGYAQm4ARfIAQzYAQHoAQGIAgGoAgO4AsHWlqIGwAIB0gIkZTU5NjhmNWEtMTk1Yi00OGEyLThmNWYtYzU2ZDNmMDU5MWZm2AIF4AIB&sid=4044a316139dbb224d418fbb7843881c&aid=304142&lang=fr&sb=1&src_elem=sb&src=index&dest_type=city&checkin={}&checkout={}&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    
        for city in self.best_cities:
            city_url = city_url_template.format(city,checkin,checkout)
            yield scrapy.Request(city_url, headers=headers, callback=self.parse_hotels_with_urls, meta={'city':city, 'city_url': city_url})
    
    # Retrieve every HOTEL URL for a city
    def parse_hotels_with_urls(self, response):
        # Box corresponding to the hotel
        hotels = response.xpath("//h3[@class='a4225678b2']")
        # Inside this box, retrieve name and url
        for hotel in hotels:
            hotel_name = hotel.xpath(".//div[contains(@class, 'fcab3ed991 a23c043802')]/text()").get()
            hotel_url = hotel.xpath(".//@href").get()
            yield response.follow(hotel_url, callback=self.parse_hotels_details, meta={'city': response.meta['city'], 'hotel_name': hotel_name, 'hotel_url': hotel_url})
    
        # Loop over pages to retrive more hotels (offset set to 25 hotels max per page)
        hotel_ou_commencer = 25
        while hotel_ou_commencer < 1000:
            city_next_page_url = response.meta['city_url'] + f"&offset={hotel_ou_commencer}"
            yield response.follow(city_next_page_url, callback=self.parse_hotels_with_urls, meta=response.meta)
            hotel_ou_commencer += 25

    # Retrieve HOTEL INFORMATION from each hotel page
    def parse_hotels_details(self, response):
         yield {
              'city':response.meta['city'],
              'hotel_name': response.meta['hotel_name'],
              'hotel_url': response.meta['hotel_url'],
              'hotel_GPS': response.xpath("//@data-atlas-latlng").get(),
              'hotel_score': response.xpath("//div[@class='b5cd09854e d10a6220b4']/text()").get(),
              'hotel_voters':response.xpath("//div[@class='d8eab2cf7f c90c0a70d3 db63693c62']/text()").get(),
              'hotel_description': Selector(response).css('div.hp_desc_main_content').xpath('string()').get().strip(),
              'hotel_address': response.xpath("//span[contains(@class, 'hp_address_subtitle')]/text()").get()

         }
         
filename = "booking_scrap.json"

# If file already exists, remove it before
if filename in os.listdir('scrap_results/'):
        os.remove('scrap_results/' + filename)

process = CrawlerProcess(settings = {
    'USER_AGENT': 'Chrome/97.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'scrap_results/' + filename : {"format": "json"},
    }
})

process.crawl(booking_spider)
process.start()

In [29]:
df_booking_scrap = pd.read_json('scrap_results/booking_scrap.json')
df_booking_scrap.city.value_counts()

Saintes Maries de la mer    80
La Rochelle                 71
Biarritz                    50
Le Havre                    37
Bayonne                     27
Name: city, dtype: int64

In [30]:
# For some cities where there are only few hotels, booking propose more hotels in other close cities. 
# We want to remove them. From the address of the hotel, we can extract the city name, and remove all lines 
# which don't correlate with the city we were interested in.


# Function which will extract the city name from the address using regex
def extract_city_name(address):
    pattern = r"\b\d{5}\b\s+(.*?)(?:,|$)"
    match = re.search(pattern, address)
    if match:
        return match.group(1).strip()
    else:
        return "City name not found."

# Apply the function to the "hotel_address" column, create a new column "hotel_city", remove hotel_address
df_booking_scrap['hotel_city'] = df_booking_scrap['hotel_address'].apply(extract_city_name)
df_booking_scrap.drop(columns=['hotel_address'], inplace=True)
# Remove from dataframe all hotels with other cities
df_booking_scrap['hotel_city'] = df_booking_scrap['hotel_city'].replace("Les Saintes-Maries-de-la-Mer", "Saintes Maries de la mer")
df_booking_scrap = df_booking_scrap[df_booking_scrap['city'] == df_booking_scrap['hotel_city']]
df_booking_scrap.reset_index(drop=True, inplace=True)
# Check up
cols = df_booking_scrap.columns.tolist()
cols.insert(1, cols.pop(cols.index('hotel_city'))) # move hotel_city next to city
df_booking_scrap = df_booking_scrap[cols]
# pd.set_option('display.max_rows', None)
# display(df_booking_scrap)
# df_booking_scrap.city.value_counts()

# Remove column and save again the csv
df_booking_scrap.drop(columns=['hotel_city'], inplace=True)
df_booking_scrap.to_json("scrap_results/booking_scrap.json", orient="records", lines=True)

# Send to s3

In [33]:
# Merge API scraping of the weather (only the top5 cities) and hotels for those cities
df_top5cities_weather = pd.read_csv('scrap_results/forecast_weather_top5cities.csv')
df_merged = pd.merge(df_booking_scrap, df_top5cities_weather, on='city', how='left')

# Into csv file
df_merged.to_csv('scrap_results/forecast_weather_top5cities_and_hotels.csv', index=False)
df_merged.head()

Unnamed: 0,city,hotel_name,hotel_url,hotel_GPS,hotel_score,hotel_voters,hotel_description,feels_like,humidity,wind_speed,city_id,main_weather,main_weather_scores,final_score,lon,lat
0,Biarritz,Le Petit Hôtel,https://www.booking.com/hotel/fr/le-petit-hote...,"43.48286503,-1.55950472",74,335 expériences vécues,"Le Petit Hôtel est situé au cœur de Biarritz, ...",19.65525,75.55,2.596,9.0,scattered clouds,3.0,585.0,-1.559278,43.483252
1,Biarritz,Osuna Txiki Btz,https://www.booking.com/hotel/fr/ulia.fr.html?...,"43.48303130,-1.55653060",72,4 expériences vécues,"Situé à Biarritz, à 600 mètres de la plage des...",19.65525,75.55,2.596,9.0,scattered clouds,3.0,585.0,-1.559278,43.483252
2,Biarritz,Atlantico - Arrosa,https://www.booking.com/hotel/fr/atlantico-arr...,"43.47610700,-1.56114500",83,57 expériences vécues,Vous pouvez bénéficier d'une réduction Genius ...,19.65525,75.55,2.596,9.0,scattered clouds,3.0,585.0,-1.559278,43.483252
3,Biarritz,Biarritz Surf Lodge Maison d'hôtes,https://www.booking.com/hotel/fr/biarritz-surf...,"43.46749490,-1.56837520",87,12 expériences vécues,Situé à 500 mètres de la plage de la Côte des ...,19.65525,75.55,2.596,9.0,scattered clouds,3.0,585.0,-1.559278,43.483252
4,Biarritz,Escale Oceania Biarritz,https://www.booking.com/hotel/fr/altess-hotel....,"43.48534146,-1.55238748",79,529 expériences vécues,Vous pouvez bénéficier d'une réduction Genius ...,19.65525,75.55,2.596,9.0,scattered clouds,3.0,585.0,-1.559278,43.483252


In [34]:
# Create an instance of boto3.Session that connects with my aws account
session = boto3.Session(aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=SECRET_ACCESS_KEY)

# Create a variable s3 that connects my session to the s3 ressource
s3 = session.resource("s3")

# Create a variable that will connect to my existing bucket bucket-getaround
bucket_getaround = s3.Bucket(bucket_name) 

# Upload file to the s3 bucket
bucket_getaround.upload_file("scrap_results/forecast_weather_top5cities_and_hotels.csv","forecast_weather_top5cities_and_hotels.csv")

# ETL

### Extract

In [6]:
# Connect to my s3
s3_client = boto3.client('s3' , aws_access_key_id=ACCESS_KEY_ID , aws_secret_access_key=SECRET_ACCESS_KEY)

# Retrieve file from s3
s3_file_path = 'forecast_weather_top5cities_and_hotels.csv'
local_destination = 'data_from_s3/forecast_weather_top5cities_and_hotels.csv'
s3_client.download_file(bucket_name, s3_file_path, local_destination)

### Transform

In [7]:
# Clean data (drop lines etc)
df_from_s3 = pd.read_csv(local_destination)
df_from_s3.describe(include='all')

Unnamed: 0,city,hotel_name,hotel_url,hotel_GPS,hotel_score,hotel_voters,hotel_description,feels_like,humidity,wind_speed,city_id,main_weather,main_weather_scores,final_score,lon,lat
count,204,204,204,204,185.0,186,204,204.0,204.0,204.0,204.0,204,204.0,204.0,204.0,204.0
unique,5,183,204,182,49.0,141,185,,,,,3,,,,
top,La Rochelle,The People Le Havre,https://www.booking.com/hotel/fr/le-petit-hote...,"46.14560524,-1.16130704",83.0,1 expérience vécue,Vous pouvez bénéficier d'une réduction Genius ...,,,,,clear sky,,,,
freq,71,2,1,2,15.0,12,2,,,,,90,,,,
mean,,,,,,,,19.379262,73.371691,4.374222,16.073529,,3.519608,593.931373,-0.546114,45.503328
std,,,,,,,,2.596128,6.123455,1.419783,6.687205,,1.4937,8.78227,1.700017,2.233494
min,,,,,,,,16.97875,55.7,2.51275,7.0,,1.0,584.0,-1.559278,43.452277
25%,,,,,,,,18.3225,72.675,2.596,9.0,,3.0,585.0,-1.473666,43.483252
50%,,,,,,,,18.3225,75.55,5.3565,19.0,,3.0,598.0,-1.152043,46.159113
75%,,,,,,,,19.65525,75.55,5.3565,20.0,,5.0,604.0,0.107973,46.159113


In [None]:
# Check duplicates
duplicate_count = df_from_s3.duplicated(subset=['city', 'hotel_name']).sum()
print("Number of duplicates:", duplicate_count)
duplicates_df = df_from_s3[df_from_s3.duplicated(subset=['city', 'hotel_name'], keep=False)]
sorted_duplicates_df = duplicates_df.sort_values(by=['city', 'hotel_name'])
# print("DataFrame with Duplicates:")
# print(sorted_duplicates_df)

# Remove them (keep the first one)
df_from_s3.drop_duplicates(subset=['city', 'hotel_name'], keep='first', inplace=True)


In [12]:
# Remove lines with missing info
df_from_s3=df_from_s3.dropna()
df_from_s3.info()
# Convert values in hotel_score and hotel_voters in float and integer
df_from_s3['hotel_score'] = df_from_s3['hotel_score'].str.replace(',', '.').astype(float)
df_from_s3['hotel_voters'] = df_from_s3['hotel_voters'].apply(lambda x: x.split()[0]).astype(int)
# Divide hotel_GPS into 2 columns, convert to float
df_from_s3[['hotel_lat', 'hotel_lon']] = df_from_s3['hotel_GPS'].str.split(',', expand=True).astype(float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 163 entries, 0 to 203
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   city                 163 non-null    object 
 1   hotel_name           163 non-null    object 
 2   hotel_url            163 non-null    object 
 3   hotel_GPS            163 non-null    object 
 4   hotel_score          163 non-null    object 
 5   hotel_voters         163 non-null    object 
 6   hotel_description    163 non-null    object 
 7   feels_like           163 non-null    float64
 8   humidity             163 non-null    float64
 9   wind_speed           163 non-null    float64
 10  city_id              163 non-null    float64
 11  main_weather         163 non-null    object 
 12  main_weather_scores  163 non-null    float64
 13  final_score          163 non-null    float64
 14  lon                  163 non-null    float64
 15  lat                  163 non-null    flo

In [13]:
df_from_s3.sort_values(by='hotel_score',ascending=False)
print("We notice that hotel_score of 10 has sometimes only 1 or 2 voters, which might not be sufficient for judgement.")
print("Number of hotels: ",len(df_from_s3))
to_keep = (df_from_s3['hotel_voters'] >= 5)
df_transformed = df_from_s3.loc[to_keep]
print("Number of hotels with more than 5 voters: ",len(df_transformed))

We notice that hotel_score of 10 has sometimes only 1 or 2 voters, which might not be sufficient for judgement.
Number of hotels:  163
Number of hotels with more than 5 voters:  90


In [14]:
df_transformed.columns

Index(['city', 'hotel_name', 'hotel_url', 'hotel_GPS', 'hotel_score',
       'hotel_voters', 'hotel_description', 'feels_like', 'humidity',
       'wind_speed', 'city_id', 'main_weather', 'main_weather_scores',
       'final_score', 'lon', 'lat', 'hotel_lat', 'hotel_lon'],
      dtype='object')

In [15]:
# Define how should look the final table for managers

# Let's instanciate a declarative base to be able to use our python class
Base = declarative_base()

class Scraping(Base):
    # d'abord on doit donner un nom à notre table (obligé d'écrire tablename comme ca)
    __tablename__ = "Hotels"

    # Each parameter corresponds to a column in our DB table
    id = Column(Integer, primary_key=True, autoincrement=True)
    city = Column(String)
    hotel_name = Column(String)
    hotel_url = Column(String)
    hotel_score = Column(Float)
    hotel_voters = Column(Integer)
    hotel_description = Column(String)
    feels_like = Column(Float)
    humidity = Column(Float)
    wind_speed = Column(Float)
    city_id = Column(Float)
    main_weather = Column(String)
    main_weather_scores = Column(Float)
    final_score = Column(Float)
    lon = Column(Float)
    lat = Column(Float)
    hotel_lat = Column(Float)
    hotel_lon = Column(Float)

    def __repr__(self):
    # obligé de l'écrire comme ca exactement
        # return "<User(name='{}', fullname='{}', nickname='{}')>".format(self.name, self.fullname, self.nickname)
        columns = [column.name for column in Scraping.__table__.columns]
        values = [getattr(self, column) for column in columns]
        values_str = ', '.join([f'{column}={value}' for column, value in zip(columns, values)])
        return f"<Scraping({values_str})>"

### Load

In [16]:
# Create a SQLAlchemy engine
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")

In [17]:
Base.metadata.create_all(engine)
# Creates the database schema based on our SQLAlchemy model definition (so the Base instance with the data 
# associated, like the table Hotels)

# In PGadmin, refresh then go to: project-getaround-db/Databases/postgres/Schema/public/Tables/Hotels
# Command QUERY TOOL to code in SQL

In [18]:
# Add all the data from the dataset
df_transformed.to_sql("Hotels", engine, if_exists="replace", index=False)

90

# Pick up best hotels

In [19]:
# Imagine now we are someone else from the company, querying the created SQL database to find out what are
# 20 best hotels for a 5 days trip in one of the 5top cities.

# The person would first create a new engine and a new session
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")
Session = sessionmaker(bind=engine)
session = Session()

In [20]:
# Do we have comparable number of hotels for the 5 cities ?

query = """
SELECT city, COUNT(DISTINCT hotel_name) AS num_hotel_names
FROM public."Hotels"
GROUP BY city
"""

hotels_per_city = pd.read_sql(query, engine)
display(hotels_per_city)

print("There are quite similar number of hotels for each city.")

Unnamed: 0,city,num_hotel_names
0,Bayonne,14
1,Biarritz,17
2,La Rochelle,22
3,Le Havre,18
4,Saintes Maries de la mer,19


There are quite similar number of hotels for each city.


In [21]:
# Find the 20 best hotels

query = """
SELECT city, hotel_name, hotel_score, hotel_voters, hotel_lat, hotel_lon
FROM public."Hotels"
ORDER BY hotel_score DESC
LIMIT 20
"""

# We need quotes for hotel_GPS because of the capital letters creating problem

top20_hotels = pd.read_sql(query, engine)
display(top20_hotels)

Unnamed: 0,city,hotel_name,hotel_score,hotel_voters,hotel_lat,hotel_lon
0,La Rochelle,Verdun 2,9.8,5,46.16326,-1.152714
1,Saintes Maries de la mer,"Mas du Couvin, maison d'hôtes en Camargue",9.7,80,43.500389,4.427241
2,Le Havre,Bel appartement centre Le Havre,9.4,13,49.493963,0.118168
3,La Rochelle,Appartement neuf au cœur de La Rochelle,9.3,11,46.1582,-1.15454
4,La Rochelle,La Rochelle sur L'eau,9.3,224,46.145605,-1.161307
5,Bayonne,Villa la Renaissance,9.2,348,43.482321,-1.468164
6,Biarritz,Hotel Saint Julien,9.1,440,43.479139,-1.562275
7,Saintes Maries de la mer,Mas le Sauvageon,9.1,225,43.483248,4.390455
8,Saintes Maries de la mer,Lodge Sainte Helene,9.1,122,43.465756,4.413934
9,Biarritz,Appartement Halles de Biarritz,9.0,6,43.479954,-1.561656


In [22]:
fig = px.scatter_mapbox(
    top20_hotels, 
    title="Top 20 hotels for our kite surfing trip", 
    lat="hotel_lat", 
    lon="hotel_lon", 
    zoom=5, 
    mapbox_style="open-street-map",
    size="hotel_score",
    color = 'city',
    hover_name= 'hotel_name',
    hover_data = ['city', 'hotel_score','hotel_voters'],

)


fig.show()
fig.write_html("final pictures/top_20_hotels_interactive.html")

![alt text](https://raw.githubusercontent.com/elodiesune/PROJECT_Kayak/main/final%20pictures/top20_hotels_picture.jpg)

In [23]:
# Close the session
session.close()