# Scraping web - Airlines

Ejemplo de web scraping del listado de aerolíneas de Flightradar24.

### IMPORTACIÓN DE LIBRERÍAS

In [13]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
import os

### REQUEST

In [14]:
# Address of the airline listing page that the notebook scrapes.
url = "https://www.flightradar24.com/data/airlines"

# HTTP headers sent with the page request; only a User-Agent is set.
headers = {
    "User-Agent": "Mozilla/5.0",
}

In [15]:
# Download the page with the custom headers. The response is used as a
# context manager so the underlying connection is always closed, and a
# timeout keeps the cell from hanging forever on an unresponsive server.
req = Request(url, headers=headers)
with urlopen(req, timeout=30) as page:
    raw_web = page.read()

# Parse the HTML and collect every <tbody> element for the table extraction.
soup = BeautifulSoup(raw_web, "html.parser")
table = soup.find_all("tbody")

### FUNCIONES

In [16]:
def divide_cods(template):
    """Split the combined code cell at index 2 into separate IATA/ICAO slots.

    The single element ``template[2]`` is replaced in place by exactly two
    elements, ``[IATA, ICAO]``, so the list always grows by one and every
    later element shifts one position to the right.

    Args:
        template: Mutable list of row values whose element 2 holds the raw
            code text, e.g. ``"2I / CSB"``, ``"2I"``, ``"CSB"`` or ``""``.

    Returns:
        The same list object that was passed in, after modification.
    """
    raw = template[2]

    if '/' in raw:
        # Case 1: IATA + ICAO. Split on the bare slash and strip each side so
        # both "2I / CSB" and "2I/CSB" produce two clean codes.
        iata, icao = (part.strip() for part in raw.split('/', 1))
        codes = [iata, icao]
    elif len(raw) == 2:
        codes = [raw, None]        # Case 2: IATA only
    elif len(raw) == 3:
        codes = [None, raw]        # Case 3: ICAO only
    else:
        codes = [None, None]       # Case 4: no recognizable code

    # Replace only the code element; a wider slice would overwrite the
    # value that follows it and leave the row one element short.
    template[2:3] = codes
    return template

In [17]:
def image_links(cells=None, row_values=None):
    """Append the logo URL found in the first cell to the row being built.

    Looks for ``<img>`` tags inside ``cells[0]`` and appends one value to
    ``row_values``: the first image's ``data-bn-lazy-src`` attribute when
    present, otherwise its ``src`` attribute, otherwise ``None``. ``None``
    is also appended when the cell contains no image at all.

    Args:
        cells: Sequence of table cells whose first element exposes
            ``find_all``. Defaults to the module-level ``cols``.
        row_values: List that receives the URL. Defaults to the
            module-level ``template``.

    Returns:
        None. The result is delivered by mutating ``row_values``.
    """
    # Fall back to the notebook's globals so existing zero-argument calls
    # keep working unchanged.
    if cells is None:
        cells = cols
    if row_values is None:
        row_values = template

    img_tags = cells[0].find_all('img')
    if not img_tags:
        row_values.append(None)  # no image in the cell
        return

    # Prefer the lazy-loading attribute when it exists; an image carrying
    # neither attribute yields None instead of raising KeyError.
    attrs = img_tags[0].attrs
    row_values.append(attrs.get('data-bn-lazy-src', attrs.get('src')))
    

### DATA TABLE

In [18]:
# Accumulates one list per airline row:
# [name, IATA code, ICAO code, aircraft count, logo URL].
table_data = []

for table_body in table:
    # Keep the <tr> elements whose class is not 'header', then drop the first
    # of the remaining rows as well.
    rows = table_body.find_all('tr', class_=lambda x: x != 'header')[1:]

    # The row loop is nested so that every <tbody> is processed (not only
    # the last one) and an empty `table` simply produces no rows.
    for row in rows:
        # NOTE: `cols` and `template` must remain module-level names because
        # image_links() reads them as globals when called without arguments.
        cols = row.find_all('td')[1:5]
        if len(cols) < 4:
            # Not a full data row; indexing template[3] below would fail.
            continue

        template = [col.text.strip() for col in cols]
        template[3] = template[3].replace(' aircraft', '')  # keep only the aircraft count
        divide_cods(template)   # split the combined IATA/ICAO cell into two slots
        image_links()           # append the logo URL (or None) to `template`
        template.pop(0)         # discard the text of the first (logo) cell
        table_data.append(template)


### DATAFRAME

In [19]:
# Column labels, in the same order as the values stored in each row of
# `table_data`.
table_headers = [
    'aircraft_name',
    'cod_IATA',
    'cod_ICAO',
    'aircraft_num',
    'logo_link',
]

# Build the DataFrame from the scraped rows and display it as the cell output.
df = pd.DataFrame(data=table_data, columns=table_headers)
df

Unnamed: 0,aircraft_name,cod_IATA,cod_ICAO,aircraft_num,logo_link
0,21 Air,2I,CSB,2,https://images.flightradar24.com/assets/airlin...
1,2Excel Aviation,,BRO,25,https://images.flightradar24.com/assets/airlin...
2,748 Air Services,FE,IHO,5,https://images.flightradar24.com/assets/airlin...
3,9 Air,AQ,JYH,23,https://images.flightradar24.com/assets/airlin...
4,Abakan Air,S5,NKP,9,
...,...,...,...,...,...
1655,Zil Air,,SYZ,1,
1656,Zimex Aviation,XM,IMX,20,https://images.flightradar24.com/assets/airlin...
1657,Zipair,ZG,TZP,8,https://images.flightradar24.com/assets/airlin...
1658,Zoom Air,ZO,ZOM,1,


### DESCARGA DE IMÁGENES

In [20]:
# Optionally download every airline logo listed in `df` into a folder on the
# user's Desktop, one PNG file per airline.
respuesta = input("Deseas descargar todas las imagenes? [Y / N]")
if respuesta.strip().upper() == 'Y':
    directorio_destino = f"{Path.home()}/Desktop/ImagenesLogosAirlines"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directorio_destino, exist_ok=True)

    # Characters that are path separators or invalid in file names on common
    # filesystems; they are replaced so the airline name is a safe file name.
    caracteres_invalidos = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

    for _, fila in df.iterrows():
        # A dedicated name is used so the page `url` defined earlier in the
        # notebook is not overwritten by the last logo address.
        logo_url = fila['logo_link']
        if not isinstance(logo_url, str) or not logo_url:
            continue  # no logo for this airline (None, NaN or empty string)

        nombre_seguro = str(fila['aircraft_name']).translate(caracteres_invalidos)
        nombre_archivo = f"{nombre_seguro}.png"
        ruta_completa = os.path.join(directorio_destino, nombre_archivo)

        try:
            response = requests.get(logo_url, timeout=30)
        except requests.RequestException:
            # One failed download must not abort the remaining ones.
            print(f"No se pudo descargar la imagen desde {logo_url}")
            continue

        if response.status_code == 200:
            with open(ruta_completa, 'wb') as f:
                f.write(response.content)
            print(f"Imagen descargada y guardada como {ruta_completa}")
        else:
            print(f"No se pudo descargar la imagen desde {logo_url}")

