In [31]:
#imports
import csv
import json
import requests
import traceback
import pandas as pd 
from bs4 import BeautifulSoup
from collections import defaultdict 
from requests.exceptions import JSONDecodeError

1. Implement a pageload to get the JSON data with your scraper.

In [32]:
# Creating function to scrape data based on set parameters, then it creates json file based on the from/to text and depart date/return date.
def flight_save_data(from_city, to_city, depart_date, return_date, timeout=30):
    """Fetch one round-trip search result and save its JSON payload to disk.

    Sends a GET request to the search endpoint with the route and dates as
    query parameters. When the response body parses as JSON, it is written
    (indented by 4) to ``{from_city}-{to_city}_{depart_date}_to_{return_date}.json``
    in the current working directory and returned to the caller.

    Parameters
    ----------
    from_city, to_city : str
        Values sent as the ``from`` and ``to`` query parameters
        (this notebook passes airport codes such as "MAD" and "AUH").
    depart_date, return_date : str
        Values sent as the ``depart`` and ``return`` query parameters
        (this notebook passes dates such as "2024-03-09").
    timeout : float or tuple, optional
        Passed straight to ``requests.get`` so a stalled server cannot hang
        the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    object or None
        The decoded JSON payload, or ``None`` when the response body is not
        valid JSON (reported as "Route not available").

    Raises
    ------
    requests.exceptions.RequestException
        Connection problems and timeouts are not caught here.
    """
    url = "http://homeworktask.infare.lt/search.php"
    params = {
        "from": from_city,
        "to": to_city,
        "depart": depart_date,
        "return": return_date,
    }

    response = requests.get(url, params=params, timeout=timeout)

    # Only the JSON decoding step is guarded: it is the one call that can raise
    # JSONDecodeError, and a non-JSON body is how an unavailable route shows up.
    try:
        data = response.json()
    except JSONDecodeError:
        print("Route not available")
        return None

    file_name = f"{from_city}-{to_city}_{depart_date}_to_{return_date}.json"
    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

    print(f"Data saved to {file_name}")

    return data
              


def _option_values(page, select_id):
    """Return the non-empty ``value`` attributes of one ``<select>``'s options."""
    select = page.find("select", id=select_id)
    if select is None:
        raise ValueError(f"No <select id='{select_id}'> found on the search page")
    # .get() returns None for an <option> without a value attribute, whereas
    # option["value"] would raise KeyError before the truthiness filter ran.
    values = (option.get("value") for option in select.find_all("option"))
    return [value for value in values if value]


# Use bs4 to read the selectable origin/destination codes from the search form.
r = requests.get('http://homeworktask.infare.lt/', timeout=30)
html_file = r.text
soup = BeautifulSoup(html_file, "html.parser")

# All from/to options that actually carry a non-empty value.
from_options = _option_values(soup, "from")
to_options = _option_values(soup, "to")

# Departure/return dates used by every search in this notebook.
depart_date = "2024-03-09"
return_date = "2024-03-23"

# Optional full crawl. The task does not require scraping every route, so it is
# disabled by default; set SCRAPE_ALL_ROUTES = True to fetch and save the JSON
# for every distinct from/to pair.
SCRAPE_ALL_ROUTES = False

json_data_list = []  # JSON payloads of the routes that returned data
if SCRAPE_ALL_ROUTES:
    for from_city in from_options:
        for to_city in to_options:
            if from_city != to_city:
                json_data = flight_save_data(from_city, to_city, depart_date, return_date)
                if json_data:
                    json_data_list.append(json_data)

" Commenting as task does not require to scrape all flights json data.\n# let's create a loop to go through each flight and extract information.\njson_data_list = [] #creating empty list to store scraped json data\nfor from_city in from_options:\n    for to_city in to_options:\n        if from_city != to_city:\n            json_data = flight_save_data(from_city, to_city, depart_date, return_date)\n            if json_data:\n                json_data_list.append(json_data)\n"

2. Extract outbound and inbound flight data flying from MAD to AUH. You may choose any dates.

In [33]:
# flight_save_data already does everything this step needs: it requests the
# route, writes the JSON file named after the parameters, and returns the payload.
data = flight_save_data(
    from_city="MAD",
    to_city="AUH",
    depart_date=depart_date,
    return_date=return_date,
)



Data saved to MAD-AUH_2024-03-09_to_2024-03-23.json


3. Make outbound and Inbound flight combinations for each price category (roundtrip flights).

In [34]:
# Derive a one-way base fare for every fare family from the round-trip totals,
# so that all fare families and their prices are picked up dynamically.
journeys = data['body']['data']['journeys']
avails = data['body']['data']['totalAvailabilities']

# Round-trip total per recommendation id. If an id repeats, the last entry wins,
# which matches scanning the whole list and keeping the final match.
total_by_recommendation = {
    avail['recommendationId']: avail['total'] for avail in avails
}

outbound_journeys = [j for j in journeys if j['direction'] == 'I']
inbound_journeys = [j for j in journeys if j['direction'] == 'V']

fare_families = {}
for outbound in outbound_journeys:
    recommend_id = outbound['recommendationId']
    fare_family = outbound['fareFamily']['description']

    for inbound in inbound_journeys:
        # Only an outbound/inbound pair sold under the same recommendation and
        # the same fare family lets us halve the total into a one-way price.
        if inbound['recommendationId'] != recommend_id:
            continue
        if inbound['fareFamily']['description'] != fare_family:
            continue
        # Without a matching availability there is no total to split; skip the
        # pair instead of reusing a value left over from an earlier pair.
        if recommend_id not in total_by_recommendation:
            continue

        # NOTE(review): round() without ndigits rounds the round-trip base fare
        # to a whole number before it is halved - confirm this is intended.
        base_fare = round(
            total_by_recommendation[recommend_id]
            - outbound['importTaxAdl']
            - inbound['importTaxAdl']
        )
        fare_families[fare_family] = round(base_fare / 2, 2)

print(fare_families)

{'STANDARD': 179.5, 'FLEX': 2006.5}


4. Extract all available prices and calculate taxes for each combination.
* Data can be saved into a CSV file for examination.

In [35]:
def _leg_details(leg_journey):
    """Summarise one journey's first and (optional) second flight segment.

    Returns a dict with the airports, times and flight code of segment 1 and
    segment 2 plus the number of stops. Segment 2 fields are empty strings for
    a direct journey. Returns ``None`` when the journey does not have exactly
    one or two segments, so callers can skip it.
    """
    segments = leg_journey['flights']
    stops = len(segments) - 1
    if stops not in (0, 1):
        return None

    first = segments[0]
    details = {
        "stops": stops,
        "dep_airport_1": first['airportDeparture']['code'],
        "arr_airport_1": first['airportArrival']['code'],
        "departure_1": first['dateDeparture'],
        "arrival_1": first['dateArrival'],
        "flight_code_1": f"{first['companyCode']}{first['number']}",
        "dep_airport_2": '',
        "arr_airport_2": '',
        "departure_2": '',
        "arrival_2": '',
        "flight_code_2": '',
    }
    if stops == 1:
        second = segments[-1]
        details.update({
            "dep_airport_2": second['airportDeparture']['code'],
            "arr_airport_2": second['airportArrival']['code'],
            "departure_2": second['dateDeparture'],
            "arrival_2": second['dateArrival'],
            "flight_code_2": f"{second['companyCode']}{second['number']}",
        })
    return details


def _leg_columns(prefix, details):
    """Map one leg's details onto the CSV column names for ``prefix``."""
    return {
        f"{prefix} 1 airport departure": details["dep_airport_1"],
        f"{prefix} 1 airport arrival": details["arr_airport_1"],
        f"{prefix} 1 time departure": details["departure_1"],
        f"{prefix} 1 time arrival": details["arrival_1"],
        f"{prefix} 1 flight number": details["flight_code_1"],
        f"{prefix} 2 airport departure": details["dep_airport_2"],
        f"{prefix} 2 airport arrival": details["arr_airport_2"],
        f"{prefix} 2 time departure": details["departure_2"],
        f"{prefix} 2 time arrival": details["arrival_2"],
        f"{prefix} 2 flight number": details["flight_code_2"],
    }


# Rows of priced outbound/inbound pairs, plus the id assigned to each distinct
# itinerary so the different fare options of one itinerary share a group.
combinations = []
combination_ids = {}
seen_rows = set()  # hashable copies of the rows already stored, for de-duplication

# Each leg is described once from its OWN journey. Inbound details must come
# from the inbound journey, never from the outbound one it is paired with.
outbound_legs = [(j, _leg_details(j)) for j in journeys if j['direction'] == 'I']
inbound_legs = [(j, _leg_details(j)) for j in journeys if j['direction'] == 'V']

for journey, outbound in outbound_legs:
    if outbound is None:
        continue  # a journey without exactly one or two segments is skipped

    outbound_fare = journey['fareFamily']['description']
    total_outbound_tax = journey['importTaxAdl'] + journey['importTaxChd'] + journey['importTaxInf']

    for other_journey, inbound in inbound_legs:
        if inbound is None:
            continue

        inbound_fare = other_journey['fareFamily']['description']
        combined_price = fare_families[outbound_fare] + fare_families[inbound_fare]

        total_inbound_tax = other_journey['importTaxAdl'] + other_journey['importTaxChd'] + other_journey['importTaxInf']
        total_tax = round(total_outbound_tax + total_inbound_tax, 2)
        total_price = round(combined_price + total_tax, 2)

        # An itinerary is identified by the first segment of each direction.
        combination_identifier = '-'.join([
            outbound["dep_airport_1"],
            outbound["arr_airport_1"],
            outbound["departure_1"],
            outbound["arrival_1"],
            inbound["dep_airport_1"],
            inbound["arr_airport_1"],
            inbound["departure_1"],
            inbound["arrival_1"],
        ])
        if combination_identifier not in combination_ids:
            combination_ids[combination_identifier] = len(combination_ids) + 1

        combination_dict = {
            "Combination": combination_ids[combination_identifier],
            "Price": total_price,
            "Base Fare": combined_price,
            "Taxes": total_tax,
            "outbound fare": outbound_fare,
            "inbound fare": inbound_fare,
            **_leg_columns("outbound", outbound),
            **_leg_columns("inbound", inbound),
            "outbound stops": outbound["stops"],
            "inbound stops": inbound["stops"],
        }

        # Every row has the same keys in the same order, so the tuple of its
        # items is a faithful, hashable stand-in for dict equality.
        row_key = tuple(combination_dict.items())
        if row_key not in seen_rows:
            seen_rows.add(row_key)
            combinations.append(combination_dict)


sorted_combinations = sorted(combinations, key=lambda row: row["Combination"])

# Convert the list of combinations to a DataFrame
sorted_combinations_df = pd.DataFrame(sorted_combinations)

# Save the DataFrame to a CSV file
sorted_combinations_df.to_csv('combinations.csv', index=False)

5. Find the cheapest price option for each flight combination.
* Data can be saved into a CSV file for examination.

In [36]:
# Bucket every priced row under its combination id, keeping first-seen order.
grouped_data = defaultdict(list)
for row in sorted_combinations:
    combination_id = row['Combination']
    grouped_data[combination_id].append(row)

# One row per combination: the lowest 'Price' (the first row wins on a tie).
cheapest_flights = [
    min(rows, key=lambda row: row['Price'])
    for rows in grouped_data.values()
]

cheapest_flights_df = pd.DataFrame(cheapest_flights)
cheapest_flights_df.to_csv('cheapest_flights.csv', index=False)

6. Make sure that the scraper can work with flights having 1 connection (example routes: JFK-AUH, CPH-MAD).
Flights having 2 connections must be skipped.

In [37]:
# Keep only the cheapest rows whose outbound and inbound legs each have at most one stop.
filtered_combinations = [
    flight
    for flight in cheapest_flights
    if flight['outbound stops'] <= 1 and flight['inbound stops'] <= 1
]

print("Number of filtered combinations:", len(filtered_combinations))

Number of filtered combinations: 1


7. Make sure that scraper can work with any search parameter set (origin, destination, dates).

In [18]:
# flight_save_data accepts any origin, destination and date pair: it fetches the
# search result, writes it to a JSON file named after the parameters, and returns it.
# Change the two airport codes below to search another route; depart_date and
# return_date are the notebook-level variables defined next to the scraping code.

data = flight_save_data(
    from_city="MAD",
    to_city="AUH",
    depart_date=depart_date,
    return_date=return_date,
)


Data saved to MAD-AUH_2024-03-09_to_2024-03-23.json


8. Save extracted data into CSV file using multiple search parameter sets (choose 10 of any search
parameters you want).

In [38]:
# Drop the helper columns so the output matches the task's CSV example.
columns_to_drop = ['Combination', 'Base Fare', 'outbound fare', 'inbound fare', 'outbound stops', 'inbound stops']

# Build the final table from the stop-filtered rows, so that the filter for
# flights with more than one connection actually reaches the exported file.
# errors="ignore" keeps an empty result (a frame with no columns) from raising.
filtered_combinations_df = pd.DataFrame(filtered_combinations).drop(
    columns=columns_to_drop, errors="ignore"
)

# Save the DataFrame to a CSV file with final data as in the task's example
filtered_combinations_df.to_csv('final_data.csv', index=False)