In [None]:
#imports
import csv
import json
import requests
import traceback
import pandas as pd 
from bs4 import BeautifulSoup
from collections import defaultdict 
from requests.exceptions import JSONDecodeError

1. Implement a pageload to get the JSON data with your scraper.

In [8]:
# Creating function to scrape data based on set parameters. 
def flight_save_data(from_city, to_city, depart_date, return_date):
    url = "http://homeworktask.infare.lt/search.php"
    params = {"from": from_city, "to": to_city, "depart": depart_date, "return": return_date}
    
    try:
        response = requests.get(url, params=params)
        data = response.json()
        file_name = f"{from_city}-{to_city}_{depart_date}_to_{return_date}.json"
        with open(file_name, 'w') as file:
            json.dump(data, file, indent=4)
        print(f"Data saved to {file_name}")
        return data
    except Exception as e:
        print(f"Route not available")
        return None
              


# using bs4 to scrape from/to options 
r = requests.get(f'http://homeworktask.infare.lt/')
html_file = r.text
soup = BeautifulSoup(html_file, "html.parser")

# Get all from and to options, added option - if option["value"] which checks if value even exists
from_options = [option["value"] for option in soup.find("select", id="from").find_all("option") if option["value"]]
to_options = [option["value"] for option in soup.find("select", id="to").find_all("option") if option["value"]]


# select depart/return dates.
depart_date = "2024-03-03"
return_date =  "2024-03-29"

""" Commenting as task does not require to scrape all json data.
# let's create a loop to go through each flight and extract information.
json_data_list = [] #creating empty list to store scraped json data
for from_city in from_options:
    for to_city in to_options:
        if from_city != to_city:
            json_data = flight_save_data(from_city, to_city, depart_date, return_date)
            if json_data:
                json_data_list.append(json_data)
"""               

" Commenting as task does not require to scrape all json data.\n# let's create a loop to go through each flight and extract information.\njson_data_list = [] #creating empty list to store scraped json data\nfor from_city in from_options:\n    for to_city in to_options:\n        if from_city != to_city:\n            json_data = flight_save_data(from_city, to_city, depart_date, return_date)\n            if json_data:\n                json_data_list.append(json_data)\n"

2. Extract outbound and inbound flight data flying from MAD to AUH. You may choose any dates.

In [10]:
# we can re-use our function flight_save_data to gather required data
# it also creates a json file with the given parameters.

data = flight_save_data("MAD","AUH", depart_date, return_date)



Data saved to MAD-AUH_2024-03-03_to_2024-03-29.json


3. Make outbound and Inbound flight combinations for each price category (roundtrip flights).

In [4]:
# All our craped data
price_data = json_data

# empty list for flight combinations
flight_combinations = []

# loop to extract the price that is nested in body/data/totaAvailabilities
for prices in price_data["body"]["data"]["totalAvailabilities"]:
    recommendation_id = prices["recommendationId"]
    total_price = prices["total"]
    flight_combinations.append({"recommendationId": recommendation_id, "totalPrice": total_price})

# print total price for each category
for combination in flight_combinations:
    print(f"Recommendation ID: {combination['recommendationId']}, Total Price: {combination['totalPrice']}")



Recommendation ID: 3, Total Price: 375.38
Recommendation ID: 10, Total Price: 446.38
Recommendation ID: 11, Total Price: 454.38
Recommendation ID: 17, Total Price: 504.38
Recommendation ID: 21, Total Price: 525.38
Recommendation ID: 22, Total Price: 525.38
Recommendation ID: 27, Total Price: 575.38
Recommendation ID: 28, Total Price: 575.38
Recommendation ID: 31, Total Price: 603.38
Recommendation ID: 34, Total Price: 654.38
Recommendation ID: 35, Total Price: 654.38
Recommendation ID: 36, Total Price: 689.38
Recommendation ID: 37, Total Price: 704.38
Recommendation ID: 38, Total Price: 725.38
Recommendation ID: 43, Total Price: 760.38
Recommendation ID: 44, Total Price: 760.38
Recommendation ID: 49, Total Price: 796.38
Recommendation ID: 50, Total Price: 796.38
Recommendation ID: 60, Total Price: 838.38
Recommendation ID: 61, Total Price: 838.38
Recommendation ID: 69, Total Price: 875.38
Recommendation ID: 70, Total Price: 875.38
Recommendation ID: 71, Total Price: 889.38
Recommendati

4. Extract all available prices and calculate taxes for each combination.
* Data can be saved into a CSV file for examination.

In [5]:
# Loop to go through each flight and calculate taxes/base fare
for combination in flight_combinations:
    recommendation_id = combination["recommendationId"]
    journeys = [journey for journey in price_data["body"]["data"]["journeys"] if journey["recommendationId"] == recommendation_id]
    
    # summing taxes for the selected journeys
    total_taxes = sum(journey.get("importTaxAdl", 0) for journey in journeys)

    # Calculating base fare by subtracting total taxes from total price
    base_fare = combination["totalPrice"] - total_taxes

    combination["totalTaxes"] = total_taxes
    combination["baseFare"] = base_fare

# create csv headers(columns)
header = ['Recommendation ID', 'Price', 'Taxes', 'Base Fare']

# create/open csv file and put the info there
with open('price_tax_combination.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    for combination in flight_combinations:
        writer.writerow({
            'Recommendation ID': combination['recommendationId'],
            'Price': combination['totalPrice'],
            'Taxes': combination['totalTaxes'],
            'Base Fare': combination['baseFare']
        })
# Output is now accesible via "price_tax_combination.csv" file

5. Find the cheapest price option for each flight combination.
* Data can be saved into a CSV file for examination.

In [6]:
# extracting flight combinations
flight_combinations = json_data["body"]["data"]["journeys"]

# finding the cheapest flights for each combination
cheapest_combinations = {}

for journey in flight_combinations:
    recommendation_id = journey["recommendationId"]
    total_price = next((item["total"] for item in json_data["body"]["data"]["totalAvailabilities"] if item["recommendationId"] == recommendation_id), None)

    # Creating a unique identifier using flight numbers and dates
    identifier = '-'.join(f"{flight['number']}-{flight['dateDeparture']}" for flight in journey["flights"])

    if identifier not in cheapest_combinations or total_price < cheapest_combinations[identifier]["totalPrice"]:
        cheapest_combinations[identifier] = {
            "recommendationId": recommendation_id,
            "totalPrice": total_price
        }

# Writing tp csv file
with open('cheapest_combinations.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Unique Identifier', 'Recommendation ID', 'Total Price'])

    for identifier, data in cheapest_combinations.items():
        writer.writerow([identifier, data["recommendationId"], data["totalPrice"]])

# Access combinations csv via "cheapest_combinations.csv" file

6. Make sure that scraper can work with flights having 1 connection (example routes: JFK -AUH, CPH-
MAD). Flights having 2 connections must be skipped.

In [7]:
# create list to store filtered flight combinations
filtered_combinations = []

for combination in flight_combinations:
    
    journeys = [journey for journey in price_data["body"]["data"]["journeys"] if journey["recommendationId"] == combination["recommendationId"]]

    # Check if all journeys have 0 or 1 connection
    # Append to filtered_combinations if it's true
    if all(len(journey["flights"]) <= 2 for journey in journeys):
        filtered_combinations.append(combination)

print("Number of filtered combinations:", len(filtered_combinations))


Number of filtered combinations: 239


7. Make sure that scraper can work with any search parameter set (origin, destination, dates).

In [None]:
# To gather and save data, we can use our function called flight_save_data, that would gather info about the flight, create json file and put the info there.

flight_save_data("JFK","FUE", "2024-02-05", "2024-02-15")


8. Save extracted data into CSV file using multiple search parameter sets (choose 10 of any search
parameters you want).

In [12]:
# I will be using ALL scraped flight data from my first task

# empty list to store all flights
all_flights = []

# Use given headers as per CSV Homework example
headers = [
    "Price", "Taxes", 
    "outbound 1 airport departure", "outbound 1 airport arrival", "outbound 1 time departure", 
    "outbound 1 time arrival", "outbound 1 flight number",
    "outbound 2 airport departure", "outbound 2 airport arrival", "outbound 2 time departure", 
    "outbound 2 time arrival", "outbound 2 flight number",
    "inbound 1 airport departure", "inbound 1 airport arrival", "inbound 1 time departure", 
    "inbound 1 time arrival", "inbound 1 flight number",
    "inbound 2 airport departure", "inbound 2 airport arrival", "inbound 2 time departure", 
    "inbound 2 time arrival", "inbound 2 flight number"
]

# this loop goes through each journey nested in body/data
for journey in json_data["body"]["data"]["journeys"]:
    # Initialize flight_data with None for all fields
    flight_data = {key: None for key in headers}

    # Check if the journey is inbound or outbound
    direction = 'inbound' if journey["direction"] == 'I' else 'outbound'

    for i, segment in enumerate(journey["flights"], start=1):
        prefix = f"{direction} {i} "  # 'inbound 1', 'outbound 2', etc.
        # Extract and store flight details
        flight_data[prefix + "airport departure"] = segment["airportDeparture"]["code"]
        flight_data[prefix + "airport arrival"] = segment["airportArrival"]["code"]
        flight_data[prefix + "time departure"] = segment["dateDeparture"]
        flight_data[prefix + "time arrival"] = segment["dateArrival"]
        flight_data[prefix + "flight number"] = segment["number"]

    # Find the price information for the journey
    recommendation_id = journey["recommendationId"]
    price_info = next((item for item in json_data["body"]["data"]["totalAvailabilities"] if item["recommendationId"] == recommendation_id), None)
    # If price information is found, add price and tax details to the flight data
    if price_info:
        flight_data["Price"] = price_info["total"]
        flight_data["Taxes"] = sum(journey.get("importTaxAdl", 0) for journey in json_data["body"]["data"]["journeys"] if journey["recommendationId"] == recommendation_id)

    all_flights.append(flight_data)


# Write data to CSV
with open('flights_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()
    for flight in all_flights:
        writer.writerow(flight)
# Now it's possible to see the scraped data via file "Flights_data.csv"