## Overview

Welcome to the **Software Engineer (ML & LLMs)** Application Challenge.
In this challenge, you will get a closer look at part of the day-to-day reality of the role and demonstrate your skills and knowledge in machine learning and cloud.

## Problem

Predict the probability of **delay** for a flight taking off or landing at SCL airport.

|Column|Description|
|-----|-----------|
|`Fecha-I`|Scheduled date and time of the flight.|
|`Vlo-I`|Scheduled flight number.|
|`Ori-I`|Programmed origin city code.|
|`Des-I`|Programmed destination city code.|
|`Emp-I`|Scheduled flight airline code.|
|`Fecha-O`|Date and time of flight operation.|
|`Vlo-O`|Flight operation number of the flight.|
|`Ori-O`|Operation origin city code.|
|`Des-O`|Operation destination city code.|
|`Emp-O`|Airline code of the operated flight.|
|`DIA`|Day of the month of flight operation.|
|`MES`|Number of the month of operation of the flight.|
|`AÑO`|Year of flight operation.|
|`DIANOM`|Day of the week of flight operation.|
|`TIPOVUELO`|Type of flight, I = International, N = National.|
|`OPERA`|Name of the airline that operates.|
|`SIGLAORI`|Origin city name.|
|`SIGLADES`|Destination city name.|

|Column|Description|
|-----|-----------|
|`high_season`|1 if `Fecha-I` is between Dec-15 and Mar-3, or Jul-15 and Jul-31, or Sep-11 and Sep-30, 0 otherwise.|
|`min_diff`|Difference in minutes between `Fecha-O` and `Fecha-I`.|
|`period_day`|morning (between 5:00 and 11:59), afternoon (between 12:00 and 18:59) or night (between 19:00 and 4:59), based on `Fecha-I`.|
|`delay`|1 if `min_diff` > 15, 0 if not.|

In [None]:
import pandas as pd

# Load the raw flights dataset (path is relative to the notebook's working directory).
data = pd.read_csv('./04/flights.csv')
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# `min_diff`: difference in minutes between `Fecha-O` and `Fecha-I`

from datetime import datetime

def get_min_diff(data):
    """Return the signed gap, in minutes, between operated and scheduled time.

    `data` is a single row (anything indexable by column name) whose
    "Fecha-O" and "Fecha-I" entries are "YYYY-MM-DD HH:MM:SS" strings.
    A positive result means the flight operated later than scheduled;
    a negative one means it operated earlier.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"  # e.g. 2017-01-01 23:30:00
    # Parse the operated timestamp first, then the scheduled one.
    operated_at, scheduled_at = (
        datetime.strptime(data[column], timestamp_format)
        for column in ("Fecha-O", "Fecha-I")
    )
    return (operated_at - scheduled_at).total_seconds() / 60

# Compute the operated-minus-scheduled gap (in minutes) row by row.
data["min_diff"] = data.apply(get_min_diff, axis=1)

In [None]:
# Preview the first rows to check the newly added `min_diff` column.
data.head()

In [None]:
import numpy as np

# Label a flight as delayed (1) when it operated more than 15 minutes after
# its scheduled time, otherwise 0.
data["delay"] = np.where(data["min_diff"] > 15, 1, 0)

In [None]:
# Preview the first rows to check the newly added `delay` column.
data.head()

In [None]:
# Class balance: number of rows with delay == 0 versus delay == 1.
data['delay'].value_counts()

In [None]:
# Count flights per airline (OPERA), separately for delayed and non-delayed
# flights. The intermediate `.head()` calls were dropped: their results were
# discarded, because a notebook cell only displays its last expression.
opera_by_delay = data[data["delay"]==1]["OPERA"].value_counts()
opera_by_not_delay = data[data["delay"]==0]["OPERA"].value_counts()

# Stack the two count Series as rows so each bar is one outcome and each
# stacked segment is one airline.
df_by_opera = pd.DataFrame([opera_by_delay, opera_by_not_delay])
df_by_opera.index = ["Delay", "No Delay"]
df_by_opera.plot(kind="bar", stacked=True, title="Delay/NoDelay by OPERA")

In [None]:
# Number of flights per airline.
data["OPERA"].value_counts()

In [None]:
# Number of flights per day of the month.
data["DIA"].value_counts()

In [None]:
# Number of flights per month.
data["MES"].value_counts()

In [None]:
# Number of flights per flight type (I = international, N = national).
data["TIPOVUELO"].value_counts()

In [None]:
# Count flights per flight type (TIPOVUELO), separately for delayed and
# non-delayed flights. The intermediate `.head()` calls were dropped: their
# results were discarded, because a notebook cell only displays its last
# expression.
tipo_vuelo_by_delay = data[data["delay"]==1]["TIPOVUELO"].value_counts()
tipo_vuelo_by_not_delay = data[data["delay"]==0]["TIPOVUELO"].value_counts()

# Stack the two count Series as rows so each bar is one outcome and each
# stacked segment is one flight type.
df_by_tipo_vuelo = pd.DataFrame([tipo_vuelo_by_delay, tipo_vuelo_by_not_delay])
df_by_tipo_vuelo.index = ["Delay", "No Delay"]
df_by_tipo_vuelo.plot(kind="bar", stacked=True, title="Delay/NoDelay by TipoVuelo")

In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of delayed flights per destination city (SIGLADES).
flights_delayed_by_destination = data[data["delay"]==1]["SIGLADES"].value_counts()
# Wide, short canvas so the many destination labels fit along the x axis.
plt.figure(figsize = (10, 2))
sns.set(style="darkgrid")
# One bar per destination; bar height is the delayed-flight count.
sns.barplot(x=flights_delayed_by_destination.index, y = flights_delayed_by_destination.values)
plt.title("Flights delayed by destination")
# Rotate the destination names vertically so they do not overlap.
plt.xticks(rotation = 90)
plt.show()


In [None]:
# |`period_day`|
# morning (between 5:00 and 11:59)
# afternoon (between 12:00 and 18:59)
# night (between 19:00 and 4:59), based on `Fecha-I`.|

from datetime import datetime

def get_period_day(date):
    """Classify a scheduled timestamp into a period of the day.

    Args:
        date: Timestamp string in "YYYY-MM-DD HH:MM:SS" format.

    Returns:
        "morning" for 05:00:00-11:59:59, "afternoon" for 12:00:00-18:59:59,
        and "night" for 19:00:00-04:59:59.

    Raises:
        ValueError: If `date` does not match the expected format.
    """
    hour = datetime.strptime(date, "%Y-%m-%d %H:%M:%S").hour
    # Deciding on the hour alone keeps every boundary inclusive: 05:00:00,
    # 12:00:00, 19:00:00 and 00:00:00 start their period, and the last minute
    # of each period (e.g. 11:59:30) stays inside it, so no time of day falls
    # through unclassified.
    if 5 <= hour < 12:
        return "morning"
    if 12 <= hour < 19:
        return "afternoon"
    return "night"


# Derive the period of the day from each flight's scheduled timestamp.
data["period_day"] = data["Fecha-I"].apply(get_period_day)

In [None]:
# Preview the first rows to check the newly added `period_day` column.
data.head()

In [None]:
# Count flights per period of the day, separately for delayed and non-delayed
# flights. The intermediate `.head()` calls were dropped: their results were
# discarded, because a notebook cell only displays its last expression.
period_day_by_delay = data[data["delay"]==1]["period_day"].value_counts()
period_day_by_not_delay = data[data["delay"]==0]["period_day"].value_counts()

# Stack the two count Series as rows so each bar is one outcome and each
# stacked segment is one period of the day.
df_by_period_day = pd.DataFrame([period_day_by_delay, period_day_by_not_delay])
df_by_period_day.index = ["Delay", "No Delay"]
df_by_period_day.plot(kind="bar", stacked=True, title="Delay/NoDelay by period_day")

In [None]:
# |`high_season`|
# 1 if `Fecha-I` is between Dec-15 and Mar-3
# or Jul-15 and Jul-31
# or Sep-11 and Sep-30
# 0 otherwise.|

from datetime import datetime

def is_high_season(fecha_raw):
    """Return 1 if the timestamp falls in a high-season window, else 0.

    High season covers, with both end days fully included:
    Dec-15 to Mar-3 (wrapping over the new year), Jul-15 to Jul-31 and
    Sep-11 to Sep-30.

    Args:
        fecha_raw: Timestamp string in "YYYY-MM-DD HH:MM:SS" format.

    Returns:
        1 for a high-season date, 0 otherwise.

    Raises:
        ValueError: If `fecha_raw` does not match the expected format.
    """
    fecha = datetime.strptime(fecha_raw, "%Y-%m-%d %H:%M:%S")
    # Compare on (month, day) only: the time of day is ignored, so a flight at
    # any hour of the last day of a window (e.g. Mar-3 10:00) still counts,
    # and no locale-dependent month-name parsing is needed.
    month_day = (fecha.month, fecha.day)
    # The Dec-15..Mar-3 window wraps the year end, so it is the union of
    # "on or after Dec-15" and "on or before Mar-3".
    if month_day >= (12, 15) or month_day <= (3, 3):
        return 1
    if (7, 15) <= month_day <= (7, 31):
        return 1
    if (9, 11) <= month_day <= (9, 30):
        return 1
    return 0

# Flag each flight as high season (1) or not (0) from its scheduled timestamp.
data["high_season"] = data["Fecha-I"].apply(is_high_season)

In [None]:
# Preview the first rows to check the newly added `high_season` column.
data.head()

In [None]:
# Count flights per high_season flag, separately for delayed and non-delayed flights.
high_season_by_delay = data[data["delay"]==1]["high_season"].value_counts()
high_season_by_not_delay = data[data["delay"]==0]["high_season"].value_counts()

# Stack the two count Series as rows so each bar is one outcome and each
# stacked segment is one high_season value.
df_by_high_season = pd.DataFrame([high_season_by_delay, high_season_by_not_delay])
df_by_high_season.index = ["Delay", "No Delay"]
df_by_high_season.plot(kind="bar", stacked=True, title="Delay/NoDelay by high season")

In [None]:
# Count flights per day of the week (DIANOM), separately for delayed and non-delayed flights.
dia_nombre_by_delay = data[data["delay"]==1]["DIANOM"].value_counts()
dia_nombre_by_not_delay = data[data["delay"]==0]["DIANOM"].value_counts()

# Stack the two count Series as rows so each bar is one outcome and each
# stacked segment is one day of the week.
df_by_dia_nombre = pd.DataFrame([dia_nombre_by_delay, dia_nombre_by_not_delay])
df_by_dia_nombre.index = ["Delay", "No Delay"]
df_by_dia_nombre.plot(kind="bar", stacked=True, title="Delay/NoDelay by DiaNombre")

In [None]:
# Re-check columns, dtypes and non-null counts now that the derived columns exist.
data.info()

In [None]:
# Persist the enriched dataset next to the raw file.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written
# as an extra unnamed first column — confirm this is intended.
data.to_csv('./04/flights-with-delay.csv')

In [None]:
# Scratch check: number of distinct origin city names. The trailing note
# below suggests the author intends to remove this cell.
data["SIGLAORI"].nunique() # delete