In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv("/opt/spark_data/expo/1990.csv")

In [4]:
len(data)

5270893

# Target calculation

In [5]:
data["total_delay"] = data["ActualElapsedTime"] - data["CRSElapsedTime"]

In [6]:
data["total_delay"].head()

0   -5.0
1    8.0
2   -7.0
3   -8.0
4    3.0
Name: total_delay, dtype: float64

In [7]:
data.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'total_delay'],
      dtype='object')

In [8]:


# a = df.sample(10000, random_state=1)
# b = pd.merge(a, airport_df, left_on="Origin", right_on="iata")
# c = pd.merge(b, airport_df, left_on="Dest", right_on="iata")
# c.rename(columns={"lat_x": "origin_lat", "long_x": "origin_long", "lat_y": "dest_lat", "long_y": "dest_long"}, inplace=True)

# features = ["Year", "Month", "DayOfWeek", "CRSDepTime", "CRSArrTime", "CRSElapsedTime", "Distance", "origin_lat", "origin_long", "dest_lat", "dest_long"]
# model_df = c[features]
# model_df["total_delay"] = np.where(c["total_delay"] > 0, 1, 0)
# model_df.dropna(inplace=True)

data["total_delay"] = np.where(data["total_delay"] > 0, 1, 0)




In [9]:
data["total_delay"].head()

0    0
1    1
2    0
3    0
4    1
Name: total_delay, dtype: int64

In [10]:
data.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'total_delay'],
      dtype='object')

In [11]:
processed_data = data.drop(["ActualElapsedTime"], axis=1)

In [12]:
processed_data.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',
       'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode',
       'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay', 'total_delay'],
      dtype='object')

In [13]:
# Year, Month, DayOfWeek, CRSDepTime, CRSArrTime, CRSElapsedTime, 
# Distance, origin_lat, origin_long, dest_lat, dest_long

processed_data = processed_data[['Year', 'Month', 'DayOfWeek', 
                                 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 
                                 'Distance', 'total_delay']]




In [14]:
train, test = train_test_split(processed_data, test_size=1000_000, random_state=42)

In [15]:
print(len(train))
print(len(test))

4270893
1000000


In [16]:
test.head()

Unnamed: 0,Year,Month,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,total_delay
2548738,1990,6,5,900,1007,67,247.0,0
4774834,1990,11,3,1855,1934,39,106.0,0
4081781,1990,10,3,1930,2107,217,1494.0,1
1102728,1990,3,5,1654,1745,51,190.0,0
5230804,1990,12,6,1856,2015,79,395.0,0


In [17]:
roles = {
    "target": "total_delay"
}

In [18]:
task = Task(
    "binary",
)

In [19]:
automl = TabularAutoML(
    task=task,
    timeout=6000,
    general_params={
        "use_algos": [["lgb"]]
    },
    nested_cv_params={"cv": 2, "n_folds": None},
)

In [20]:
oof_pred = automl.fit_predict(train, roles=roles)

tdf ctr
<class 'lightautoml.transformers.base.UnionTransformer'>
<class 'lightautoml.transformers.base.UnionTransformer'>


In [21]:
test_pred = automl.predict(test)

In [22]:
not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

In [23]:
print(f"OOF score: {roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])}")
print(f"TEST score: {roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])}")

OOF score: 0.6944394386738393
TEST score: 0.7038865488142378
