<a href="https://colab.research.google.com/github/fotonn/Modbus-RTU---OPC-UA/blob/master/air_delay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import math

import pandas as pd
import seaborn as sns
import numpy as np


from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import xgboost

In [0]:
data_dir = Path('../input')

train_df = pd.read_csv(data_dir / 'train.csv')
test_df = pd.read_csv(data_dir / 'test.csv')
# Resolve the submission template through data_dir too, so that pointing
# data_dir elsewhere relocates every input file consistently.
sample_submission = pd.read_csv(data_dir / 'data_submit2 (1).csv')

# Short column names, assigned positionally over the original CSV headers.
field_names = ['date', 'flight', 'port_out', 'port_in', 'number', 'schedule_time_in', 'fact_time_in',
               'schedule_time_out', 'fact_time_out', 'delay_code', 'target']
test_field_names = ['date', 'flight', 'port_out', 'port_in', 'number', 'schedule_time_in', 'schedule_time_out']

train_df.columns = field_names
test_df.columns = test_field_names

# Missing values in every training column become 0 before the datetime
# conversion below (the test frame is not filled at this point).
train_df.fillna(0, inplace=True)

# Parse the timestamp columns of both frames.
train_time_columns = ['date', 'schedule_time_in', 'schedule_time_out', 'fact_time_in', 'fact_time_out']
for column in train_time_columns:
    train_df[column] = pd.to_datetime(train_df[column])

test_time_columns = ['date', 'schedule_time_in', 'schedule_time_out']
for column in test_time_columns:
    test_df[column] = pd.to_datetime(test_df[column])

# Names of the model input columns; later cells append to this list.
features = []

Читаем данные по аэропортам (https://data.humdata.org/dataset/ourairports-rus).
Нас интересует поле 'type' с размером аэропорта.
Делаем из его значений булевы признаки (boolean fields).

In [0]:
# Load the airport reference table and expose its 'iata_code' column under
# the 'port_out' name so it can be joined to the flight tables.
ports = pd.read_csv(data_dir / 'ru-airports.csv').rename(columns={'iata_code': 'port_out'})

# One 0.0/1.0 indicator column per airport 'type' value of interest.
airport_type_flags = {
    'large': 'large_airport',
    'medium': 'medium_airport',
    'small': 'small_airport',
    'closed': 'closed',
}
for flag_name, airport_type in airport_type_flags.items():
    ports[flag_name] = ports['type'].eq(airport_type).astype(float)

In [0]:
# 0.0/1.0 indicators for airports whose 'iso_region' is RU-MOS or RU-SPE.
for flag_name, region_code in (('msk', 'RU-MOS'), ('spb', 'RU-SPE')):
    ports[flag_name] = ports['iso_region'].eq(region_code).astype(float)

In [0]:
# Preview the first rows of the airport table.
ports.head()

Добавляем информацию о размере аэропорта отправления.

Для этого делаем merge с датасетом по аэропортам.

In [0]:
# Indicator columns to copy from the airport table, and the join key.
size_fields = ['large', 'medium', 'small', 'closed', 'msk', 'spb']
port_key = 'port_out'

# Keep one row per non-null airport code. pandas matches null keys against
# each other and repeats left rows for duplicated right keys, so without
# this guard the "left" merges below could add rows to train_df/test_df
# (and a longer test_df no longer lines up with the submission template).
ports_by_code = (
    ports[size_fields + [port_key]]
    .dropna(subset=[port_key])
    .drop_duplicates(subset=[port_key])
)

train_df = pd.merge(train_df, ports_by_code, on=[port_key], how='left')
test_df = pd.merge(test_df, ports_by_code, on=[port_key], how='left')

# Only the indicator columns are model features; the join key is not.
features.extend(size_fields)

Аналогично добавляем информацию о размере аэропорта прибытия.

In [0]:
# Re-label the airport table so the same indicators can be joined on 'port_in'.
ports = ports.rename(columns = {'large': 'large_in', 'medium': 'medium_in', 'small': 'small_in',
                                'closed': 'closed_in', 'msk': 'msk_in', 'spb': 'spb_in', 'port_out': 'port_in'})
size_fields = ['large_in', 'medium_in', 'small_in', 'closed_in', 'msk_in', 'spb_in']
port_key = 'port_in'

# Keep one row per non-null airport code. pandas matches null keys against
# each other and repeats left rows for duplicated right keys, so without
# this guard the "left" merges below could add rows to train_df/test_df
# (and a longer test_df no longer lines up with the submission template).
ports_by_code = (
    ports[size_fields + [port_key]]
    .dropna(subset=[port_key])
    .drop_duplicates(subset=[port_key])
)

train_df = pd.merge(train_df, ports_by_code, on=[port_key], how='left')
test_df = pd.merge(test_df, ports_by_code, on=[port_key], how='left')

# Only the indicator columns are model features; the join key is not.
features.extend(size_fields)

Фича "выходной"

In [0]:
# Add the day of week and a 0.0/1.0 weekend flag (dayofweek >= 5, i.e.
# Saturday/Sunday in pandas' Monday=0 numbering) to both frames.
for frame in (train_df, test_df):
    frame['weekday'] = pd.to_datetime(frame['date']).dt.dayofweek
    frame["weekend"] = frame["weekday"].ge(5).astype(float)

features.append('weekend')

Для трейн сета считаем задержку отправления и прибытия.

При этом задержки меньше 5 минут обнуляем, а задержки больше 60 минут ограничиваем 60 минутами.

In [0]:
# Delays shorter than MIN_DELAY_MINUTES (negative ones included) are treated
# as "no delay"; longer delays are capped at MAX_DELAY_MINUTES.
MIN_DELAY_MINUTES = 5
MAX_DELAY_MINUTES = 60


def _clipped_delay_minutes(fact_time, schedule_time):
    """Return ``fact_time - schedule_time`` in whole minutes as floats.

    Values below MIN_DELAY_MINUTES become 0 and values above
    MAX_DELAY_MINUTES become MAX_DELAY_MINUTES; missing values stay missing.

    The minutes are derived via ``dt.total_seconds()`` instead of
    ``astype('timedelta64[m]')``, because pandas 2.0+ rejects casting a
    timedelta column to minute resolution.
    """
    minutes = np.floor((fact_time - schedule_time).dt.total_seconds() / 60)
    minutes = minutes.mask(minutes < MIN_DELAY_MINUTES, 0)
    return minutes.clip(upper=MAX_DELAY_MINUTES)


train_df['time_in_delay'] = _clipped_delay_minutes(train_df['fact_time_in'], train_df['schedule_time_in'])
train_df['time_out_delay'] = _clipped_delay_minutes(train_df['fact_time_out'], train_df['schedule_time_out'])

Функция для генерации фич с группировкой:
1. Группируем по полям из списка groupby
2. Агрегируем по полю agg
3. Мерджим новые фичи в трейн и тест датасеты с префиксом name

In [0]:
def group_features(groupby, agg, name):
    """Add per-group mean features computed on ``train_df`` to both frames.

    The global ``train_df`` is grouped by ``groupby`` and, for column ``agg``,
    two statistics are computed per group:

    * ``<name>_mean``  - sum divided by the number of non-missing values;
    * ``<name>_mean2`` - sum divided by the number of non-zero values
      (missing when the group has no non-zero value and a zero sum).

    Both columns are left-merged into the global ``train_df`` and ``test_df``,
    which are rebound to the merged frames. Keys absent from the training
    data get missing values in ``test_df``.

    Args:
        groupby: column name or list of column names to group and join on.
        agg: name of the ``train_df`` column to aggregate.
        name: prefix for the two generated column names.

    Returns:
        The list ``[<name>_mean, <name>_mean2]`` of generated column names.
    """
    global train_df, test_df
    mean_all = '{}_mean'.format(name)
    mean_nonzero = '{}_mean2'.format(name)
    fields = [mean_all, mean_nonzero]

    stats = train_df.groupby(groupby)[agg].aggregate(['sum', 'count', np.count_nonzero]).fillna(0)
    stats[mean_all] = stats['sum'] / stats['count']
    stats[mean_nonzero] = stats['sum'] / stats['count_nonzero']
    stats = stats[fields]

    # Drop any copies left by a previous call with the same prefix. Otherwise
    # a re-run makes merge() emit '_x'/'_y' suffixed columns and the names
    # returned here no longer exist in the frames.
    train_df = pd.merge(train_df.drop(columns=fields, errors='ignore'), stats, on=groupby, how='left')
    test_df = pd.merge(test_df.drop(columns=fields, errors='ignore'), stats, on=groupby, how='left')

    return fields

Далее идет вызов функции для добавления фич:
1. по номеру рейса
2. по аэропорту отбытия
3. по аэропорту прибытия
4. по аэропорту отбытия и прибытия
5. по номеру самолета

Для каждого считается два вида среднего.

По всем рейсам и по рейсам с ненулевой задержкой.

In [0]:
# Group means per 'flight' for both delay columns ('flight_*' and 'flight2_*').
for delay_column, prefix in (('time_in_delay', 'flight'), ('time_out_delay', 'flight2')):
    features.extend(group_features('flight', delay_column, prefix))

In [0]:
# Group means per 'port_out' for both delay columns ('port_out_*' and 'port_out2_*').
for delay_column, prefix in (('time_in_delay', 'port_out'), ('time_out_delay', 'port_out2')):
    features.extend(group_features(['port_out'], delay_column, prefix))

In [0]:
# Group means per 'port_in' for both delay columns ('port_in_*' and 'port_in2_*').
for delay_column, prefix in (('time_in_delay', 'port_in'), ('time_out_delay', 'port_in2')):
    features.extend(group_features(['port_in'], delay_column, prefix))

In [0]:
# Group means per ('port_out', 'port_in') pair for both delay columns
# ('port_*' and 'port2_*').
for delay_column, prefix in (('time_in_delay', 'port'), ('time_out_delay', 'port2')):
    features.extend(group_features(['port_out', 'port_in'], delay_column, prefix))

In [0]:
# Group means per 'number' for both delay columns ('number_*' and 'number2_*').
for delay_column, prefix in (('time_in_delay', 'number'), ('time_out_delay', 'number2')):
    features.extend(group_features('number', delay_column, prefix))

In [0]:
# Preview the training frame after the feature-generation cells.
train_df.head()

In [0]:
# Preview the test frame after the feature-generation cells.
test_df.head()

In [0]:
# Replace every value that is still missing in either frame with 0.
for frame in (train_df, test_df):
    frame.fillna(0, inplace=True)

# Model inputs are the collected feature columns; the label is 'target'.
X_train, y_train = train_df[features], train_df['target']
X_test = test_df[features]

In [0]:
# Center every feature on its training-set mean. with_std=False disables
# variance scaling, and the test set reuses the training means.
scaler = StandardScaler(with_std=False).fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [0]:
# Show the final list of feature column names fed to the models.
features

In [0]:
%%time
regressor = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1,
                                  max_depth=7)
regressor.fit(X_train, y_train)

In [0]:
# Training-set MSE of the random forest (computed on the data it was fitted
# on, so it is not a validation score). Arguments follow the documented
# (y_true, y_pred) order; the value is unchanged because MSE is symmetric.
mean_squared_error(y_train, regressor.predict(X_train))

In [0]:
# Random-forest predictions for the test set. No later cell visible in this
# notebook reads y_test; the submissions below use y_xgb and y_cat.
y_test = regressor.predict(X_test)

In [0]:
# Hyper-parameters of the gradient-boosted tree regressor.
xgb_params = {
    'colsample_bytree': 0.4,
    'gamma': 0,
    'learning_rate': 0.07,
    'max_depth': 5,
    'min_child_weight': 1.5,
    'n_estimators': 100,
    'reg_alpha': 0.75,
    'reg_lambda': 0.45,
    'subsample': 0.6,
    'seed': 42,
}

# Fit on the full training set, then predict the test set.
xgb_model = xgboost.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
y_xgb = xgb_model.predict(X_test)

In [0]:
# Training-set MSE of the XGBoost model (computed on the data it was fitted
# on, so it is not a validation score). Arguments follow the documented
# (y_true, y_pred) order; the value is unchanged because MSE is symmetric.
mean_squared_error(y_train, xgb_model.predict(X_train))

In [0]:
# Put the XGBoost predictions into the submission template and save it.
prediction_column = 'Задержка отправления в минутах'
sample_submission[prediction_column] = y_xgb
sample_submission.to_csv('xgboostmsk.csv', index=False)

In [0]:
# Settings left over from a disabled train/test split; nothing in this cell
# reads them.
seed = 7
test_size = 0.33

# CatBoost regressor with the RMSE loss, fitted on the full training set.
catboost_params = dict(iterations=10000, learning_rate=0.1, depth=5, loss_function='RMSE')
model = CatBoostRegressor(**catboost_params)
model.fit(X_train, y_train)

# Predictions for the test set.
y_cat = model.predict(X_test)


In [0]:
# Put the CatBoost predictions into the submission template and save it.
prediction_column = 'Задержка отправления в минутах'
sample_submission[prediction_column] = y_cat
sample_submission.to_csv('cat10k.csv', index=False)

In [0]:
# Training-set MSE of the CatBoost model (computed on the data it was fitted
# on, so it is not a validation score). Arguments follow the documented
# (y_true, y_pred) order; the value is unchanged because MSE is symmetric.
mean_squared_error(y_train, model.predict(X_train))

mean_squared_error:

1996.6674058887513,

1435.0087880146991, 1434.3070641444779, 1433.945474164081