In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

In [2]:
scaler = MinMaxScaler()

Чтение датасетов

In [4]:
df = pd.read_csv('./bfd/train_dataset.csv')
df = df[(~df.TailNum.isnull()) & (~df.DepTime.isnull()) & (~df.AirTime.isnull())]

test = pd.read_csv('./bfd/test_dataset.csv').replace([np.nan, np.inf, -np.inf], 0)
test_ids = test.Id.values
test.drop('Id', axis=1, inplace=True)

Считаем простые признаки. Одновременно для трейн и тест выборок. Так меньше шансов ошибиться

In [5]:
df['speed'] = (60 * df.Distance/df.AirTime).replace([np.nan, np.inf, -np.inf], 0)
test['speed'] = (60 * test.Distance/test.AirTime).replace([np.nan, np.inf, -np.inf], 0)

In [6]:
df['diff_arr_dep_time'] = df.ArrTime - df.DepTime
test['diff_arr_dep_time'] = test.ArrTime - test.DepTime

In [7]:
df['elapsed_time'] = df.ArrTime + df.TaxiIn + df.TaxiOut
test['elapsed_time'] = test.ArrTime + test.TaxiIn + test.TaxiOut

In [8]:
def season(x):
    if x in [12, 1, 2]:
        return 'winter'
    elif x in [3,4,5]:
        return 'spring'
    elif x in [6,7,8]:
        return 'summer'
    else:
        return 'fall'

df['season'] = df.Month.apply(lambda x: season(x))
test['season'] = test.Month.apply(lambda x: season(x))

Перемешиваем датасет и разбиваем на 2 части. По одной части будем насчитывать статистику, а на другой будем обучать модель

In [9]:
df = shuffle(df)

x_count, x_train = train_test_split(df, test_size=0.3, random_state=42)
print(x_count.shape, x_train.shape)

(2399348, 19) (1028292, 19)


Дальше будет код для подсчета эмпирического(наблюдаемого) среднего по различным признакам. Обычно вместо того чтобы использовать среднее по историческим данным, используют так называемые <b>log odds</b>. Подумайте почему :)

In [10]:
import math
def logit(x):
    return math.log(x / (1-x))

def logit_vec(v):
    return logit(v.mean() + 0.00001) # guess why ?

In [11]:
day_mean = df.loc[:, ['DayOfWeek', 'target']].groupby('DayOfWeek').agg(logit_vec).reset_index()
day_mean.rename(columns={'target': 'day_mean_target'}, inplace=True)
day_mean.head()

Unnamed: 0,DayOfWeek,day_mean_target
0,1,-0.835936
1,2,-0.983948
2,3,-0.987021
3,4,-0.840437
4,5,-0.681012


In [12]:
month_mean = df.loc[:, ['Month', 'target']].groupby('Month').agg(logit_vec).reset_index()
month_mean.rename(columns={'target': 'month_mean_target'}, inplace=True)
month_mean.head()

Unnamed: 0,Month,month_mean_target
0,1,-0.716782
1,2,-0.563847
2,3,-0.612816
3,4,-0.947671
4,5,-0.985007


In [13]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
flight_c_volume = df.loc[:, ['target', 'UniqueCarrier']].groupby(['UniqueCarrier']).agg('count').reset_index()
flight_c_volume.rename(columns={'target': 'flight_carrier_volume'}, inplace=True)
flight_c_volume['flight_carrier_volume'] = scaler.fit_transform(flight_c_volume['flight_carrier_volume'].re)
flight_c_volume.head()

ValueError: Expected 2D array, got 1D array instead:
array=[ 126879.  292985.    3905.   74263.   96072.  146837.  221986.  137674.
   47752.  129324.   30532.  235645.  172009.   95265.  277287.  218996.
  223140.  592924.  181583.  122582.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [14]:
flight_volume = df.loc[:, ['target', 'TailNum']].groupby(['TailNum']).agg('count').reset_index()
flight_volume.rename(columns={'target': 'flight_volume'}, inplace=True)
flight_volume['flight_volume'] = scaler.fit_transform(flight_volume['flight_volume'])
flight_volume.head()

Unnamed: 0,TailNum,flight_volume
0,80009E,0.390379
1,80019E,0.395913
2,80059E,0.395913
3,80129E,0.407407
4,80139E,0.398467


In [15]:
season = x_count.loc[:, ['target', 'season']].groupby('season').agg(logit_vec).reset_index()
season.rename(columns={'target': 'season_empirical_mean'}, inplace=True)
season.head()

Unnamed: 0,season,season_empirical_mean
0,fall,-1.368798
1,spring,-0.841768
2,summer,-0.768482
3,winter,-0.563949


In [16]:
un_carrier = x_count.loc[:, ['target', 'UniqueCarrier']].groupby('UniqueCarrier').agg(logit_vec).reset_index()
un_carrier.rename(columns={'target': 'un_carrier_empirical_mean'}, inplace=True)
un_carrier.head()

Unnamed: 0,UniqueCarrier,un_carrier_empirical_mean
0,9E,-1.308809
1,AA,-0.664568
2,AQ,-2.060146
3,AS,-0.961066
4,B6,-0.857235


In [17]:
origin = x_count.loc[:, ['target', 'Origin']].groupby('Origin').agg(logit_vec).reset_index()
origin.rename(columns={'target': 'origin_empirical_mean'}, inplace=True)
origin.head()

Unnamed: 0,Origin,origin_empirical_mean
0,ABE,-1.33494
1,ABI,-1.653991
2,ABQ,-0.937903
3,ABY,-0.992502
4,ACK,0.072014


In [18]:
dest = x_count.loc[:, ['target', 'Dest']].groupby('Dest').agg(logit_vec).reset_index()
dest.rename(columns={'target': 'dest_empirical_mean'}, inplace=True)
dest.head()

Unnamed: 0,Dest,dest_empirical_mean
0,ABE,-0.879843
1,ABI,-0.634834
2,ABQ,-0.748326
3,ABY,-0.752761
4,ACK,-0.598793


По некоторым самолётам очень мало статистики. Поэтому здесь нужно либо воспользоваться обычным средним, либо размышлять над тем как сгладить средее. Поскольку это код бейзлайна, я просто воспользуюсь средним.

In [24]:
tailnum = x_count.loc[:, ['target', 'TailNum']].groupby('TailNum').agg('mean').reset_index()
tailnum.rename(columns={'target': 'tailnum_empirical_mean'}, inplace=True)
tailnum.head()

Unnamed: 0,TailNum,tailnum_empirical_mean
0,80009E,0.227907
1,80019E,0.203704
2,80059E,0.227692
3,80129E,0.219298
4,80139E,0.197836


In [25]:
x_train = pd.merge(x_train, season, on=['season'])
x_train = pd.merge(x_train, un_carrier, on=['UniqueCarrier'])
x_train = pd.merge(x_train, origin, on=['Origin'])
x_train = pd.merge(x_train, dest, on=['Dest'])
x_train = pd.merge(x_train, tailnum, on=['TailNum'])
x_train = pd.merge(x_train, flight_volume, on=['TailNum'])
x_train = pd.merge(x_train, month_mean, on=['Month'])
x_train = pd.merge(x_train, day_mean, on=['DayOfWeek'])
x_train = pd.merge(x_train, flight_c_volume, on=['UniqueCarrier'])

x_train.shape

(1028284, 28)

In [26]:
x_train = shuffle(x_train)

Будем пользоваться только перечисленными ниже фичами

In [29]:
numeric_features = [
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'DepTime',
    'ArrTime',
    'AirTime',
    'Distance',
    'TaxiIn',
    'TaxiOut'
]

numeric_features += ['season_empirical_mean',
                     'un_carrier_empirical_mean', 
                     'origin_empirical_mean', 
                     'dest_empirical_mean',
                     'tailnum_empirical_mean',
                     'flight_volume',
                     'day_mean_target',
                     'month_mean_target',
                     'flight_carrier_volume']

In [30]:
test2 = test.copy()

In [31]:
test = pd.merge(test, season, on=['season'], how='left')
test = pd.merge(test, un_carrier, on=['UniqueCarrier'], how='left')
test = pd.merge(test, origin, on=['Origin'], how='left')
test = pd.merge(test, dest, on=['Dest'], how='left')
test = pd.merge(test, tailnum, on=['TailNum'], how='left')
test = pd.merge(test, flight_volume, on=['TailNum'], how='left')
test = pd.merge(test, month_mean, on=['Month'], how='left')
test = pd.merge(test, day_mean, on=['DayOfWeek'], how='left')
test = pd.merge(test, flight_c_volume, on=['UniqueCarrier'], how='left')

test.shape

(3504864, 27)

In [32]:
x_train = x_train.loc[:, numeric_features + ['target']]
test = test.loc[:, numeric_features]
print(x_train.shape, test.shape)
x_train.head()

((1028284, 19), (3504864, 18))


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,AirTime,Distance,TaxiIn,TaxiOut,season_empirical_mean,un_carrier_empirical_mean,origin_empirical_mean,dest_empirical_mean,tailnum_empirical_mean,flight_volume,day_mean_target,month_mean_target,flight_carrier_volume,target
511392,10,2,4,1109.0,1225.0,118.0,862,6.0,12.0,-1.368798,-0.664568,-0.712925,-0.84741,0.32,0.264368,-0.840437,-1.378737,0.490782,0
282282,11,12,3,1720.0,1804.0,89.0,458,5.0,10.0,-1.368798,-1.308809,-1.232271,-1.333947,0.226471,0.419327,-0.987021,-1.287702,0.208778,0
628875,9,5,5,1514.0,1714.0,97.0,665,6.0,17.0,-1.368798,-0.973997,-0.746635,-0.966722,0.311594,0.245211,-0.681012,-1.433591,0.370244,0
361391,11,24,1,835.0,1019.0,141.0,931,2.0,21.0,-1.368798,-1.143698,-1.24523,-1.19867,0.254279,0.249468,-0.835936,-1.287702,0.285397,0
965778,7,24,4,1821.0,1932.0,175.0,1262,5.0,11.0,-0.768482,-0.672045,-0.859095,-1.04171,0.352394,0.471264,-0.840437,-0.791878,1.0,0


### Кросс валидация

Обычно её делают так

In [33]:
cv = cross_val_score(estimator=LogisticRegression(), 
                X=x_train.drop('target', axis=1), 
                y=x_train.target.values,
                cv=5, # you may use 3 or 5
                scoring='roc_auc', 
                n_jobs=-1,)
print(cv.mean(), cv.std())
print(cv)

(0.7003069254165022, 0.0016122306786420014)
[0.70287593 0.69820403 0.7012404  0.69985583 0.69935844]


### Обучение модели

In [35]:
test.fillna(0, inplace=True)

In [36]:
clf = LogisticRegression(C=1.0)
clf.fit(x_train.drop('target', axis=1), x_train.target.values)
submission = clf.predict_proba(test)[:, 1]

In [37]:
test_ids.shape

(3504864,)

In [38]:
submission.shape

(3504864,)

In [39]:
pd.DataFrame({'Id': test_ids, 'Prediction1': submission}).to_csv('mean_target_mining_2.csv', index=False)