In [1]:
import os
import random
import pandas as pd
import numpy as np

import catboost
from catboost import CatBoostClassifier

import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
bold = ['\033[1m', '\033[0m']

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
train = pd.read_csv('Airline.csv')
test = pd.read_csv('../data/FlightDelay/test.csv')
submission = pd.read_csv('../data/FlightDelay/sample_submission.csv', index_col=0)

In [4]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000005,4,13,1545.0,,0,0,EWR,11618,New York,DCA,11278,Virginia,199.0,Republic Airlines,UA,20452.0,N657RW,Not_Delayed
1,TRAIN_000006,1,20,1742.0,1903.0,0,0,EWR,11618,New Jersey,BOS,10721,Massachusetts,200.0,United Air Lines Inc.,UA,,N66825,Not_Delayed
2,TRAIN_000008,6,13,1420.0,1550.0,0,0,BWI,10821,Florida,CLT,11057,North Carolina,361.0,Southwest Airlines Co.,WN,19393.0,N765SW,Not_Delayed
3,TRAIN_000010,8,13,1730.0,1844.0,0,0,DCA,11278,Virginia,PIT,14122,Pennsylvania,204.0,Republic Airlines,AA,,N119HQ,Delayed
4,TRAIN_000012,1,12,1015.0,1145.0,0,0,CLE,11042,Ohio,DEN,11292,Colorado,1201.0,Southwest Airlines Co.,WN,,N8696E,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251990,TRAIN_999962,10,11,,2003.0,0,0,SAT,14683,Texas,ORD,13930,Illinois,1041.0,SkyWest Airlines Inc.,UA,20304.0,N152SY,Not_Delayed
251991,TRAIN_999963,5,2,1759.0,1926.0,0,0,LGA,12953,New York,DCA,11278,Virginia,214.0,Republic Airlines,DL,20452.0,N871RW,Delayed
251992,TRAIN_999969,10,10,940.0,1056.0,0,0,MFE,13256,Texas,IAH,12266,Texas,316.0,Mesa Airlines Inc.,,20378.0,N89321,Delayed
251993,TRAIN_999985,8,8,1914.0,2039.0,0,0,RDU,14492,North Carolina,JAX,12451,Florida,407.0,Frontier Airlines Inc.,F9,20436.0,N316FR,Not_Delayed


In [5]:
train = train.drop(columns=['Cancelled', 'Diverted'])

In [6]:
NaN_col = ['Origin_State', 'Destination_State', 'Airline', 'Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [7]:
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [8]:
train = train.dropna()

In [9]:
column_number = {}
for i, column in enumerate(submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

Done.


In [10]:
FOLDS=15
folds=StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed_everything(42))
preds, scores = [0,0], []

for n_fold, (train_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    print(f'===================================={n_fold+1}============================================')
    
    train_X, val_X = train_x.iloc[train_idx],train_x.iloc[val_idx]
    train_Y, val_Y = train_y.iloc[train_idx],train_y.iloc[val_idx]
    
    # Classifier
    model = catboost.CatBoostRegressor(learning_rate=0.05, iterations=500, verbose=0)
    model.fit(train_X, train_Y, eval_set=(val_X, val_Y), early_stopping_rounds=100, cat_features=list(train_x.select_dtypes("category")))
    
    # val 예측
    val_pred = model.predict(val_X) # val 예측
    val_score = model.predict(val_pred)
    
    # test 데이터 예측 결과 종합
    # kolds 횟수 만큼 나눠서 평균 값을 활용
    preds += model.predict(test_x, prediction_type='Probability') / FOLDS
    
    
    # val 예측 LogLoss 값 저장
    print(f'FOLD {n_fold+1} | LogLoss: {round(val_score, 4)}')
    scores.append(val_score)
    
    print(f'================================================================================\n\n')
    
print(f'Final LogLoss: {bold[0]}{round(np.mean(scores), 6)}{bold[1]}')

FOLD 1 | LogLoss: 0.2138


FOLD 2 | LogLoss: 0.3309


FOLD 3 | LogLoss: 0.3298


FOLD 4 | LogLoss: 0.2643


FOLD 5 | LogLoss: 0.2958


FOLD 6 | LogLoss: 0.29


FOLD 7 | LogLoss: 0.2761


FOLD 8 | LogLoss: 0.3076


FOLD 9 | LogLoss: 0.3121


FOLD 10 | LogLoss: 0.2469


FOLD 11 | LogLoss: 0.351


FOLD 12 | LogLoss: 0.3136


FOLD 13 | LogLoss: 0.3448


FOLD 14 | LogLoss: 0.3304


FOLD 15 | LogLoss: 0.3356


Final LogLoss: [1m0.302843[0m


In [11]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,Month,18.547515
1,Estimated_Arrival_Time,13.440933
2,Estimated_Departure_Time,12.576941
3,Day_of_Month,9.476305
4,Carrier_ID(DOT),7.03683
5,Airline,4.904972
6,Origin_State,4.690914
7,Carrier_Code(IATA),4.565361
8,Destination_Airport,4.488319
9,Destination_State,4.303337


In [12]:
for i in range(len(submission)):
    submission['Delayed'][i], submission['Not_Delayed'][i] = preds[i][0], preds[i][1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['Delayed'][i], submission['Not_Delayed'][i] = preds[i][0], preds[i][1]


In [13]:
submission.to_csv('submission15.csv')