In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import

In [34]:
import pandas as pd
import numpy as np
import random
import os
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from xgboost import XGBClassifier

In [35]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# csv to parquet
메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [36]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [37]:
csv_to_parquet('/content/drive/MyDrive/월간데이콘_항공편지연/train.csv', 'train')
csv_to_parquet('/content/drive/MyDrive/월간데이콘_항공편지연/test.csv', 'test')

train Done.
test Done.


# Data Load

In [38]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('/content/drive/MyDrive/월간데이콘_항공편지연/sample_submission.csv', index_col = 0)

# 결측치 확인

In [39]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              890985 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         890921 non-null   object 
 13  Distance                  10

# 통계적 정보 확인

In [40]:
train.describe(include='all')

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
count,1000000,1000000.0,1000000.0,890981.0,890960.0,1000000.0,1000000.0,1000000,1000000.0,890985,1000000,1000000.0,890921,1000000.0,891080,891010,891003.0,1000000,255001
unique,1000000,,,,,,,374,,52,375,,52,,28,11,,6430,2
top,TRAIN_000000,,,,,,,ORD,,California,ORD,,California,,Southwest Airlines Co.,UA,,N483HA,Not_Delayed
freq,1,,,,,,,50436,,103482,50171,,104347,,182113,206211,,530,210001
mean,,6.945156,15.764842,1341.153019,1493.295934,0.0,0.0,,12696.278484,,,12701.813986,,784.078499,,,19997.388093,,
std,,3.462506,8.763515,489.814011,520.803494,0.0,0.0,,1514.938441,,,1515.213044,,590.790469,,,404.268639,,
min,,1.0,1.0,1.0,1.0,0.0,0.0,,10135.0,,,10135.0,,16.0,,,19393.0,,
25%,,4.0,8.0,925.0,1105.0,0.0,0.0,,11292.0,,,11292.0,,350.0,,,19790.0,,
50%,,7.0,16.0,1332.0,1524.0,0.0,0.0,,12889.0,,,12889.0,,623.0,,,19977.0,,
75%,,10.0,23.0,1742.0,1924.0,0.0,0.0,,14057.0,,,14057.0,,1020.0,,,20378.0,,


In [41]:
train[['Cancelled','Diverted']].describe()

Unnamed: 0,Cancelled,Diverted
count,1000000.0,1000000.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


In [42]:
train.head(10)

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,
5,TRAIN_000005,4,13,1545.0,,0,0,EWR,11618,,DCA,11278,Virginia,199.0,Republic Airlines,UA,20452.0,N657RW,Not_Delayed
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,EWR,11618,New Jersey,BOS,10721,Massachusetts,200.0,United Air Lines Inc.,UA,,N66825,Not_Delayed
7,TRAIN_000007,4,20,1815.0,1955.0,0,0,ORD,13930,Illinois,MCI,13198,Missouri,403.0,,UA,20304.0,N110SY,
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,BWI,10821,,CLT,11057,North Carolina,361.0,Southwest Airlines Co.,WN,19393.0,N765SW,Not_Delayed
9,TRAIN_000009,6,6,650.0,838.0,0,0,LIT,12992,Arkansas,IAH,12266,Texas,374.0,ExpressJet Airlines Inc.,UA,20366.0,N14902,


# Data Pre-Processing

In [43]:
def to_days(x):
    month_to_days = {1:0, 2:31, 3:60, 4:91, 5:121, 6:152, 7:182, 8:213, 9:244, 10:274, 11:305, 12:335}
    return month_to_days[x]

train.loc[:, 'Day'] = train['Month'].apply(lambda x: to_days(x))
train['Day'] = train['Day'] + train['Day_of_Month']

test.loc[:, 'Day'] = test['Month'].apply(lambda x: to_days(x))
test['Day'] = test['Day'] + test['Day_of_Month']

train = train.astype({'Day':object})
test = test.astype({'Day':object})

print("Day Done.")

Day Done.


In [44]:
cond1 = train['Carrier_ID(DOT)'].isnull()
cond2 = ~train['Airline'].isnull()
print("Carrier_ID(DOT) 복구 가능한 데이터의 개수 :", len(train.loc[cond1 & cond2, :]))

Carrier_ID(DOT) 복구 가능한 데이터의 개수 : 97114


In [45]:
# airline to carrier id, dictinary 만들기
# 모두 데이터가 존재하는 열에서 Dict[Airline] = carrier_ID(DOT) 가 되도록 dictionary 생성
airline_to_cid = {}
for _, row in train[(~train['Carrier_ID(DOT)'].isnull() & ~train['Airline'].isnull())].iterrows():
    airline_to_cid[row['Airline']] = row['Carrier_ID(DOT)']

In [46]:
# 복구하기
def to_cid(x):
    return airline_to_cid[x]

cond1 = train['Carrier_ID(DOT)'].isnull()
cond2 = ~train['Airline'].isnull()
train.loc[cond1&cond2, 'Carrier_ID(DOT)'] = train.loc[cond1&cond2, 'Airline'].apply(lambda x: to_cid(x))

In [47]:
# 복구 안 된 row 빼기
train = train.dropna(subset=['Carrier_ID(DOT)'], how='any', axis=0)

In [48]:
# (Test Data Only)
# Airline, Carrier_Code 둘 다 없으면 최빈 값으로 대체
NaN_col = ['Carrier_ID(DOT)']
cond1 = test['Airline'].isnull()
cond2 = test['Carrier_ID(DOT)'].isnull()

for col in NaN_col:
    mode = test[col].mode()[0]
    test.loc[cond1&cond2, col] = mode

# 나머진 Airline에서 대체
cond1 = test['Carrier_ID(DOT)'].isnull()
cond2 = ~test['Airline'].isnull()
test.loc[cond1&cond2, 'Carrier_ID(DOT)'] = test.loc[cond1&cond2, 'Airline'].apply(lambda x: to_cid(x))

print("Cid Done.")

Cid Done.


In [49]:
col_drop = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 'Origin_Airport', 'Destination_Airport', 'Carrier_Code(IATA)', 'Airline', 'Origin_State', 'Destination_State']
train = train.drop(col_drop, axis=1)
test = test.drop(col_drop, axis=1)
print("Drop Done.")

Drop Done.


In [50]:
def to_minutes(x):
    x = int(x)
    x = str(x)
    if len(x) > 2:
        hours, mins = int(x[:-2]), int(x[-2:])
    else:
        hours, mins = 0, int(x[-2:])
    return hours*60+mins

estimated_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']

for ET in estimated_times:
    cond = ~train[ET].isnull()
    train.loc[cond, ET] = train.loc[cond, ET].apply(lambda x: to_minutes(x))
    cond2 = ~test[ET].isnull()
    test.loc[cond2, ET] = test.loc[cond2, ET].apply(lambda x: to_minutes(x))

In [51]:
train = train.dropna(subset=['Estimated_Arrival_Time', 'Estimated_Departure_Time'], how ='all', axis=0)

In [52]:
from collections import defaultdict
time_flying = defaultdict(int)
time_number = defaultdict(int)

cond_arr2 = ~train['Estimated_Arrival_Time'].isnull()
cond_dep2 = ~train['Estimated_Departure_Time'].isnull()

for _, row in train.loc[cond_arr2 & cond_dep2, :].iterrows():
    OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    time_flying[(OAID,DAID)] += (row['Estimated_Arrival_Time'] - row['Estimated_Departure_Time'])%1440 # 하루 최대는 1440분
    time_number[(OAID,DAID)] += 1
    
    
for key in time_flying.keys():
    time_flying[key] /= time_number[key]

In [53]:
for index, row in train.loc[train['Estimated_Departure_Time'].isnull(),].iterrows():
    OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    train.loc[index,'Estimated_Departure_Time'] = \
        (train.loc[index]['Estimated_Arrival_Time'] - time_flying[(OAID, DAID)])%1440
    
for index, row in train.loc[train['Estimated_Arrival_Time'].isnull(),].iterrows():
    OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    train.loc[index,'Estimated_Arrival_Time'] = \
        (train.loc[index]['Estimated_Departure_Time'] + time_flying[(OAID, DAID)])%1440

In [54]:
# (Test Data Only)
# 둘 다 없으면 최빈값으로 대체
cond_1 = test['Estimated_Departure_Time'].isnull()
cond_2 = test['Estimated_Arrival_Time'].isnull()

mode = test['Estimated_Departure_Time'].mode()[0]
mode2 = test['Estimated_Arrival_Time'].mode()[0]
test.loc[cond_1&cond_2, ['Estimated_Departure_Time', 'Estimated_Arrival_Time']] = mode, mode2


# Departure만 없을 때,
for index, row in test.loc[test['Estimated_Departure_Time'].isnull(),].iterrows():
    OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    test.loc[index,'Estimated_Departure_Time'] = \
        (test.loc[index]['Estimated_Arrival_Time'] - time_flying[(OAID, DAID)])%1440
    

# Arrival만 없을 때,
for index, row in test.loc[test['Estimated_Arrival_Time'].isnull(),].iterrows():
    OAID, DAID = row['Origin_Airport_ID'], row['Destination_Airport_ID']
    test.loc[index,'Estimated_Arrival_Time'] = \
        (test.loc[index]['Estimated_Departure_Time'] + time_flying[(OAID, DAID)])%1440

    
# 모두 int로 바꾼다.
estimated_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
train = train.astype({'Estimated_Departure_Time':int, 'Estimated_Arrival_Time':int})
test = test.astype({'Estimated_Departure_Time':int, 'Estimated_Arrival_Time':int})
for ET in estimated_times:
    train.loc[train[ET] == 1440, ET] = 0
    test.loc[test[ET] == 1440, ET] = 0


print("EDT, EAT Done.")

EDT, EAT Done.


In [55]:
# EDT, EAT 48개의 bins에 담으면 된다. 1440(60*24) 계니까, 48씩 끊어서 하면 될 듯
estimate_times = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
names = {'Estimated_Departure_Time':'EDT', 'Estimated_Arrival_Time':'EAT'}
for ET in estimated_times:
    for i in range(48):
        train.loc[train[ET].between(i*30, (i+1)*30, 'left'), names[ET]] = i
        test.loc[test[ET].between(i*30, (i+1)*30, 'left'), names[ET]] = i

train = train.astype({'EDT':int, 'EAT':int})
test = test.astype({'EDT':int, 'EAT':int})

train = train.drop(['Estimated_Departure_Time', 'Estimated_Arrival_Time'], axis=1)
test = test.drop(['Estimated_Departure_Time', 'Estimated_Arrival_Time'], axis=1)

print("EDT, EAT Done.")

EDT, EAT Done.


In [56]:
for i in range(51):
    train.loc[train['Distance'].between(i*100, (i+1)*100, 'left'), 'Distance'] = i
    test.loc[test['Distance'].between(i*100, (i+1)*100, 'left'), 'Distance'] = i

train = train.astype({'Distance':int})
test = test.astype({'Distance':int})

print("distance Done.")

train = train.astype({'Carrier_ID(DOT)':int})
test = test.astype({'Carrier_ID(DOT)':int})

train = train.astype({'EDT':object, 'EAT':object, 'Distance':object, 'Origin_Airport_ID':object, \
                     'Destination_Airport_ID':object, 'Carrier_ID(DOT)':object})
test = test.astype({'EDT':object, 'EAT':object, 'Distance':object, 'Origin_Airport_ID':object, \
                     'Destination_Airport_ID':object, 'Carrier_ID(DOT)':object})

print("CID Done.")

distance Done.
CID Done.


In [57]:
train = train.dropna()

column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))

train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

print('Training Prepared.')

Training Prepared.


In [58]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [66]:
train.head()

Unnamed: 0,ID,Origin_Airport_ID,Destination_Airport_ID,Distance,Carrier_ID(DOT),Tail_Number,Delay,Day,EDT,EAT,Delay_num
5,TRAIN_000005,11618,11278,1,20452,N657RW,Not_Delayed,104,31,34,0
6,TRAIN_000006,11618,10721,2,19977,N66825,Not_Delayed,20,35,38,0
8,TRAIN_000008,10821,11057,3,19393,N765SW,Not_Delayed,165,28,31,0
10,TRAIN_000010,11278,14122,2,20452,N119HQ,Delayed,226,35,37,1
12,TRAIN_000012,11042,11292,12,19393,N8696E,Not_Delayed,12,20,23,0


In [77]:
X = train.drop(columns=['ID', 'Delay', 'Delay_num'], axis = 1)
y = train['Delay_num']
test = test.drop(columns=['ID'])
X_train, X_valid , y_train, y_valid = train_test_split(X, y, test_size = 0.2, stratify = y)

KeyError: ignored

In [76]:
qual_col = ['Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(X_train[i])
    X_train[i]=le.transform(X_train[i])

    for label in np.unique(X_valid[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    X_valid[i]=le.transform(X_valid[i])

    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


.

In [71]:
# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
train_x, val_x, train_y, val_y = X_train, X_valid , y_train, y_valid 
test_x = test

# Normalize numerical features
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)
test_x = scaler.transform(test_x)

In [74]:
model = XGBClassifier(
                      booster = 'gbtree',
                      scale_pos_weight=5,
                      learning_rate=0.01,  
                      colsample_bytree = 0.7,
                      subsample = 0.5,
                      max_delta_step = 3,
                      reg_lambda = 2,
                     objective='binary:logistic',
                      n_estimators=818, 
                      max_depth=8,
                     )
# %%time
eval_set = [(X_valid, y_valid)]
eval_metric = ["logloss"]
model.fit(X_train, y_train,early_stopping_rounds=50, eval_metric=eval_metric, eval_set=eval_set)



ValueError: ignored

In [72]:
# from sklearn.metrics import log_loss

# # Cross-validation with StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Model and hyperparameter tuning using GridSearchCV
# model = XGBClassifier(random_state=42)

# param_grid = {
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5],
#     'n_estimators': [100, 200],
# }

# grid = GridSearchCV(model,
#                     param_grid,
#                     cv=cv,
#                     scoring='neg_log_loss',  # Change scoring to 'neg_log_loss'
#                     n_jobs=-1,
#                     verbose=1)

# grid.fit(train_x, train_y)

# best_model = grid.best_estimator_

# # Model evaluation
# val_y_pred = best_model.predict_proba(val_x)
# logloss = log_loss(val_y, val_y_pred)  # Compute log loss

# print(f'Log Loss: {logloss}')

# # Model prediction
# y_pred = best_model.predict_proba(test_x)
# submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
# submission.to_csv('optimized_submission_log_loss.csv', index=True)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Log Loss: 0.4369336268294423


In [73]:
# # 교육 데이터는 교육 및 검증 세트로 분할되고 수치 기능은 StandardScaler를 사용하여 정규화됩니다.
# # 모델은 GridSearchCV와 5겹 교차 검증을 사용하여 수행되는 하이퍼파라미터 튜닝과 함께 XGBClassifier를 사용하여 훈련됩니다.
# # Split the training dataset into a training set and a validation set
# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# # Normalize numerical features
# scaler = StandardScaler()
# train_x = scaler.fit_transform(train_x)
# val_x = scaler.transform(val_x)
# test_x = scaler.transform(test_x)

# # Cross-validation with StratifiedKFold
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Model and hyperparameter tuning using GridSearchCV
# model = XGBClassifier(random_state=42)

# param_grid = {
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5],
#     'n_estimators': [100, 200],
# }

# grid = GridSearchCV(model,
#                     param_grid,
#                     cv=cv,
#                     scoring='accuracy',
#                     n_jobs=-1,
#                     verbose=1)

# grid.fit(train_x, train_y)

# best_model = grid.best_estimator_

# # Model evaluation
# val_y_pred = best_model.predict(val_x)
# accuracy = accuracy_score(val_y, val_y_pred)
# f1 = f1_score(val_y, val_y_pred, average='weighted')
# precision = precision_score(val_y, val_y_pred, average='weighted')
# recall = recall_score(val_y, val_y_pred, average='weighted')

# print(f'Accuracy: {accuracy}')
# print(f'F1 Score: {f1}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')

# # 하이퍼파라미터 튜닝 결과를 바탕으로 최적의 모델을 선택하고 테스트 세트의 목표 변수를 예측하는 데 사용합니다.
# # Model prediction
# y_pred = best_model.predict_proba(test_x)
# submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
# submission.to_csv('optimized_submission.csv', index=True)