In [3]:
# Render matplotlib figures inline in the cell output.
# NOTE(review): no plotting call appears in the cells visible here — confirm this magic is still needed.
%matplotlib inline

In [4]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.0 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from category_encoders.target_encoder import TargetEncoder


In [7]:
# The CSV has to be uploaded to the working directory before this cell is run.
csv_path = 'flight_delays_train.csv'
df = pd.read_csv(csv_path)

# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head()

(100000, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [8]:

# Baseline feature matrix: only the two numeric columns.
X = df[['Distance', 'DepTime']].values
# Binary target: 'Y' -> 1, 'N' -> 0.
y = df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

# Hold out 10% as the test set, then carve 20% of the remainder out as a
# validation set. Both splits are stratified on the target, and random_state
# is fixed so that re-running the notebook reproduces the same partitions
# (and therefore the same scores).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

## XGBoost and Catboost (default)

In [9]:
# Baseline: XGBoost with default hyperparameters.
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)

# ROC AUC on the test split, scored with the predicted probability of class 1.
xgb_test_proba = model_xgb.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, xgb_test_proba)

0.696684364827947

In [10]:
# Baseline: CatBoost with default hyperparameters; verbose=False silences the
# per-iteration training log.
model_cat = CatBoostClassifier()
model_cat.fit(X_train, y_train, verbose=False)

# ROC AUC on the test split, scored with the predicted probability of class 1.
cat_test_proba = model_cat.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, cat_test_proba)

0.7008986308209054

## Предобработка всех категориальных переменных и обучение XGBoost и Catboost

In [11]:
# Show each column's dtype and non-null count to see which columns are
# non-numeric and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  object
 1   DayofMonth         100000 non-null  object
 2   DayOfWeek          100000 non-null  object
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [12]:
# Drop exact duplicate rows (drop_duplicates is idempotent, so re-running is safe).
df = df.drop_duplicates()
columns = np.array(df.columns)
numeric_columns = list(df._get_numeric_data().columns)
# Every non-numeric column is treated as categorical; this includes the target column.
categor_columns = set(columns) - set(numeric_columns)
print(categor_columns)

# Quick check for columns dominated by a single value (more than 90% of rows),
# plus the number of distinct values per categorical column.
# Iterate in sorted order: the iteration order of a set of strings is not
# stable between runs, which would shuffle the printed report.
for col in sorted(categor_columns):
    value_shares = df[col].value_counts(normalize=True)  # computed once per column
    if value_shares.max() > 0.9:
        print(value_shares)
    # nunique() ignores NaN; df.info() above reports no missing values in these columns.
    print(f"Количество уникальных значений в столбце {col}: {df[col].nunique()}")

{'Dest', 'DayofMonth', 'DayOfWeek', 'Month', 'Origin', 'dep_delayed_15min', 'UniqueCarrier'}
Количество уникальных значений в столбце Dest: 289
Количество уникальных значений в столбце DayofMonth: 31
Количество уникальных значений в столбце DayOfWeek: 7
Количество уникальных значений в столбце Month: 12
Количество уникальных значений в столбце Origin: 289
Количество уникальных значений в столбце dep_delayed_15min: 2
Количество уникальных значений в столбце UniqueCarrier: 22


In [13]:
# Work on a copy so the raw frame `df` stays untouched for later cells.
df_transformed = df.copy()

# Binary target: 'Y' -> 1, 'N' -> 0.
df_transformed['dep_delayed_15min'] = df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

# Calendar columns are stored as strings such as 'c-8'; keep only the number.
# A raw string avoids the invalid-escape warning for '\d', and expand=False
# makes str.extract return a Series instead of a one-column DataFrame.
for col in ['DayofMonth', 'DayOfWeek', 'Month']:
    df_transformed[col] = df_transformed[col].str.extract(r'(\d+)', expand=False).astype(int)

# Replace each high-cardinality categorical column with its target encoding.
# NOTE(review): the encoder is fitted on ALL rows, before the train/test split
# performed in the next cell, so target information from the future test rows
# leaks into the features. Fit the encoder on the training rows only and
# transform the test rows with it to get an unbiased score.
for col in ['Dest', 'Origin', 'UniqueCarrier']:
    te = TargetEncoder()
    df_transformed[col] = te.fit_transform(df_transformed[col], df_transformed.dep_delayed_15min)



In [14]:
# Feature matrix and target taken from the encoded frame.
X_transformed = df_transformed.drop(['dep_delayed_15min'], axis=1).values
y_transformed = df_transformed['dep_delayed_15min'].values

# Stratify on y_transformed, i.e. the labels that are actually being split.
# The global `y` comes from an earlier cell and was built before
# drop_duplicates(), so it is not guaranteed to match these rows (a length
# mismatch makes train_test_split raise). random_state is fixed so the split,
# and the score computed from it, are reproducible.
X_train_transformed, X_test_transformed, y_train_transformed, y_test_transformed = train_test_split(
    X_transformed, y_transformed, test_size=0.1, stratify=y_transformed, random_state=42
)

### XGBoost

In [15]:
# XGBoost with default hyperparameters, trained on the encoded feature set.
model_xgb = XGBClassifier()
model_xgb.fit(X_train_transformed, y_train_transformed)

# ROC AUC on the held-out part, scored with the predicted probability of class 1.
xgb_auc = roc_auc_score(y_test_transformed, model_xgb.predict_proba(X_test_transformed)[:, 1])
print(f'ROC AUC score for XGBClassifier: {xgb_auc}')

ROC AUC score for XGBClassifier: 0.7121358949953498


### Catboost

In [16]:
# CatBoost receives the raw (un-encoded) frame; the categorical columns are
# declared through `cat_features` instead of being encoded beforehand.
X = df.drop(['dep_delayed_15min'], axis=1)
# Binary target: 'Y' -> 1, 'N' -> 0.
y = df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

# Every non-numeric feature column is treated as categorical.
cat_features = set(X.columns) - set(X._get_numeric_data().columns)

# Stratified 90/10 split; random_state is fixed so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

# Pass the categorical columns as a sorted list so their order is stable between runs.
model_cat = CatBoostClassifier(cat_features=sorted(cat_features))
model_cat.fit(X_train, y_train, verbose=False)

# ROC AUC on the test split, scored with the predicted probability of class 1.
cat_auc = roc_auc_score(y_test, model_cat.predict_proba(X_test)[:, 1])
print(f'ROC AUC score for Catboost: {cat_auc}')

ROC AUC score for Catboost: 0.7751500767055834
