# Converting the preprocessing code into a scikit-learn Pipeline

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [37]:
#importing python libraries
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.pipeline import FeatureUnion
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, ADASYN

In [5]:
# Relative location of the Dataset B export and the file to load from it.
dataset_b_dir = '../raw_data/Dataset_B_FORWW_Kaggle'
csv_filename = 'All_Airports.csv'

# The file is read as a zip archive even though its name ends in .csv.
csv_path = os.path.join(dataset_b_dir, csv_filename)
df = pd.read_csv(csv_path, compression='zip')

# Handling NaNs & Renaming Columns

In [7]:
# Rename the raw 'WeatherDelay' column to 'Weather_Delay_Length'.
column_renames = {'WeatherDelay': 'Weather_Delay_Length'}
df = df.rename(columns=column_renames)

In [10]:
# Parse 'Time' into pandas datetimes so the .dt accessor can be used on it.
parsed_time = pd.to_datetime(df['Time'])
df['Time'] = parsed_time

In [18]:
# Derive calendar features from the parsed 'Time' column.
time_parts = df['Time'].dt
df['Hour'] = time_parts.hour
df['Day_Of_Week'] = time_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['Month'] = time_parts.month

In [12]:

# Binary target: True for rows whose weather delay length is strictly positive.
# A single vectorised comparison replaces the row-wise apply(), which invoked a
# Python lambda once per row. NaN delay lengths compare as False, exactly as the
# lambda's `> 0` test did.
df['Weather_Delayed'] = df['Weather_Delay_Length'] > 0

In [13]:
# Print a summary of the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15144514 entries, 0 to 15144513
Data columns (total 23 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   Time                    datetime64[ns]
 1   Origin                  object        
 2   Dest                    object        
 3   Carrier                 object        
 4   Cancelled               bool          
 5   CancellationReason      object        
 6   Delayed                 bool          
 7   DepDelayMinutes         float64       
 8   CarrierDelay            float64       
 9   Weather_Delay_Length    float64       
 10  NASDelay                float64       
 11  SecurityDelay           float64       
 12  LateAircraftDelay       float64       
 13  Temperature             float64       
 14  Feels_Like_Temperature  float64       
 15  Altimeter_Pressure      float64       
 16  Sea_Level_Pressure      float64       
 17  Visibility              float64       
 18  

In [30]:
# Column selectors; each one picks columns by dtype from the frame it is applied to.
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object', 'bool'])
all_col = make_column_selector(dtype_include=['object', 'bool', 'float64', 'datetime64'])
# NOTE(review): the datetime 'Time' column and (presumably integer) 'Hour',
# 'Day_Of_Week' and 'Month' columns match neither num_col nor cat_col, so they are
# neither scaled nor encoded -- confirm that is intended.

# Numeric columns: replace NaN with 0.0, then standardise.
num_transformer = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.0),
    StandardScaler(),
)

# Categorical columns: dense one-hot encoding; drop='if_binary' keeps a single
# column for features that have exactly two categories.
# handle_unknown='ignore' encodes a category that was not seen during fit as all
# zeros instead of raising, so transforming held-out data cannot fail on a value
# that is absent from the training split.
cat_transformer = OneHotEncoder(sparse_output=False, drop='if_binary', handle_unknown='ignore')

In [31]:
# Pair each transformer with the selector for the columns it should handle;
# columns matched by neither selector are passed through unchanged.
column_steps = [
    (num_transformer, num_col),
    (cat_transformer, cat_col),
]
preproc_pipe = make_column_transformer(*column_steps, remainder='passthrough')

In [29]:
# Display the assembled preprocessing transformer.
preproc_pipe

In [23]:
# 'Weather_Delayed' is the target; it is derived from 'Weather_Delay_Length',
# so both columns are removed from the feature matrix.
target_col = 'Weather_Delayed'
X = df.drop(columns=['Weather_Delay_Length', target_col])
y = df[target_col]

In [25]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit the preprocessing on the training split only, then reuse it for the test split.
X_train_array = preproc_pipe.fit_transform(X_train)
X_test_array = preproc_pipe.transform(X_test)

# Output column names are fixed once the transformer is fitted, so look them up once.
feature_names = preproc_pipe.get_feature_names_out()

# Carry over the original row labels so the frames stay aligned with
# y_train / y_test; without index= each frame would get a fresh RangeIndex.
X_train_preproc = pd.DataFrame(X_train_array, columns=feature_names, index=X_train.index)
X_test_preproc = pd.DataFrame(X_test_array, columns=feature_names, index=X_test.index)


In [35]:


# make_column_transformer expects (transformer, columns) pairs -- the same order
# that preproc_pipe uses.
# NOTE(review): all_col also selects object and datetime columns, which PCA
# presumably cannot consume directly; PCA is likely meant to run on the
# preprocessed output instead -- confirm before fitting.
pca_pipe = make_column_transformer(
    (PCA(n_components=10), all_col)
)

pca_pipe

In [38]:
# make_column_transformer expects (transformer, columns) pairs -- the same order
# that preproc_pipe uses.
# NOTE(review): SMOTE is an imbalanced-learn sampler (it resamples through
# fit_resample rather than transform), so a ColumnTransformer will presumably
# reject it when fitted. Consider imblearn.pipeline, or calling fit_resample on
# the training split directly -- confirm before relying on smote_pipe.
smote_pipe = make_column_transformer(
    (SMOTE(random_state=42), all_col)
)

smote_pipe