In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 16 20:09:49 2022

@author: asaines
"""
# Execute the companion notebook in this session (IPython magic, so this line
# only works under IPython/Jupyter).
# NOTE(review): featureEngineering, missingness and encoders are used below but
# are neither defined nor imported in this file -- presumably they come from
# functions.ipynb; confirm.
%run functions.ipynb
# Seed value available to any step that accepts a `random_state`.
seed = 123
#Import package / module for data
import pandas as pd
from seaborn import load_dataset
import missingno as msno
#Importing modules for Feature Engineering and modeling
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import model_selection, linear_model, preprocessing, ensemble, neighbors, tree, impute, svm, metrics, decomposition
from sklearn.metrics import accuracy_score,precision_score
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE

# Load the data sets.
# The airport and airline tables are read in full.
df_airports = pd.read_csv('airports.csv')
df_airlines = pd.read_csv('airlines.csv')
# Only 1% of the rows of the two flight tables is kept. The sample is seeded
# so that repeated runs work on the same rows; without `random_state` every
# run would draw a different subset and the scores reported further down
# would not be reproducible.
df_hdata = pd.read_csv('historic_data.csv', low_memory=False).sample(
    frac=0.01, random_state=seed
)
df_future = pd.read_csv('future_data.csv', low_memory=False).sample(
    frac=0.01, random_state=seed
)
# Add new features to the historic and future flight tables.
# NOTE(review): featureEngineering is not defined in this file; it is called
# with the two flight samples and the airport table and returns two objects.
feat_hd, feat_fd = featureEngineering(df_hdata, df_future, df_airports)
### MODEL: DELAY
# Dealing with missing values.
# Keep only flights that were not cancelled and that have a recorded arrival
# delay. Both conditions are combined into a single row mask, and the
# cancellation columns are removed with a non-inplace `drop`, so `delay_hd` is
# a new frame rather than an in-place modified slice of `feat_hd` (modifying a
# slice in place triggers pandas' SettingWithCopyWarning).
rows_to_keep = (feat_hd.CANCELLED == 0) & feat_hd.arrival_delay.notna()
delay_hd = feat_hd.loc[rows_to_keep].drop(
    columns=['CANCELLED', 'CANCELLATION_REASON']
)
# Check whether we have to deal with missing information in our dataset.
msno.matrix(delay_hd)
# Provide the numerical representation of missingness.
# NOTE(review): missingness is not defined in this file; its result is used
# below as a table with a `percentage` column.
miss = missingness(delay_hd)
# Rows with a non-zero missing percentage (only displayed when this is the
# last expression of a notebook cell).
miss[miss.percentage > 0]
# Based on this output we have an understanding of missingness,
# as well as an overview of the features for which we may consider imputation
# instead of dropping.

# Partition data.
# ARRIVAL_DELAY_15M is removed before the split. The drop is not done in place:
# `delay_hd` was produced by boolean indexing, and modifying such a frame in
# place triggers pandas' SettingWithCopyWarning.
delay_hd = delay_hd.drop(columns=['ARRIVAL_DELAY_15M'])
# 70/30 split, stratified on the target so both parts keep the same share of
# delayed departures.
x_train_feat, x_test_feat = train_test_split(
    delay_hd,
    test_size=0.3,
    stratify=delay_hd['DEPARTURE_DELAY_15M'],
    random_state=1,
)
# NOTE(review): the target column DEPARTURE_DELAY_15M is still present in
# x_train_feat / x_test_feat at this point -- confirm that the encoding step
# removes it before the features reach the model.
y_train = x_train_feat['DEPARTURE_DELAY_15M']
y_test = x_test_feat['DEPARTURE_DELAY_15M']

############
# ENCODING #
############
# Handling of missing values and encoding via column transformers.
# NOTE(review): encoders is not defined in this file; it receives the train and
# test feature frames and returns the two matrices used for fitting and scoring
# below. Presumably it fits its transformers on the training part only --
# confirm.
x_train,x_test= encoders(x_train_feat, x_test_feat)


# Modelling departure delays using a pipeline and grid search.
# Imbalanced data: the imblearn pipeline is used together with SMOTE
# over-sampling, followed by standardisation and a logistic regression.
# NOTE(review): SMOTE runs before the scaler, so the synthetic samples are
# generated on unscaled features -- confirm this order is intended.
pipe_lr = imbpipeline(steps=[
    ['over', SMOTE(random_state=11)],
    ['scaler', StandardScaler()],
    ['clf', LogisticRegression(random_state=11, max_iter=1000)],
])

# Construct grid_search
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)
# Number of parallel jobs for the search (-1 = use all available processors).
jobs = -1

# Candidate values for the regularisation parameter C of the classifier step.
param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid,
    scoring='precision',
    cv=stratified_kfold,
    n_jobs=jobs,  # previously hard-coded, which left `jobs` unused
)

grid_search.fit(x_train, y_train)
# Best mean cross-validated precision over the grid.
cv_score = grid_search.best_score_
# Precision of the refitted best pipeline on the held-out test set.
test_score = grid_search.score(x_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
# Class predictions of the best pipeline for the test set.
y_test_predict = grid_search.predict(x_test)