In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lime
import shap
import imblearn
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, f1_score, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from timeit import default_timer as timer

In [8]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [9]:
# Build the paths with os.path.join instead of concatenating "\..." literals:
# "\_" and "\9" are invalid escape sequences (a warning today, an error in
# future Python versions) and a hard-coded backslash only works on Windows.
data_path = os.path.join(os.path.dirname(os.getcwd()), "_datasets")
df = pd.read_csv(os.path.join(data_path, "939775908_T_ONTIME_REPORTING.csv"))
# Preview only the first rows rather than rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mdro\\Desktop\\programming\\git\\Data_Science\\_datasets\\939775908_T_ONTIME_REPORTING.csv'

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appends a stray "None" to the output.
df.info()

In [None]:
# Per column: True if it contains at least one missing value.
print(df.isnull().any())

In [None]:
# Column labels of the frame as loaded, before any columns are dropped.
df.columns

In [None]:
# Columns removed before modelling. The original list named 'TAXI_OUT' twice;
# each label is listed once here.
columns_to_drop = [
    'ARR_DEL15', 'DEP_DEL15', 'DEP_DELAY_GROUP', 'TAXI_OUT', 'WHEELS_OFF',
    'ORIGIN', 'WHEELS_ON', 'YEAR', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_DELAY',
    'ARR_TIME', 'DIVERTED', 'CRS_DEP_TIME', 'CRS_ELAPSED_TIME',
    'ACTUAL_ELAPSED_TIME', 'QUARTER', 'DEP_TIME', 'DIV_AIRPORT_LANDINGS',
    'DEST', 'OP_UNIQUE_CARRIER', 'CANCELLATION_CODE',
]
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated and is rejected by pandas >= 2.0.
df = df.drop(columns=columns_to_drop)

In [None]:
# Parse the flight date, then in one chain: index by it, order rows by that
# index level, discard columns that are entirely empty and zero-fill the
# remaining missing values.
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df = (
    df.set_index('FL_DATE')
      .sort_values(by='FL_DATE')
      .dropna(how='all', axis='columns')
      .fillna(0)
)
df

In [None]:
def plot_heatmap(data):
    """Draw a lower-triangle correlation heatmap of ``data`` and return its figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations (``data.corr()``) are plotted.

    Returns
    -------
    The figure object obtained from the seaborn heatmap via ``get_figure()``.
    """
    # Compute the correlation matrix once and reuse it for the mask and the plot.
    corr = data.corr()

    # Boolean mask that hides the upper triangle and the diagonal. It is built
    # from the matrix shape rather than from the correlation values themselves,
    # so a cell whose correlation is exactly 0 is still masked.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    plt.figure(figsize=(16, 16))
    heatmap = sns.heatmap(corr, annot=True, annot_kws={"fontsize": 8}, mask=mask, linewidths=0.2, square=True)

    return heatmap.get_figure()

In [None]:
# NOTE: `corr` holds the figure returned by plot_heatmap, not the correlation matrix.
corr = plot_heatmap(df)
corr

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Count of each CANCELLED value before any resampling is applied.
df['CANCELLED'].value_counts()

In [None]:
# Split features / target. `columns=` replaces the positional axis argument,
# which pandas >= 2.0 no longer accepts.
X = df.drop(columns='CANCELLED')
y = df['CANCELLED']

# Two-step rebalancing: first undersample, then oversample to a 1:1 ratio.
# random_state is fixed so that a Restart & Run All reproduces the same
# resampled data set (the samplers pick rows at random).
under = RandomUnderSampler(sampling_strategy=0.33, random_state=42)
X, y = under.fit_resample(X, y)
over = RandomOverSampler(sampling_strategy=1, random_state=42)
X, y = over.fit_resample(X, y)

# NOTE(review): oversampling happens before any train/test or K-fold split, so
# duplicated rows can end up on both sides of a split — confirm this is intended.
y.value_counts()

In [None]:
class EstimatorSelectionHelper:
    """Grid-search several estimators and tabulate their per-split test scores.

    ``models`` maps a name to an estimator and ``params`` maps names to the
    parameter grid passed to ``GridSearchCV``. ``params`` may contain extra
    names, but every name in ``models`` must have an entry.
    """

    def __init__(self, models, params):
        """Store the estimators and their parameter grids.

        Raises
        ------
        ValueError
            If any key of ``models`` has no entry in ``params``.
        """
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Run one ``GridSearchCV`` per estimator and keep the fitted searches.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are forwarded
        unchanged to ``GridSearchCV``. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with score stats.

        Parameters
        ----------
        sort_by : str
            Column used to order the rows, highest first.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``, ``max_score``,
            ``std_score`` followed by one column per grid parameter. The
            statistics are taken over the per-split test scores.

        Raises
        ------
        ValueError
            If no grid search has been stored yet (``fit`` was not called).
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        if not self.grid_searches:
            raise ValueError("No grid searches to summarise; call fit() first.")

        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            params = results['params']
            # n_splits_ is set when the search is fitted and is valid whether
            # `cv` was given as an int, None or a splitter object; the previous
            # range(gs.cv) only worked when `cv` was an int.
            split_scores = [
                results["split{}_test_score".format(i)]
                for i in range(gs.n_splits_)
            ]
            # Shape (n_param_combinations, n_splits): one row per combination.
            all_scores = np.column_stack(split_scores)
            for p, s in zip(params, all_scores):
                rows.append(row(name, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
# Estimators to grid-search. Only the random forest is active; the others are
# commented out.
models1 = {
        'RandomForestClassifier': RandomForestClassifier(),
#         'ExtraTreesClassifier': ExtraTreesClassifier(),
#         'GradientBoostingClassifier': GradientBoostingClassifier(),
#         'XGBClassifier': XGBClassifier()
          }

# Parameter grids keyed by estimator name. Grids for estimators that are not in
# models1 are allowed: EstimatorSelectionHelper only requires that every model
# has a grid, not the reverse.
params1 = {
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
    'XGBClassifier': {'max_depth': [3,5], 'min_child_weight' : [3, 5]}
          }

In [None]:
# Grid-search every estimator in models1, scored with F1, using two parallel jobs.
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X, y, scoring='f1', n_jobs=2)

In [None]:
# Rank parameter combinations by their best single-split test score.
helper1.score_summary(sort_by='max_score')

In [None]:
def _kfold_regression_metrics(model, X_test, y_test):
    """Return (RMSE, MAE, R2) of an already-fitted regressor on one held-out fold."""
    prediction = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    mae = mean_absolute_error(y_test, prediction)
    # Score on the held-out fold; scoring on the training fold (as before)
    # reports training fit, not generalisation.
    r2 = model.score(X_test, y_test)
    return rmse, mae, r2


def _kfold_classification_metrics(model, X_test, y_test, labels):
    """Return the classification metrics of an already-fitted binary classifier.

    The tuple order matches the 'F1', 'PRECIS', 'RECALL', 'ROC AUC', 'T POS',
    'F POS', 'F NEG', 'T NEG' columns. ``labels`` are the two class labels in
    sorted order; the second one is treated as the positive class.
    """
    prediction = model.predict(X_test)
    precis, recall, f1, _ = precision_recall_fscore_support(y_test, prediction, average='weighted')

    # ROC AUC is a ranking metric: feed it the positive-class probability when
    # the model exposes one, and fall back to hard labels otherwise.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = prediction
    roc_auc = roc_auc_score(y_test, y_score)

    # Passing `labels` pins the matrix to 2x2 even if a fold misses a class.
    # Rows are true labels, columns are predictions, so with sorted labels
    # cm[0][0] is TN and cm[1][1] is TP (the original had these two swapped).
    cm = confusion_matrix(y_test, prediction, labels=labels)
    t_neg, f_pos, f_neg, t_pos = cm[0][0], cm[0][1], cm[1][0], cm[1][1]

    return f1, precis, recall, roc_auc, t_pos, f_pos, f_neg, t_neg


def results_kfold(X, y, regression=True, estimators=None):
    """Evaluate several estimators with shuffled 5-fold CV and average the metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target aligned with ``X``.
    regression : bool, default True
        If True, report 'RMSE', 'MAE', 'R2'; otherwise report the binary
        classification columns 'F1', 'PRECIS', 'RECALL', 'ROC AUC', 'T POS',
        'F POS', 'F NEG', 'T NEG'.
    estimators : iterable of (name, estimator) pairs, optional
        Models to evaluate. When omitted, the notebook-level ``regressors`` or
        ``classifiers`` list is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per estimator name, one column per metric, each value being
        the mean over the five folds.
    """
    if estimators is None:
        # Looked up at call time, so the lists may be defined in a later cell.
        estimators = regressors if regression else classifiers

    if regression:
        columns = ['RMSE', 'MAE', 'R2']
        labels = None
    else:
        columns = ['F1', 'PRECIS', 'RECALL', 'ROC AUC', 'T POS', 'F POS', 'F NEG', 'T NEG']
        labels = np.unique(y)

    cv = KFold(n_splits=5, random_state=42, shuffle=True)

    result_dict = {}
    for name, estimator in estimators:
        fold_metrics = []
        for train_index, test_index in cv.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            fitted = estimator.fit(X_train, y_train)
            if regression:
                fold_metrics.append(_kfold_regression_metrics(fitted, X_test, y_test))
            else:
                fold_metrics.append(_kfold_classification_metrics(fitted, X_test, y_test, labels))

        # Column-wise mean over the folds.
        result_dict[name] = np.mean(fold_metrics, axis=0)

    return pd.DataFrame.from_dict(result_dict, orient='index', columns=columns)

In [None]:
# [name, estimator] pairs read by results_kfold; the name string becomes the
# row label of the results table. All estimators use default hyper-parameters.
regressors = [['DecisionTreeRegressor :', DecisionTreeRegressor()],
              ['RandomForestRegressor :', RandomForestRegressor()],
              ['ExtraTreesRegressor :', ExtraTreesRegressor()],
              ['GradientBoostingRegressor :', GradientBoostingRegressor()],
              ['XGBRegressor :', XGBRegressor()]]

classifiers = [['RandomForestClassifier :', RandomForestClassifier()],
              ['ExtraTreesClassifier :', ExtraTreesClassifier()],
              ['GradientBoostingClassifier :', GradientBoostingClassifier()],
              ['XGBClassifier :', XGBClassifier()]]

In [None]:
# `y` is the CANCELLED target and the result is named as classifier output, so
# take the classification branch; regression=True evaluated the regressors and
# reported RMSE/MAE/R2 instead.
classifier_results_kfold = results_kfold(X, y, regression=False)
classifier_results_kfold

In [None]:
# Default-parameter XGBoost classifier; fitted and explained with SHAP below.
model = XGBClassifier()

In [None]:
# Hold-out split with a fixed seed (train_test_split's default split sizes apply).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model.fit(X_train, y_train)

# SHAP values are computed for the held-out rows only (X_test); later cells
# plot `shap_values`, so the data passed alongside them should match.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

In [None]:
# Summary plot of the SHAP values against the X_test rows they were computed from.
shap.summary_plot(shap_values, X_test)

In [None]:
# `shap_values` were computed from X_test, so pass the same frame here; the
# original passed X_train, whose rows do not correspond to these SHAP values.
shap.summary_plot(shap_values, X_test, plot_type="bar")