In [4]:
from utils.train import Create
from utils.process import Format, Preprocess, Categorize, Assemble
from utils.generate import Generate
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
generate = Generate()
categorize = Categorize()
assemble = Assemble()

In [3]:
class Include(Create, Format, Preprocess):
    
    def include_feature_selection(self, model, key, model_object, X_train, X_test, y_train):
        selector_model = model[1] 
        selector_name = selector_model['feature_selection'][0]
        selector_algorithm = self.format_algorithm_string(selector_name)
        selector_parameters = selector_model['feature_selection'][1]

        print(f'Selecting features with {selector_name} for {key}')
        feature_selector = self.create_feature_selection_model(model_object, selector_algorithm, selector_parameters)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)
        X_test_selected = feature_selector.transform(X_test)
        
        return X_train_selected, X_test_selected
    
    def include_dimensionality_reduction(self, model, key, X_train, X_test, y_train):
        dimentionality_reduction_model = model[1] 
        dimentionality_reduction_name = selector_model['dimentionality_reduction'][0]
        dimentionality_reduction_algorithm = self.format_algorithm_string(selector_name)
        dimentionality_reduction_parameters = selector_model['dimentionality_reduction'][1]
        

        print(f'Reducing dimensions with {dimentionality_reduction_name} for {key}')
        dimensionality_reducer = self.create_feature_selection_model(dimentionality_reduction_algorithm, dimentionality_reduction_parameters)
        
        X_train_reduced = dimensionality_reducer.fit_transform(X_train)
        X_test_reduced = dimensionality_reducer.transform(X_test)
        
        return X_train_reduced, X_test_reduced

In [53]:
class Build(Include):
    
    def __init__(self, test_dict, feature_selection=False, dimentionality_reduction=False):
        
        self.test_dict = test_dict
        self.test_dict['predictions'] = {}
        self.test_dict['models'] = {}
        self.test_dict['X_test'] = {}
        self.feature_selection = feature_selection
        self.dimensionality_reduction = dimentionality_reduction
        
    def build_regression_models(self, models_list, dependent_variable):
              
        for key, data in self.test_dict['data'].items():
            for model in models_list:
                
                X_train, X_test, y_train, y_test = self.preprocess_test_data(data, dependent_variable)
                
                regression_model = model[0]
                model_name = regression_model['model']
                algorithm = self.format_algorithm_string(model_name)
                parameters = regression_model['parameters']
                
                regressor = self.create_regression_model(algorithm , parameters)
                
                if self.feature_selection: 
                    X_train_selected, X_test_selected = self.include_feature_selection(model, key, regressor, X_train, X_test, y_train)
                    X_train = X_train_selected
                    X_test = X_test_selected
                    
                if self.dimensionality_reduction:
                    X_train_reduced, X_test_reduce = self.include_dimensionality_reduction(model, key, X_train, X_test, y_train)
                    X_train = X_train_reduced
                    X_test = X_test_reduced

                print(f'Training regression model {model_name} for {key}')
                regressor.fit(X_train, y_train)
                print(f'Training done!')
                predictions = regressor.predict(X_test)
                print()
                
                self.test_dict['models'][key+model_name] = regressor
                self.test_dict['predictions'][key+model_name] = predictions
                self.test_dict['X_test'][key] = X_test
                
        
        self.test_dict['y_test'] = y_test

    def build_classification_models(self, models_list, dependent_variable):
              
        for key, data in self.test_dict['data'].items():
            for model in models_list:

                X_train, X_test, y_train, y_test = self.preprocess_test_data(data, dependent_variable)

                model_name = model[0]
                algorithm = self.format_algorithm_string(model_name)
                parameters = model[1]
                
                classifier = self.create_classification_model(algorithm , parameters)
                
                if self.feature_selection: 
                    X_train_selected, X_test_selected = self.include_feature_selection(model, key, classifier, X_train, X_test, y_train)
                    X_train = X_train_selected
                    X_test = X_test_selected
                    
                if self.dimensionality_reduction:
                    X_train_reduced, X_test_reduce = self.include_dimensionality_reduction(model, key, X_train, X_test, y_train)
                    X_train = X_train_reduced
                    X_test = X_test_reduced

                print(f'Training classification model {model_name} for {key}')
                classifier.fit(X_train, y_train)
                print(f'Training done!')
                predictions = classifier.predict(X_test)
                print()

                self.test_dict['models'][key+model_name] = classifier
                self.test_dict['predictions'][key+model_name] = predictions
                self.test_dict['X_test'][key] = X_test

                
        self.test_dict['y_test'] = y_test

In [46]:
models = [[{'model': 'LR1', 'parameters': {}}, {'feature_selection': ['SFM1', {}], 'dimentionality_reduction': ['PCA1', {}]}],
          [{'model': 'RFR1', 'parameters': {'n_estimators': 20, 'criterion':'squared_error'}}, 
           {'feature_selection': ['RFE1', {'n_features_to_select': 3, 'step':1}]}]
         ]   

In [None]:
if len(models[0]) == 2:
    if 'feature_selection' in models[0][1]:
        
    if 'dimentionality_reduction' in models[0][1]:

In [6]:
df = pd.read_csv('./data/Clean_Dataset.csv')

df['path'] = df['source_city'] + ' ' + df['destination_city']
df['time'] = df['departure_time'] + ' ' + df['arrival_time']
df.replace({'Economy':0,'Business':1}, inplace=True)
df = df.drop(['Unnamed: 0','flight','source_city','destination_city','departure_time','arrival_time'], axis=1)

days_left_ranges = generate.generate_numerical_ranges(df.days_left, [2,5,10,18])
days_left_dict = categorize.categorize_numerical_variable(df.days_left, days_left_ranges)
df['days_left_category'] = df['days_left'].map(days_left_dict)

start = df.groupby('path').mean()['price'].min()
finish = df.groupby('path').mean()['price'].max()
path_ranges = generate.generate_categorical_ranges(3, start, finish)


path_group = df.groupby('path').mean()['price']
path_dict = categorize.categorize_categorical_variable(path_group, path_ranges)
df['path_category'] = df['path'].map(path_dict)

duration_ranges = generate.generate_numerical_ranges(df.duration, [5,18,28])
duration_dict = categorize.categorize_numerical_variable(df.duration, duration_ranges)
df['duration_category']=df['duration'].map(duration_dict)

start = df.groupby('time').mean()['price'].min()
finish = df.groupby('time').mean()['price'].max()
time_ranges = generate.generate_categorical_ranges(3, start, finish)


time_group = df.groupby('time').mean()['price']
time_dict = categorize.categorize_categorical_variable(time_group, time_ranges)
df['time_category'] = df['time'].map(time_dict)

In [7]:
df.drop(['days_left','path','duration','time'], axis=1, inplace=True)

In [47]:
all_regression_test_dict = {}

In [48]:
all_regression_test_dict['data'] = {'A1':df}

In [49]:
build = Build(all_regression_test_dict, feature_selection=True)

In [50]:
build.build_regression_models(models, 'price')

Selecting features with SFM1 for A1
Training regression model LR1 for A1
Training done!

Selecting features with RFE1 for A1
Training regression model RFR1 for A1
Training done!



In [38]:
all_test_tables_dict = assemble.assemble_test_tables( all_regression_test_dict)

In [40]:
all_regression_test_dict['test_tables'] = all_test_tables_dict
assemble.assemble_error_values(all_regression_test_dict)

Unnamed: 0,MEPE,MPE,MEAE,MAE,MSE,RMSE,NRMSE,STD
A1LR1,20.776,35.326,2158.929,4437.361,56711490.0,7530.703,0.066291,22612.54
A1RFR1,25.525,37.096,2501.62,4038.184,37304810.0,6107.766,0.053766,22612.54


In [169]:
build = Build(all_regression_test_dict)

In [170]:
build.build_regression_models(models, 'price')

Training regression model LR1 for A1
Training done!

Training regression model RFR1 for A1
Training done!



In [171]:
all_test_tables_dict = assemble.assemble_test_tables( all_regression_test_dict)
all_regression_test_dict['test_tables'] = all_test_tables_dict
assemble.assemble_error_values(all_regression_test_dict)

Unnamed: 0,MEPE,MPE,MEAE,MAE,MSE,RMSE,NRMSE,STD
A1LR1,25.709,45.028,2964.628,4425.619,43763660.0,6615.411,0.058234,22612.54
A1RFR1,14.546,21.544,1419.328,2942.615,25331770.0,5033.067,0.044305,22612.54


In [51]:
def build_feature_selected_regression_models(self, models_list, dependent_variable):

    for key, data in self.test_dict['data'].items():

        for model in models_list:

            model_name = model[0]
            parameters = model[1]
            algorithm = self.format_algorithm_string(model_name)
            selector_name = model[2]
            selector_algorithm = self.format_algorithm_string(selector_name)
            selector_parameters = model[3]

            X, y, _ = self.preprocess_data(data, dependent_variable)

            regressor = self.create_regression_model(algorithm , parameters)

            print(f'Selecting features with {selector_name} for {key}')
            feature_selector = self.create_feature_selection_model(regressor, selector_algorithm, selector_parameters)

            X_selected = feature_selector.fit_transform(X, y)

            X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=0)

            print(f'Training regression model {model_name} for {key}')
            regressor.fit(X_train, y_train)
            print(f'Training done!')
            predictions = regressor.predict(X_test)
            print()

            self.test_dict['models'][key+model_name+selector_name] = regressor
            self.test_dict['predictions'][key+model_name] = predictions
            self.test_dict['X_test'][key] = X_test

    self.test_dict['y_test'] = y_test

# Before - After Split

In [200]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [255]:
X = df.drop('price', axis=1)
y = df.price

X = pd.get_dummies(X, drop_first=True)

X1, X2, y1, y2 = train_test_split(X, y, test_size=0.2)
rfr = RandomForestRegressor()

In [248]:
len(df) *.8

240122.40000000002

## Before

In [249]:
feature_selector = SelectFromModel(rfr)
X1_selected = feature_selector.fit_transform(X1, y1)

In [250]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_selected, y1, test_size=0.2, random_state=0)

In [251]:
rfr = RandomForestRegressor(n_estimators=50, random_state=0)
rfr.fit(X1_train, y1_train)

In [252]:
X2_selected = feature_selector.transform(X2)

In [253]:
preds = rfr.predict(X2_selected)

In [254]:
mse = round(mean_squared_error(y2, preds), 3)
rmse = round(np.sqrt(mse), 3)
print('RMSE: ',rmse)

RMSE:  7892.805


## After

In [256]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

In [257]:
feature_selector = SelectFromModel(rfr)
X1_train_selected = feature_selector.fit_transform(X1_train, y1_train)

In [258]:
rfr = RandomForestRegressor(n_estimators=50, random_state=0)
rfr.fit(X1_train_selected, y1_train)

In [259]:
X2_selected = feature_selector.transform(X2)

In [260]:
preds = rfr.predict(X2_selected)

In [261]:
mse = round(mean_squared_error(y2, preds), 3)
rmse = round(np.sqrt(mse), 3)
print('RMSE: ',rmse)

RMSE:  7879.047
