In [1]:
from utils.train import Create, Build
from utils.process import Format, Preprocess, Categorize, Assemble
from utils.generate import Generate
from utils.test import Test
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Shared helper instances from the project's utils package, reused by the
# regression and classification sections of this notebook.
generate = Generate()      # used to build range definitions for feature bucketing
categorize = Categorize()  # used to turn those ranges into value -> category mappings
assemble = Assemble()      # used to assemble test tables, error values and reports
test = Test()  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Regression

In [8]:
# Regression pipelines to train. Each entry is a two-item list:
#   [model spec, preprocessing spec]
# where the preprocessing spec names a feature-selection step and a
# dimensionality-reduction step, each given as [identifier, parameters].
models = [
    [
        {'model': 'LR1', 'parameters': {}},
        {
            'feature_selection': ['SFM1', {}],
            'dimensionality_reduction': ['PCA1', {}],
        },
    ],
    [
        {
            'model': 'RFR1',
            'parameters': {'n_estimators': 20, 'criterion': 'squared_error'},
        },
        {
            'feature_selection': ['RFE1', {'n_features_to_select': 3, 'step': 1}],
            'dimensionality_reduction': ['PCA1', {}],
        },
    ],
]

In [9]:
# Load the raw dataset and derive the categorical features used by the
# regression models. The cell starts from the CSV every time, so re-running it
# always yields the same `df`.
df = pd.read_csv('./data/Clean_Dataset.csv')

# Composite keys: route (source + destination) and schedule (departure + arrival).
df['path'] = df['source_city'] + ' ' + df['destination_city']
df['time'] = df['departure_time'] + ' ' + df['arrival_time']

# Encode the Economy/Business labels as 0/1 wherever they occur in the frame.
df = df.replace({'Economy': 0, 'Business': 1})
df = df.drop(
    columns=['Unnamed: 0', 'flight', 'source_city', 'destination_city',
             'departure_time', 'arrival_time']
)

# --- days_left -> days_left_category ---------------------------------------
# [2, 5, 10, 18] is handed to the project helper; presumably the cut points
# between categories — TODO confirm against utils.generate.
days_left_ranges = generate.generate_numerical_ranges(df.days_left, [2, 5, 10, 18])
days_left_dict = categorize.categorize_numerical_variable(df.days_left, days_left_ranges)
df['days_left_category'] = df['days_left'].map(days_left_dict)

# --- path -> path_category --------------------------------------------------
# Select 'price' BEFORE aggregating. `df.groupby('path').mean()` would try to
# average every column, including the string column 'time', which raises a
# TypeError on pandas >= 2.0 (and silently dropped the column on older
# versions). Computing the series once also avoids three identical group-bys.
path_group = df.groupby('path')['price'].mean()
start = path_group.min()
finish = path_group.max()
path_ranges = generate.generate_categorical_ranges(3, start, finish)
path_dict = categorize.categorize_categorical_variable(path_group, path_ranges)
df['path_category'] = df['path'].map(path_dict)

# --- duration -> duration_category -----------------------------------------
duration_ranges = generate.generate_numerical_ranges(df.duration, [5, 18, 28])
duration_dict = categorize.categorize_numerical_variable(df.duration, duration_ranges)
df['duration_category'] = df['duration'].map(duration_dict)

# --- time -> time_category --------------------------------------------------
# Same pattern as 'path': mean price per schedule key, computed once on the
# 'price' column only.
time_group = df.groupby('time')['price'].mean()
start = time_group.min()
finish = time_group.max()
time_ranges = generate.generate_categorical_ranges(3, start, finish)
time_dict = categorize.categorize_categorical_variable(time_group, time_ranges)
df['time_category'] = df['time'].map(time_dict)

# The raw columns are now represented by their *_category counterparts.
df = df.drop(columns=['days_left', 'path', 'duration', 'time'])

In [9]:
# Shared container passed to Build and Assemble below: the prepared frame is
# registered under 'data' with the dataset label 'A1'.
all_regression_test_dict = {'data': {'A1': df}}

In [10]:
# Train every pipeline listed in `models` with feature selection and
# dimensionality reduction switched on. 'price' is passed as the second
# argument — presumably the name of the target column.
build = Build(
    all_regression_test_dict,
    feature_selection=True,
    dimensionality_reduction=True,
)
build.build_regression_models(models, 'price')

Selecting features with SFM1 for A1
Reducing dimensions with PCA1 for A1
Training regression model LR1 for A1
Training done!

Selecting features with RFE1 for A1
Reducing dimensions with PCA1 for A1
Training regression model RFR1 for A1
Training done!



In [11]:
# Build the test tables, register them under 'test_tables', then compute the
# error values (the last expression is what the cell displays).
all_test_tables_dict = assemble.assemble_test_tables(all_regression_test_dict)
all_regression_test_dict.update(test_tables=all_test_tables_dict)
assemble.assemble_error_values(all_regression_test_dict)

Unnamed: 0,MEPE,MPE,MEAE,MAE,MSE,RMSE,NRMSE,STD
A1LR1SFM1PCA1,20.776,35.326,2158.929,4437.361,56711490.0,7530.703,0.066291,22612.54
A1RFR1RFE1PCA1,25.54,37.092,2499.899,4037.789,37304250.0,6107.721,0.053765,22612.54


In [16]:
# Display the entry stored under 'predictions' (presumably written by the
# Build/Assemble steps above — confirm in utils).
# NOTE(review): in the saved output these keys lack the 'PCA1' suffix that the
# error-table rows and the classification prediction keys carry — verify the
# key naming is consistent.
all_regression_test_dict['predictions']

{'A1LR1SFM1': array([ 8020.6083135 , 51247.63261611,  8020.6083135 , ...,
         5331.07079845,  8020.6083135 ,  5331.07079845]),
 'A1RFR1RFE1': array([ 6781.19851301, 57826.12632473,  4079.55203241, ...,
         7551.78385336,  6781.19851301,  4011.61191443])}

# Classification

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [4]:
# Load scikit-learn's bundled Iris dataset; its 'data', 'target' and
# 'feature_names' entries are used to build the classification frame.
iris = load_iris()

In [5]:
# One row per sample: the feature columns followed by the class label in
# 'target'. Stacking features and labels into a single array means the labels
# share the features' float dtype.
dfc = pd.DataFrame(
    data=np.column_stack((iris['data'], iris['target'])),
    columns=iris['feature_names'] + ['target'],
)

In [6]:
# Classification pipelines to train. Each entry is a two-item list:
#   [model spec, preprocessing spec]
# where the preprocessing spec names a feature-selection step and a
# dimensionality-reduction step, each given as [identifier, parameters].
models = [
    [
        {'model': 'LR1', 'parameters': {}},
        {
            'feature_selection': ['SFM1', {}],
            'dimensionality_reduction': ['PCA1', {}],
        },
    ],
    [
        {
            'model': 'RFC1',
            'parameters': {'n_estimators': 20, 'criterion': 'gini'},
        },
        {
            'feature_selection': ['RFE1', {'n_features_to_select': 3, 'step': 1}],
            'dimensionality_reduction': ['PCA1', {}],
        },
    ],
]

In [7]:
# Shared container passed to Build and Assemble below: the Iris frame is
# registered under 'data' with the dataset label 'A1'.
all_classification_test_dict = {'data': {'A1': dfc}}

In [8]:
# Train every pipeline listed in `models` with feature selection and
# dimensionality reduction switched on. 'target' is passed as the second
# argument — presumably the name of the label column.
build = Build(
    all_classification_test_dict,
    feature_selection=True,
    dimensionality_reduction=True,
)
build.build_classification_models(models, 'target')

Selecting features with SFM1 for A1
Reducing dimensions with PCA1 for A1
Training classification model LR1 for A1
Training done!

Selecting features with RFE1 for A1
Reducing dimensions with PCA1 for A1
Training classification model RFC1 for A1
Training done!



In [9]:
# Display the entry stored under 'predictions' (presumably written by the
# Build step above — confirm in utils).
all_classification_test_dict['predictions']

{'A1LR1SFM1PCA1': array([2., 1., 0., 2., 0., 2., 0., 1., 1., 1., 2., 1., 1., 1., 1., 0., 1.,
        1., 0., 0., 2., 1., 0., 0., 2., 0., 0., 1., 1., 0.]),
 'A1RFC1RFE1PCA1': array([2., 1., 0., 2., 0., 2., 0., 1., 1., 1., 2., 1., 1., 1., 1., 0., 1.,
        1., 0., 0., 2., 1., 0., 0., 2., 0., 0., 1., 1., 0.])}

In [10]:
# Produce the classification report from the shared container; the returned
# value is displayed as the cell output.
assemble.assemble_classification_report(all_classification_test_dict)

Unnamed: 0,CLASS,PRECISION,RECALL,F1-SCORE,SUPPORT,ACCURACY
A1LR1SFM1PCA1,0.0,1.0,1.0,1.0,11,1.0
A1LR1SFM1PCA1,1.0,1.0,1.0,1.0,13,1.0
A1LR1SFM1PCA1,2.0,1.0,1.0,1.0,6,1.0
A1RFC1RFE1PCA1,0.0,1.0,1.0,1.0,11,1.0
A1RFC1RFE1PCA1,1.0,1.0,1.0,1.0,13,1.0
A1RFC1RFE1PCA1,2.0,1.0,1.0,1.0,6,1.0
