# GENERATE CONFIG.JSON TO RUN SCORER

This notebook briefly explains each parameter (or points to where you can learn more about it) and, at the end, generates the file necessary to run the app.

### About the Project

<center><img src="best_scorer.jpg" alt="Drawing" style="width: 200px;" class="center"/></center>

This is a simple cookiecutter for the development of propensity models using PySpark.

In [None]:
import getpass
import json
import os
import pathlib

# Dictionary that accumulates every setting filled in by the cells below.
config = {}
# Name of the JSON file this notebook writes.
config_name = 'config.json'

# Login name used to build the HDFS folder. $USER is preferred, but it is not
# guaranteed to be set in every environment and os.environ['USER'] would then
# raise KeyError, so fall back to getpass.getuser().
user = os.environ.get('USER') or getpass.getuser()
project = 'ProjectName'
alias = 'r9'
# HDFS folder to save outputs of project
hdfs_path = f'/user/{user}/{project}'
# Absolute path of the current working directory
curr_dir = str(pathlib.Path().resolve())
# Databases to use
schema_in = 'DataBaseiN'
schema_out = 'DataBaseOut'
tables_in = ['tables']

In [None]:
# MANAGING PARAMETERS
config.update({
    # Project name: the most common name of the project, e.g. churn_tarj_deb
    'project': project,
    # Stage of development, DEV or PROD; in PROD the intermediate tables are erased
    'mode': 'ExecutionType',
    # Alias that identifies a different set of parameters
    'alias': alias,
})

In [None]:
# SPARK PARAMETERS
config.update({
    # Parameters of the Spark session
    'size': 'SizeSparkSession',
    # New column to partition
    'new_part_cols': ['NewColumToPartition'],
})

In [None]:
# SOURCE AND OUTPUT PARAMETERS
# TABLES
config['schema'] = schema_out
config['prod_path'] = f'{hdfs_path}/prod/'
# Source tables: one "clean_<n>" entry per table in tables_in, read from
# schema_in, followed by the two spine tables.
config['table_sources'] = {
    **{
        f'clean_{position}': f'{schema_in}.{table_name}'
        for position, table_name in enumerate(tables_in)
    },
    'spine': f'{schema_in}.potentialsTable',
    'spine_prod': f'{schema_in}.potentialsTableProd',
}
# Output table names: one "clean_<n>" entry per table in tables_in, followed by
# the project-prefixed tables; pred and score also carry the alias.
config['table_outputs'] = {
    **{
        f'clean_{position}': f'{project}_cleaned_{position}'
        for position, _ in enumerate(tables_in)
    },
    'spine': f'{project}_spine',
    'spine_prod': f'{project}_master_prod',
    'dataset': f'{project}_dataset',
    'pred': f'{project}_pred_{alias}',
    'score': f'{project}_score_{alias}',
    'prod': f'{project}_score_prod',
}
# FILES
config['outputs'] = {
    'prior': f'{hdfs_path}/prior',
    # One path under feat_sel/ per key
    'feat_sel': {
        key: f'{hdfs_path}/feat_sel/{key}'
        for key in ('infogain', 'chisqrd', 'cramer', 'mtinfo', 'chosen')
    },
    # Paths named "<key>_<alias>" directly under hdfs_path
    **{
        key: f'{hdfs_path}/{key}_{alias}'
        for key in (
            'model',
            'assembler',
            'idx_fit',
            'ohe',
            'ohe_fit',
            'feat_import',
            'metrics',
            'ks_train',
            'ks_test',
            'ks_val',
        )
    },
}

In [None]:
# FEATURES PARAMETERS
config.update(
    # Names of the features to drop from the very beginning
    feat_drop=[],
)

In [None]:
# DATE PARAMETERS
config.update(
    # Reference date and the respective format used
    ini_date=['Date', 'DateFormat'],
    # Quantity of months ahead of the reference date to use
    qty_months=0,
)

In [None]:
# MODELING PARAMETERS
config['conf'] = {
    # Quantity of columns executed at a time by the decorator
    # when running certain functions like countDistinct
    'step_size': 200,
    # Number of months to use as validation; the last ones are always picked
    'n_months_val': 1,
    # Size of train without sampling
    'train_size': 0.7,
    # Prior by month to use in sampling
    'prior_map': None,
    # Minimum prior by month to use when sampling
    'min_prior': None,
    # Factor that multiplies the original or given prior when sampling
    'factor_mult': 1,
    # Shuffle split data
    'shuffle': False,
    # Persist at every given frequency when joining split data
    'cache_freq': 5,
    # Identifier columns
    'spine': {
        'target_col': 'target',
        'artf_id_col': 'id',
        'ident_col': 'cliident',
        'date_col': 'data_date_part',
        'ind_split_col': 'ind_split',
    },
    # Thresholds used to drop low variance columns
    # When using drop_mode_threshold
    'null_threshold': 0.95,
    # When using rmv_num_low_vars
    'varc_threshold': 5,
    # Binary variables to input "sin_info" instead of 0
    'bin_input': [],
    # https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.approx_count_distinct.html
    'rsd': 0.015,
    # https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.var_pop.html
    # https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.var_samp.html
    'population': False,
    # Used in repartition
    'n_part': 100,
    # FEATURE SELECTION PARAMETERS
    # Which feature selection type to use; the options are:
    #   infogain (feature importance of base model),
    #   chisqrd (https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.ChiSqSelector.html),
    #   cramer (Cramer's V),
    #   mtinfo (mutual information)
    'feat_mode': 'infogain',
    # Selection criterion; the options are:
    #   top: keeps the top x variables
    #   percentile: for info gain, keeps all variables that sum up to x %
    #   threshold: keeps all variables above x
    #   perc: keeps x% of all variables
    'criterion': 'percentile',
    'top': 60,
    'percentile': 0.8,
    'threshold': 0.05,
    'perc': 0.3,
}

In [None]:
# SCORING PARAMETERS
# Names of columns
config['conf'].update(
    score_col='score',
    buckets_name='deciles',
)

In [None]:
# EVALUATE PARAMETERS
config['conf']['eval'] = {
    # Error when calculating percentiles
    'error': 0,
    # How many groups to discretize
    'buckets': 10,
}

In [None]:
# MAILING PARAMETERS
# Sender address, list of recipients and list of addresses in copy.
# ('from' is a Python keyword, so these are passed as a dict, not as kwargs.)
config.update({
    'from': 'felipemonteiro@outlook.com',
    'to': ['felipemonteiro@outlook.com'],
    'cc': [],
})

In [None]:
# PY FILES PARAMETERS
config.update(
    # scorer.zip located in the directory the notebook runs from
    py_file0=f'{curr_dir}/scorer.zip',
    py_file1='/path/package.egg',
)

In [None]:
# LOGGER PARAMETERS
config.update(
    logger_files=['log.log', 'stdlog.log'],
)

In [None]:
# REPORT FILES
# Report names built from the project name; the .xlsx reports also carry the alias.
config.update({
    'rep_feat_sel': f'feature_selection_{project}_{alias}.xlsx',
    'rep_perfnce': f'performance_{project}_{alias}.xlsx',
    'rep_auditnum': f'{project}_auditnum',
    'rep_auditcat': f'{project}_auditcat',
})

### Gen File

In [None]:
# Display the assembled configuration so it can be reviewed before it is written to disk.
config

In [None]:
# Serialize before opening the file: mode 'w' truncates on open, so if a value
# in config cannot be encoded as JSON the error is raised here and any existing
# file is left untouched instead of being replaced by a partial one.
config_json = json.dumps(config, ensure_ascii=False, indent=4)
# Write the JSON text as UTF-8 (ensure_ascii=False keeps non-ASCII characters as-is).
with open(config_name, 'w', encoding='utf-8') as config_file:
    config_file.write(config_json)