In [6]:
import pandas as pd
import joblib
import importlib
from pathlib import Path

#Import package modules
import preprocess 
from fmodel import FraudModel

#Logging configuration for the notebook
import logging

#Root logger: INFO and above, rendered as "<timestamp>-<LEVEL>-<logger name>: <message>"
logging.basicConfig(
    format="%(asctime)s-%(levelname)s-%(name)s: %(message)s",
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO,
)

#Logger used by the functions defined in this notebook, also at INFO
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)



In [24]:
def get_data_from_csv(data_dir, filename_pat_list):
    """
    Function to read every csv file matching the given patterns into one dataframe
    
    Parameters:
    -----------
    data_dir(str): Directory where csv files are placed of the data to be read
    filename_pat_list(list): List of patterns of filenames to be read like ['data*.csv','dec*.csv'].
                             A single pattern passed as a plain string is also accepted.
    
    Returns:
    -----------
    Dataframe object holding the rows of all matched files, taken pattern by pattern
    and in sorted path order within each pattern. The row index of each file is kept
    as read (it is not reset). An empty dataframe is returned when no file matches.
    
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(filename_pat_list, str):
        filename_pat_list = [filename_pat_list]

    path = Path(data_dir)

    new_data_files = []
    for pat in filename_pat_list:
        new_data_files.extend(sorted(path.glob(pat)))

    # Read everything first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames = [pd.read_csv(file) for file in new_data_files]

    # pd.concat raises on an empty list, so fall back to an empty frame as before.
    out_df = pd.concat(frames) if frames else pd.DataFrame()

    logger.info(f'{len(new_data_files)} files read')
    logger.info(f'Resulting dataframe has shape: {out_df.shape}')

    return out_df

def train_model(data_dir, filename_pat_list, new_enc_path='', fname_for_new_model='', samples_each_class=None, valid_pct=0.33, test_threshold=0.7, test_mode='N'):
    """
    Function to read training csv files, preprocess them and train a new FraudModel
    
    Parameters:
    -----------
    data_dir(str): Directory where csv files are placed of the data to be read
    filename_pat_list(list): List of patterns of filenames to be read like ['data*.csv','dec*.csv']
    new_enc_path(str): Passed to preprocess.preprocess as enc_path (presumably where the newly created encoder is saved - confirm in preprocess)
    fname_for_new_model(str): Passed to FraudModel.train as fname (presumably the file the trained model is written to - confirm in fmodel)
    samples_each_class, valid_pct, test_threshold: Passed unchanged to FraudModel.train
    test_mode(str): 'Y' keeps only the first 500 rows for a quick trial run; any other value uses all rows
    
    Returns:
    -----------
    The FraudModel object that train() was called on
    
    """
    in_df = get_data_from_csv(data_dir, filename_pat_list)
    if test_mode == 'Y':
        in_df = in_df[:500]

    # No existing encoder is supplied here (cat_encoder=None); ask_model is the path that loads one.
    in_df = preprocess.preprocess(in_df, cat_encoder=None, enc_path=new_enc_path)

    model = FraudModel()
    model.train(in_df, samples_each_class=samples_each_class, valid_pct=valid_pct, test_threshold=test_threshold, fname=fname_for_new_model)

    # Hand the model back so the caller can use it directly; it used to be dropped on return.
    return model
    
def ask_model(data_dir, filename_pat_list, cat_encoder_path, model_file, test_mode='N'):
    """
    Function to get predictions for csv data from a previously saved encoder and model
    
    Parameters:
    -----------
    data_dir(str): Directory where csv files are placed of the data to be read
    filename_pat_list(list): List of patterns of filenames to be read like ['data*.csv','dec*.csv']
    cat_encoder_path(str): File from which the encoder is loaded with joblib
    model_file(str): File handed to FraudModel.deserialize to load the model
    test_mode(str): 'Y' keeps only the first 50 rows for a quick trial run; any other value uses all rows
    
    Returns:
    -----------
    Whatever the loaded model's predict() returns for the preprocessed data
    
    """
    raw_df = get_data_from_csv(data_dir, filename_pat_list)
    scoring_df = raw_df[:50] if test_mode == 'Y' else raw_df

    # NOTE(review): joblib.load is pickle-based, so only point this at encoder files you trust.
    encoder = joblib.load(cat_encoder_path)
    features_df = preprocess.preprocess(scoring_df, cat_encoder=encoder)

    return FraudModel.deserialize(model_file).predict(features_df)

In [26]:
#Score the dec*.csv data with the saved encoder and model
ask_model(
    data_dir='/home/gjain',
    filename_pat_list=['dec*.csv'],
    cat_encoder_path='/home/gjain/ordinal_encoder.pkl',
    model_file='/home/gjain/lightgbmv1.pkl',
)

  exec(code_obj, self.user_global_ns, self.user_ns)
18-Jun-19 13:39:36-INFO-__main__: 1 files read
18-Jun-19 13:39:36-INFO-__main__: Resulting dataframe has shape: (267726, 91)
18-Jun-19 13:39:39-INFO-preprocess: Data received. Creating features...
18-Jun-19 13:39:41-INFO-preprocess: Encoder provided
18-Jun-19 13:39:41-INFO-preprocess: Sum of null values is 0
18-Jun-19 13:39:41-INFO-fmodel: Model loaded from file /home/gjain/lightgbmv1.pkl
  X=in_df[all_cols].as_matrix()


0      Accept   
1      High Risk
2      Accept   
3      Accept   
4      Accept   
5      Accept   
6      Accept   
7      Accept   
8      Accept   
9      Accept   
10     Accept   
11     Accept   
12     High Risk
13     Accept   
14     Accept   
15     Accept   
16     High Risk
17     Accept   
18     Accept   
19     Accept   
20     Accept   
21     Accept   
22     Accept   
23     Accept   
24     Accept   
25     Accept   
26     Accept   
27     Accept   
28     Accept   
29     Accept   
30     Accept   
31     Accept   
32     Accept   
33     Accept   
34     High Risk
35     Accept   
36     Accept   
37     Accept   
38     Accept   
39     Accept   
40     Accept   
41     Accept   
42     High Risk
43     High Risk
44     Accept   
45     Accept   
46     Accept   
47     Accept   
48     Accept   
49     Accept   
50     High Risk
51     Accept   
52     Accept   
53     Accept   
54     Accept   
55     Accept   
56     Accept   
57     Accept   
58     High Ri

In [25]:
#Train a new model on the dec*.csv data with the default settings
train_model(
    data_dir='/home/gjain',
    filename_pat_list=['dec*.csv'],
)

18-Jun-19 13:38:51-INFO-__main__: 1 files read
18-Jun-19 13:38:51-INFO-__main__: Resulting dataframe has shape: (267726, 91)
18-Jun-19 13:38:51-INFO-preprocess: Data received. Creating features...
18-Jun-19 13:38:53-INFO-preprocess: Creating encoder for training dataset
18-Jun-19 13:38:54-INFO-preprocess: Sum of null values is 0
18-Jun-19 13:38:54-INFO-fmodel: Run this only if you have the chargeback data for this time period available
18-Jun-19 13:38:54-INFO-fmodel: Treating class imbalance
  train=X.as_matrix()
18-Jun-19 13:38:57-INFO-fmodel: Training the model now
New categorical_feature is [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 52, 53, 54, 56, 57, 58, 65, 67, 68, 69, 71]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
18-Jun-19 13:38:57-INFO-fmod

[1]	valid_0's binary_logloss: 0.693164
Training until validation scores don't improve for 25 rounds.
[2]	valid_0's binary_logloss: 0.693164
[3]	valid_0's binary_logloss: 0.693164
[4]	valid_0's binary_logloss: 0.693164
[5]	valid_0's binary_logloss: 0.693164
[6]	valid_0's binary_logloss: 0.693164
[7]	valid_0's binary_logloss: 0.693164
[8]	valid_0's binary_logloss: 0.693164
[9]	valid_0's binary_logloss: 0.693164
[10]	valid_0's binary_logloss: 0.693164
[11]	valid_0's binary_logloss: 0.693164
[12]	valid_0's binary_logloss: 0.693164
[13]	valid_0's binary_logloss: 0.693164
[14]	valid_0's binary_logloss: 0.693164
[15]	valid_0's binary_logloss: 0.693164
[16]	valid_0's binary_logloss: 0.693164
[17]	valid_0's binary_logloss: 0.693164
[18]	valid_0's binary_logloss: 0.693164
[19]	valid_0's binary_logloss: 0.693164
[20]	valid_0's binary_logloss: 0.693164
[21]	valid_0's binary_logloss: 0.693164
[22]	valid_0's binary_logloss: 0.693164
[23]	valid_0's binary_logloss: 0.693164
[24]	valid_0's binary_loglo