In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Create dataset

In [4]:
# Definir el tamaño del dataset
n_dataset = 100000

In [7]:
# Create date
date_list=[datetime.today()- relativedelta(months = x) for x in range(50,-1,-1)]
month_list=[datetime.strftime(x,'%Y%m') for x in date_list]

n = len(month_list)
array = np.random.choice(range(5,10,1), size=n)
array = array/sum(array)
date_data = np.random.choice(month_list, size=n_dataset, p=array)

In [8]:
# Create client_id
unique_client_id = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
client_id_data = np.random.choice(unique_client_id, size=n_dataset)

In [9]:
df = pd.DataFrame({'date': date_data,
                   'client_id': client_id_data,
                   'Y': np.random.choice([1,0], size=n_dataset, p=[0.05,0.95]), # If purchased the product or not
                   'X1': np.random.choice(range(10), size=n_dataset),
                   'X2': np.random.choice(range(100), size=n_dataset),
                   'X3': np.random.choice(range(-10,10,1), size=n_dataset)
})
df.head(3)

Unnamed: 0,date,client_id,Y,X1,X2,X3
0,202112,n,0,5,44,-9
1,201910,u,0,4,29,9
2,202006,x,0,7,42,6


# Para utilizar estos custom-time-series-split:

- Los datos deben estar preprocesados y listos para el entrenamiento (considerar NaNs)
- La data debe tener una fecha

### Custom time-series split: Ventana móvil con tamaño FIJO

Ejemplo:

`TRAIN:  ['201801', '201802', '201803'] TEST:  ['201804']` <br>
`TRAIN:  ['201802', '201803', '201804'] TEST:  ['201805']` <br>
`TRAIN:  ['201803', '201804', '201805'] TEST:  ['201806']` <br>
`TRAIN:  ['201804', '201805', '201806'] TEST:  ['201807']` <br>
`TRAIN:  ['201805', '201806', '201807'] TEST:  ['201808']` <br>
`TRAIN:  ['201806', '201807', '201808'] TEST:  ['201809']` <br>
`TRAIN:  ['201807', '201808', '201809'] TEST:  ['201810']` <br>
`TRAIN:  ['201808', '201809', '201810'] TEST:  ['201811']` <br>

In [10]:
data = df.sort_values("date").reset_index(drop=True).copy() # Toma los índices, importante resetearlos después de ordenarlos
month_list = data["date"].drop_duplicates().tolist() # ['2021-01-01',  '2021-02-01',  '2021-03-01', .. ]

splits = {'train': [], 'test': []}
n = 3 # Tamaño de la ventana móvil

for idx, yr in enumerate(month_list[:-n]):
    train_mth = month_list[idx:idx+n]
    test_mth = [month_list[idx+n]]
    print('TRAIN: ', train_mth, 'TEST: ',test_mth)
    
    splits['train'].append(data.loc[data.date.isin(train_mth), :])
    splits['test'].append(data.loc[data.date.isin(test_mth), :])

TRAIN:  ['201801', '201802', '201803'] TEST:  ['201804']
TRAIN:  ['201802', '201803', '201804'] TEST:  ['201805']
TRAIN:  ['201803', '201804', '201805'] TEST:  ['201806']
TRAIN:  ['201804', '201805', '201806'] TEST:  ['201807']
TRAIN:  ['201805', '201806', '201807'] TEST:  ['201808']
TRAIN:  ['201806', '201807', '201808'] TEST:  ['201809']
TRAIN:  ['201807', '201808', '201809'] TEST:  ['201810']
TRAIN:  ['201808', '201809', '201810'] TEST:  ['201811']
TRAIN:  ['201809', '201810', '201811'] TEST:  ['201812']
TRAIN:  ['201810', '201811', '201812'] TEST:  ['201901']
TRAIN:  ['201811', '201812', '201901'] TEST:  ['201902']
TRAIN:  ['201812', '201901', '201902'] TEST:  ['201903']
TRAIN:  ['201901', '201902', '201903'] TEST:  ['201904']
TRAIN:  ['201902', '201903', '201904'] TEST:  ['201905']
TRAIN:  ['201903', '201904', '201905'] TEST:  ['201906']
TRAIN:  ['201904', '201905', '201906'] TEST:  ['201907']
TRAIN:  ['201905', '201906', '201907'] TEST:  ['201908']
TRAIN:  ['201906', '201907', '2

### Custom time-series split: Ventana móvil con tamaño VARIABLE hasta un threshold de tamaño

Ejemplo:

`TRAIN:  ['201801'] TEST:  ['201802']`<br>
`TRAIN:  ['201801', '201802'] TEST:  ['201803']`<br>
`TRAIN:  ['201801', '201802', '201803'] TEST:  ['201804']`<br>
`TRAIN:  ['201801', '201802', '201803', '201804'] TEST:  ['201805']`<br>
`TRAIN:  ['201802', '201803', '201804', '201805'] TEST:  ['201806']`<br>
`TRAIN:  ['201803', '201804', '201805', '201806'] TEST:  ['201807']`<br>

In [11]:
data = df.sort_values("date").reset_index(drop=True).copy() # Toma los índices, importante resetearlos después de ordenarlos
month_list = data["date"].drop_duplicates().tolist() # ['2021-01-01',  '2021-02-01',  '2021-03-01', .. ]

splits = {'train': [], 'test': []}
n = 1 # Tamaño inicial de la ventana móvil
threshold = 4 # Tamaño de la ventana fija

for idx, yr in enumerate(month_list[:-n]):
    
    if idx + n <= threshold:
        train_mth = month_list[:idx+n]
        test_mth = [month_list[idx+n]]
        print('TRAIN: ', train_mth, 'TEST: ',test_mth)

    else:
        train_mth = month_list[idx-threshold+n:idx+n]
        test_mth = [month_list[idx+n]]
        #print(idx, n)
        print('TRAIN: ', train_mth, 'TEST: ',test_mth)

    splits['train'].append(data.loc[data.date.isin(train_mth), :])
    splits['test'].append(data.loc[data.date.isin(test_mth), :])

TRAIN:  ['201801'] TEST:  ['201802']
TRAIN:  ['201801', '201802'] TEST:  ['201803']
TRAIN:  ['201801', '201802', '201803'] TEST:  ['201804']
TRAIN:  ['201801', '201802', '201803', '201804'] TEST:  ['201805']
TRAIN:  ['201802', '201803', '201804', '201805'] TEST:  ['201806']
TRAIN:  ['201803', '201804', '201805', '201806'] TEST:  ['201807']
TRAIN:  ['201804', '201805', '201806', '201807'] TEST:  ['201808']
TRAIN:  ['201805', '201806', '201807', '201808'] TEST:  ['201809']
TRAIN:  ['201806', '201807', '201808', '201809'] TEST:  ['201810']
TRAIN:  ['201807', '201808', '201809', '201810'] TEST:  ['201811']
TRAIN:  ['201808', '201809', '201810', '201811'] TEST:  ['201812']
TRAIN:  ['201809', '201810', '201811', '201812'] TEST:  ['201901']
TRAIN:  ['201810', '201811', '201812', '201901'] TEST:  ['201902']
TRAIN:  ['201811', '201812', '201901', '201902'] TEST:  ['201903']
TRAIN:  ['201812', '201901', '201902', '201903'] TEST:  ['201904']
TRAIN:  ['201901', '201902', '201903', '201904'] TEST: 

In [12]:
custom_cv = []

for FOLD_train,FOLD_test in zip(splits['train'],splits['test']):
    custom_cv.append((np.array(FOLD_train.index.values.tolist()),np.array(FOLD_test.index.values.tolist())))

In [19]:
# Para visualizar
splits['train'][0].head(3)

Unnamed: 0,date,client_id,Y,X1,X2,X3
0,201801,p,0,5,90,7
1,201801,o,0,7,14,9
2,201801,w,0,0,78,-7


In [20]:
# Para visualizar
splits['train'][0]['date'].value_counts()

201801    2208
Name: date, dtype: int64

In [21]:
# Para visualizar
splits['train'][1]['date'].value_counts()

201802    2411
201801    2208
Name: date, dtype: int64

In [22]:
# Para visualizar
splits['train'][2]['date'].value_counts()

201802    2411
201803    2250
201801    2208
Name: date, dtype: int64

In [32]:
# Para visualizar
for i in range(10):
    print('train:', splits['train'][i]['date'].value_counts())
    print('test:', splits['test'][i]['date'].value_counts())
    print("")

train: 201801    2208
Name: date, dtype: int64
test: 201802    2411
Name: date, dtype: int64

train: 201802    2411
201801    2208
Name: date, dtype: int64
test: 201803    2250
Name: date, dtype: int64

train: 201802    2411
201803    2250
201801    2208
Name: date, dtype: int64
test: 201804    2162
Name: date, dtype: int64

train: 201802    2411
201803    2250
201801    2208
201804    2162
Name: date, dtype: int64
test: 201805    1404
Name: date, dtype: int64

train: 201802    2411
201803    2250
201804    2162
201805    1404
Name: date, dtype: int64
test: 201806    1714
Name: date, dtype: int64

train: 201803    2250
201804    2162
201806    1714
201805    1404
Name: date, dtype: int64
test: 201807    2458
Name: date, dtype: int64

train: 201807    2458
201804    2162
201806    1714
201805    1404
Name: date, dtype: int64
test: 201808    1637
Name: date, dtype: int64

train: 201807    2458
201806    1714
201808    1637
201805    1404
Name: date, dtype: int64
test: 201809    2511
Name

In [289]:
X_cols = data.drop(['date', 'client_id', 'Y'], axis=1).columns

In [290]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

dict_classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    #"Linear SVM": SVC(),
    #"XGB": XGBRegressor(),
    #"Logistic Regression": LogisticRegression(),
    #"Nearest Neighbors": KNeighborsClassifier(),
    #"Decision Tree": DecisionTreeClassifier(),
}

params = {
    "Random Forest": {"max_depth": [3,4], 
                      "min_samples_leaf": [5],
                      "n_estimators": [100]},

    "Gradient Boosting Classifier": {"learning_rate": [0.01], 
                                     "n_estimators": [500],
                                     },
    
    #"Linear SVM": {"kernel": ["rbf", "poly"], "gamma": ["auto", "scale"], "degree": range(1, 4, 1)},
    #"XGB": {'min_child_weight': [1, 5, 10],
    #        'gamma': [0.1, 1, 1.5, 2, 5],
    #        'subsample': [0.6, 0.8, 1.0],
    #        'colsample_bytree': [0.6, 0.8, 1.0],
    #        'max_depth': [3, 4, 5], 
    #        "n_estimators": [300, 600],
    #        "learning_rate": [0.001, 0.01, 0.1],
    #        },
    #"Logistic Regression": {'penalty': ['none', 'l2'], 
    #                        'C': [1]},
    #"Nearest Neighbors": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    #"Decision Tree": {'criterion': ['gini', 'entropy'], 'max_depth': np.arange(3, 15)},
}

for classifier_name in dict_classifiers.keys() & params:

    print("training: ", classifier_name)
    gridSearch = GridSearchCV(
        estimator=dict_classifiers[classifier_name], 
        param_grid=params[classifier_name], 
        cv=custom_cv, 
        verbose=2,
        n_jobs=-1
    )
    
    gridSearch.fit(data[X_cols].to_numpy(), # shoud have shape of (n_samples, n_features) 
                   data[['Y']].to_numpy().reshape((-1))) #this should be an array with shape (n_samples,)
    print(gridSearch.best_score_, gridSearch.best_params_)

training:  Gradient Boosting Classifier
Fitting 50 folds for each of 1 candidates, totalling 50 fits
0.9508799863444023 {'learning_rate': 0.01, 'n_estimators': 500}
training:  Random Forest
Fitting 50 folds for each of 2 candidates, totalling 100 fits
0.9509445931410814 {'max_depth': 3, 'min_samples_leaf': 5, 'n_estimators': 100}
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...............learning_rate=0.01, n_estimators=500; total time=   0.0s
[CV] END ...........

In [47]:
gridSearch.scorer_

<function sklearn.metrics._scorer._passthrough_scorer(estimator, *args, **kwargs)>

In [48]:
gridSearch.best_estimator_

RandomForestClassifier(max_depth=4, min_samples_leaf=5)