In [1]:
import numpy  as np
import pandas as pd
# import seaborn as sns
import pandas_profiling
import seaborn as sns
import pickle
import multiprocessing
from typing import List, Dict, Any, Tuple, Optional

from os.path import join as pjoin
from utils import uplift_score, DATA_PATH, SUBMISSIONS_PATH

In [72]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split

from CTL.CTL import CausalTree

In [69]:
def round_df_column(df, column, round_to):
    """Floor the values of ``df[column]`` down to a multiple of ``round_to``.

    The column is overwritten inside ``df`` itself; nothing is returned.
    Floor division is used, so negative values move away from zero
    (e.g. -5 with ``round_to=30`` becomes -30).
    """
    whole_steps = df[column] // round_to
    df[column] = whole_steps * round_to


def analyze_combs(df):
    """Print the product of the per-column distinct-value counts of ``df``.

    This is the size of the full cartesian grid of feature values, i.e. an
    upper bound on how many different rows the frame could contain.
    """
    total = 1
    for n_distinct in (df[name].nunique() for name in df.columns):
        total = total * n_distinct
    print('Combs: ', total)
    

def get_feature_importances(est, columns):
    """Tabulate the feature importances of a fitted estimator.

    ``est`` must expose ``feature_importances_``; ``columns`` supplies the
    matching feature names in the same order. Returns a DataFrame with the
    columns ``column`` and ``importance``, sorted by descending importance.
    """
    table = pd.DataFrame({
        'column': columns,
        'importance': est.feature_importances_,
    })
    ranked = table.sort_values(by='importance', ascending=False)
    return ranked

In [4]:
# Load the client table (both date columns parsed as datetimes) and the
# train / test client lists; all three frames are indexed by client_id.
df_clients = pd.read_csv(
    pjoin(DATA_PATH, 'clients.csv'),
    index_col='client_id',
    parse_dates=['first_issue_date', 'first_redeem_date'],
)
df_train = pd.read_csv(
    pjoin(DATA_PATH, 'uplift_train.csv'),
    index_col='client_id',
)
df_test = pd.read_csv(
    pjoin(DATA_PATH, 'uplift_test.csv'),
    index_col='client_id',
)

In [5]:
# Feature extraction.
# Both date columns are turned into whole days elapsed since the earliest
# first_issue_date in the client table.
# NOTE: despite the "_unixtime" suffix these columns hold day offsets from
# MIN_DATETIME, not Unix timestamps.
MIN_DATETIME = df_clients['first_issue_date'].min()
SECONDS_IN_DAY = 60 * 60 * 24
df_clients['first_issue_unixtime'] = (df_clients['first_issue_date'] - MIN_DATETIME).dt.total_seconds() // SECONDS_IN_DAY
df_clients['first_redeem_unixtime'] = (df_clients['first_redeem_date'] - MIN_DATETIME).dt.total_seconds() // SECONDS_IN_DAY

In [6]:
# Assemble the model feature frame (one row per client in df_clients):
#   - gender as three 0/1 indicator columns for the codes 'M', 'F' and 'U';
#   - age as given;
#   - issue / redeem dates as day offsets, plus the delay between them.
# fillna(0) replaces every missing value with 0 — presumably this mostly
# affects clients without a redeem date; verify against the data.
df_features = pd.DataFrame({
    'gender_M': (df_clients['gender'] == 'M').astype(int),
    'gender_F': (df_clients['gender'] == 'F').astype(int),
    'gender_U': (df_clients['gender'] == 'U').astype(int),
    'age': df_clients['age'],
    'first_issue_time': df_clients['first_issue_unixtime'],
    'first_redeem_time': df_clients['first_redeem_unixtime'],
    'issue_redeem_delay': df_clients['first_redeem_unixtime'] - df_clients['first_issue_unixtime'],
}).fillna(0)
# Size of the raw feature grid before any binning.
analyze_combs(df_features)

Combs:  1457271019200


In [7]:
# Coarsen the features to shrink the feature grid.
# Implausible ages are mapped to sentinel buckets: negative -> -10, above 80 -> -20.
# NOTE(review): this cell mutates df_features in place and is NOT idempotent —
# on a second run the -20 sentinel itself satisfies `age < 0` and is rewritten
# to -10, merging the two buckets. Re-run the cell that builds df_features first.
df_features.loc[df_features['age'] < 0, 'age'] = -10
df_features.loc[df_features['age'] > 80, 'age'] = -20
# Age is floored to decades; the day-based columns are floored to 30-day steps.
round_df_column(df_features, 'age', 10)
round_df_column(df_features, 'first_issue_time', 30)
round_df_column(df_features, 'first_redeem_time', 30)
round_df_column(df_features, 'issue_redeem_delay', 30)

# Size of the feature grid after binning.
analyze_combs(df_features)

Combs:  2230272


In [8]:
# Client-id index sets: the full train and test lists, plus a fixed-seed
# 70/30 split of the train ids into a learn part and a validation part.
indices_train, indices_test = df_train.index, df_test.index
indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.3,
    random_state=123,
)

In [9]:
# Feature matrices, selected from df_features by client id.
X_train = df_features.loc[indices_learn, :].values
X_valid = df_features.loc[indices_valid, :].values
# Fix: the test matrix must come from the test clients. It was previously
# built from indices_valid, which made it a silent copy of X_valid.
X_test = df_features.loc[indices_test, :].values

# Treatment flag and outcome for the learn part of the train set.
treatment_train = df_train.loc[indices_learn, 'treatment_flg'].values
target_train = df_train.loc[indices_learn, 'target'].values

# Treatment flag and outcome for the held-out validation part.
treatment_valid = df_train.loc[indices_valid, 'treatment_flg'].values
target_valid = df_train.loc[indices_valid, 'target'].values

In [88]:
%%time
# Transformed-outcome label for a plain classifier: with y and w both 0/1
# (assumed — TODO confirm), z is 1 exactly when the outcome equals the
# treatment flag (treated & responded, or untreated & did not respond).
y = target_train
w = treatment_train
z = y * w + (1 - y) * (1 - w)

# Shallow tree on z; used below for feature importances and visualisation.
dt = DecisionTreeClassifier(max_depth=5, random_state=1, min_samples_leaf=100)
dt.fit(X_train, z);

Wall time: 183 ms


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [89]:
get_feature_importances(dt, df_features.columns)

Unnamed: 0,column,importance
5,first_redeem_time,0.679684
3,age,0.188542
4,first_issue_time,0.082989
6,issue_redeem_delay,0.041321
0,gender_M,0.007465
1,gender_F,0.0
2,gender_U,0.0


In [76]:
# Write the fitted tree `dt` to 'dtree.dot' in the current working directory,
# labelling the splits with the df_features column names.
export_graphviz(dt, out_file = 'dtree.dot', feature_names = df_features.columns)

In [79]:
from IPython.display import SVG
from graphviz import Source

In [90]:
# Build a graphviz Source object from the tree's DOT text (out_file=None makes
# export_graphviz return the DOT source as a string instead of writing a file).
graph = Source(export_graphviz(dt, out_file=None, feature_names = df_features.columns, filled=True))
# SVG(graph.pipe(format='svg'))

In [91]:
# Render the tree to a PNG file; the recorded output shows 'dtree_render.png'.
# NOTE(review): view=True asks graphviz to open the rendered file in an
# external viewer — presumably this fails or blocks on a headless machine.
graph.format = 'png'
graph.render('dtree_render',view=True)

'dtree_render.png'

In [17]:
%%time
# Baseline: a single causal tree fitted on the learn split (features, outcome,
# treatment flag), with a fixed seed.
ctl = CausalTree(min_size=100, max_depth=3, seed=1)
ctl.fit(X_train, target_train, treatment_train)

Wall time: 1.7 s


In [18]:
%%time
ctl_predict = ctl.predict(X_valid)

Wall time: 96.7 ms


In [19]:
uplift_score(ctl_predict, treatment_valid, target_valid)

0.03218312235012133

In [20]:
def parallel(func, n_processes: int, all_args: List[tuple]) -> List[Any]:
    """Apply ``func`` to every argument tuple and collect the results in order.

    With ``n_processes > 1`` the calls are distributed over a
    ``multiprocessing.Pool`` of that size; otherwise they run sequentially in
    the current process (and a notice is printed).
    """
    if not n_processes > 1:
        print('parallel: one process')
        collected = []
        for call_args in all_args:
            collected.append(func(*call_args))
        return collected

    with multiprocessing.Pool(n_processes) as pool:
        return pool.starmap(func, all_args)

In [63]:
class UpliftRandomForestClassifier:
    """Random-forest style ensemble of ``CausalTree`` uplift models.

    Every tree is fitted on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the feature columns
    (drawn without replacement). ``predict`` averages the per-tree outputs.

    Parameters
    ----------
    n_estimators:
        Number of trees in the ensemble. The tree's position (0, 1, ...) is
        also used as its ``CausalTree`` seed.
    n_jobs:
        Number of processes handed to ``parallel``.
    max_features:
        Number of feature columns sampled per tree. Clipped to the number of
        columns of ``X`` so narrow matrices do not make the sampling fail.
    max_depth, min_size:
        Forwarded unchanged to every ``CausalTree``.
    """

    def __init__(
            self,
            n_estimators: int = 100,
            n_jobs: int = 1,
            max_features: int = 3,
            max_depth: int = 3,
            min_size: int = 100,
    ):
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_size = min_size

        # Fitted trees and, position by position, the column subset each one
        # was trained on (needed again at prediction time).
        self.trees = []
        self.samples_col_indices = []

    @staticmethod
    def _fit_tree(
        X: np.ndarray,
        target: np.ndarray,
        treatment: np.ndarray,
        seed: int,
        max_depth: int = 3,
        min_size: int = 100,
    ) -> CausalTree:
        """Fit and return one ``CausalTree`` on an already sub-sampled matrix."""
        print('fit tree')
        ctl = CausalTree(min_size=min_size, max_depth=max_depth, seed=seed)
        ctl.fit(X, target, treatment)
        return ctl

    def fit(self, X: np.ndarray, target: np.ndarray, treatment: np.ndarray):
        """Fit ``n_estimators`` trees on bootstrap rows and random column subsets.

        ``X`` is a 2-D array; ``target`` and ``treatment`` are indexed with
        the same row positions as ``X``. Any previously fitted state is
        replaced, so the model can be refitted safely.
        """
        # Fixed seed so the row / column sampling is reproducible.
        np.random.seed(1)

        n_rows, n_columns = X.shape
        # Never ask for more distinct columns than exist: sampling without
        # replacement would raise otherwise.
        n_features = min(self.max_features, n_columns)
        all_row_indices = np.arange(n_rows)
        all_col_indices = np.arange(n_columns)

        all_args = []
        # Collected locally and assigned only after fitting succeeds, so a
        # refit never mixes new trees with column subsets of an earlier fit.
        samples_col_indices = []
        for i in range(self.n_estimators):
            row_indices = np.random.choice(all_row_indices, size=n_rows,
                                           replace=True)
            col_indices = np.random.choice(all_col_indices, size=n_features,
                                           replace=False)
            sample = X[row_indices, :][:, col_indices]
            sample_target = target[row_indices]
            sample_treatment = treatment[row_indices]
            all_args.append((sample, sample_target, sample_treatment, i,
                             self.max_depth, self.min_size))
            samples_col_indices.append(col_indices)

        self.trees = parallel(self._fit_tree, self.n_jobs, all_args)
        self.samples_col_indices = samples_col_indices

    @staticmethod
    def _predict_from_tree(ctl: CausalTree, X: np.ndarray) -> np.ndarray:
        """Return the prediction of a single fitted tree for ``X``."""
        prediction = ctl.predict(X)
        return prediction

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Average the predictions of all trees for the rows of ``X``.

        Each tree only sees the columns it was trained on.

        Raises
        ------
        ValueError
            If the model has not been fitted (there are no trees to average).
        """
        if not self.trees:
            raise ValueError(
                'UpliftRandomForestClassifier is not fitted: call fit() '
                'before predict().'
            )

        all_args = [
            (tree, X[:, col_indices])
            for tree, col_indices in zip(self.trees, self.samples_col_indices)
        ]
        predictions_all = parallel(self._predict_from_tree, self.n_jobs,
                                   all_args)
        # One row per tree; the mean over axis 0 is the ensemble prediction.
        predictions_mat = np.vstack(predictions_all)
        predictions = predictions_mat.mean(axis=0)
        return predictions

In [64]:
%%time
rf_clf = UpliftRandomForestClassifier(n_estimators=10, n_jobs=1)
rf_clf.fit(X_train, target_train, treatment_train)

parallel: one process
fit tree
fit tree
fit tree
fit tree
fit tree
fit tree
fit tree
fit tree
fit tree
fit tree
Wall time: 6.95 s


In [65]:
%%time
pred = rf_clf.predict(X_valid)

parallel: one process
Wall time: 733 ms


In [66]:
uplift_score(pred, treatment_valid, target_valid)

0.029774313423366228

In [41]:
t = rf_clf.trees[0]