In [None]:
!pip install gradio
!pip install econml
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import sys

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from econml.metalearners import SLearner, TLearner, XLearner

import warnings
from numba.core.errors import NumbaDeprecationWarning

# Global lists for convenience
# Day names, Monday..Sunday (same order as pandas `dayofweek`, where Monday == 0).
day_of_week_labels = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'Пятница', 'Суббота', 'Воскресенье']
# Nominative month names, January..December; indexed with `month - 1`.
# The third entry used to be a duplicated 'Май' instead of 'Март'.
months_labels = ['Январь', 'Февраль', 'Март', 'Апрель', 'Май', 'Июнь', 'Июль', 'Август', 'Сентябрь', 'Октябрь',
                 'Ноябрь', 'Декабрь']
# Numeric rank of every loyalty status; used to detect status upgrades / downgrades.
hierarchy = {"Бронзовый ключ": 0, "Серебряный ключ": 1, "Золотой ключ": 2, "Платиновый ключ": 3, "Сотрудник": 4}

In [2]:
class TreatmentEstimator:
    def __init__(self, args):
        """"
        file - имя файла с транзакциями клиентов
        T - тип воздействия. Может быть выбран одним из дефолтных - повышение или понижение статуса.
        Может быть задан самостоятельно через определение колонки
        Y - анализируемые результаты воздействия (y)
        Val - способ подбора гиперпараметров обучения
        max_clients - максимальное число возвращаемых клиентов
        tol - минимальный порог на cate, чтобы клиент попал в список восприимчивых
        """
        # dataset_file, treatment_radio, column_text, aggregation_radio, outcome_radio, validation_radio, cate_tol_text
        file_index, t_index, col_index, agg_index, y_index, val_index, max_clients_index, tol_index = 0, 1, 2, 3, 4, 5, 6, 7

        if isinstance(args[file_index], str):
          file_name = args[file_index]
        else:
          file_name = args[file_index].name

        # Treatment Type
        if args[t_index] in ['StatusAscending', 'StatusDescending']:
            self.treatment_type = args[t_index]
            self.aggregation_func = None
        else:
            self.treatment_type = args[col_index]
            if args[agg_index] == 'Any':
                self.aggregation_func = np.any
            elif args[agg_index] == 'All':
                self.aggregation_func = np.all
            elif args[agg_index] == 'Mode':
                self.aggregation_func = stats.mode
            else:
                raise ValueError("Unknown Accumulation method")

        # Outcome Type
        if args[y_index] in ['Conversion', 'Frequency', 'Monetary']:
            self.outcome_type = args[y_index]
        else:
            raise ValueError("Unknown Outcome type")

        # Validation Type
        if args[val_index] in ['PredefinedHyperparameters', 'FastValidation', 'FullValidation']:
            self.validation_type = args[val_index]
        else:
            raise ValueError("Unknown Validation type")

        # Tolerance
        self.tol = float(args[tol_index])
        # Max clients
        self.max_clients = int(args[max_clients_index]) if args[max_clients_index] != '' else None

        # Read File
        self.transactions = pd.read_csv(file_name, sep=';', parse_dates=['Дата'], dayfirst=True)
        self.transactions.rename(
            columns={"Дата": "Date", "Профиль участника": "UID", "Вид операции": "OperationType", "Сумма": "Price",
                     "Сумма списанных монет": "SpentCoins", "Сумма начисленных монет": "GainedCoins",
                     "Ресторан": "Point", "Агент продаж": "Agent", "Статус": "Status"}, inplace=True)
        self.transactions.set_index('Date', inplace=True)
        self.transactions.sort_index(inplace=True)
        self.transactions.drop(columns=['Номер'], inplace=True)

        # others
        self.start_dt = None
        self.end_dt = None
        self.period = None

        self.clients = None

        self.X = None
        self.y = None
        self.treatment = None
        self.categorical_features_names = []
        self.numerical_features_names = []

        self.best_params_default = {'max_depth': 40, 'n_estimators': 600, 'lr': 0.001}
        self.model = None

    def preprocess(self):
        """
        Prepare the raw transactions for feature building.

        Records the covered period (``start_dt``, ``end_dt``, ``period``), prints a short
        summary, replaces missing values with zeros and adds the calendar columns
        (DayOfWeek, Month, Hour, DayOfYear) plus two columns derived from the loyalty
        status: ``Discount`` and the numeric rank ``StatusNew``.

        :raises AssertionError: when the number of columns differs from the expected one
        :raises KeyError: when a status is missing from the module-level ``hierarchy``
        """
        def _genitive(month_name):
            # Month names ending in a soft sign or 'й' swap that letter for 'я'
            # (Январь -> Января, Май -> Мая); the others take 'а' (Март -> Марта,
            # Август -> Августа). Blindly swapping the last letter produced "Авгуся".
            if month_name[-1] in 'ьй':
                return month_name[:-1] + 'я'
            return month_name + 'а'

        # The index is sorted by date, so its ends are the first and the last transaction.
        self.start_dt, self.end_dt = self.transactions.index[0], self.transactions.index[-1]
        self.period = self.end_dt - self.start_dt
        print(f'Датасет начинается {self.start_dt.day} {_genitive(months_labels[self.start_dt.month - 1])} и '
              f'заканчивается {self.end_dt.day} {_genitive(months_labels[self.end_dt.month - 1])} включительно')

        m, n = self.transactions.shape
        # NOTE(review): only the lengths of these lists are checked below; confirm the names
        # still match the columns actually present in the file.
        numerical = ['Price', 'SpentCoins', 'GainedCoins', 'ActualPrice']
        categorical = ['OperationType', 'Point', 'Agent', 'Status']

        print(f"В датасете {m} строк и {n} столбцов")
        print(f"Из них {len(numerical)} численных и {len(categorical)} категориальных")
        print(self.transactions.columns)
        assert (len(numerical) + len(categorical) == n)

        # Assign the filled columns back: `df[col].fillna(..., inplace=True)` works on a
        # temporary object and is not guaranteed to modify the frame itself.
        cols_with_nans = [col for col in self.transactions if self.transactions[col].isna().any()]
        if cols_with_nans:
            self.transactions[cols_with_nans] = self.transactions[cols_with_nans].fillna(0)
        print("Найдены пропуски в колонках", *cols_with_nans)
        print("Пропуски заполнены нулями")

        self.transactions['DayOfWeek'] = self.transactions.index.dayofweek
        self.transactions['Month'] = self.transactions.index.month
        self.transactions['Hour'] = self.transactions.index.hour
        self.transactions['DayOfYear'] = self.transactions.index.dayofyear

        # `map` yields a float column directly; any status missing from this mapping is
        # rejected by the `hierarchy` lookup right below, exactly as before.
        self.transactions['Discount'] = self.transactions['Status'].map(
            {"Бронзовый ключ": 0.02, "Серебряный ключ": 0.03, "Золотой ключ": 0.05, "Платиновый ключ": 0.07,
             "Сотрудник": 0.15})

        self.transactions['StatusNew'] = self.transactions["Status"].apply(lambda x: hierarchy[x])

    def set_treatment(self):
        """
        Определение воздействия
        """
        if self.treatment_type in ['StatusAscending', 'StatusDescending']:
            middle_data = self.transactions.loc[self.start_dt + self.period / 3:self.end_dt - self.period / 3].groupby(
                'UID').StatusNew
            if self.treatment_type == 'StatusAscending':
                self.clients['TreatmentStatus'] = middle_data.is_monotonic_increasing & (middle_data.nunique() > 1)
            else:
                self.clients['TreatmentStatus'] = middle_data.is_monotonic_decreasing & (middle_data.nunique() > 1)
            self.clients['TreatmentStatus'].fillna(value=False, inplace=True)
            self.clients['TreatmentStatus'] = self.clients['TreatmentStatus'].astype(int)
        else:
            self.clients['TreatmentStatus'] = self.aggregation_func(
                self.transactions.groupby('UID')[self.treatment_type])

    def set_outcome(self):
        """
        Определение результата возьдействия
        """
        if self.outcome_type == 'Conversion':
            self.clients['outcome'] = self.transactions.loc[self.start_dt - self.period / 3:].groupby('UID')\
                .Price.mean()
        elif self.outcome_type == 'Monetary':
            self.clients['outcome'] = self.transactions.loc[self.start_dt - self.period / 3:].groupby('UID').Price.sum()
        elif self.outcome_type == 'Frequency':
            self.clients['outcome'] = self.transactions.loc[self.start_dt - self.period / 3:].groupby('UID').UID.count()

        self.clients['outcome'].fillna(0, inplace=True)

    def set_features(self):
        """
        Добавление признакового описания клиентов
        """
        period = self.end_dt - self.start_dt
        self.clients['Recency'] = self.transactions.loc[:self.start_dt + period / 3].groupby('UID').DayOfYear.max()
        self.clients['Frequency'] = self.transactions.loc[:self.start_dt + period / 3].groupby('UID').Price.count()
        self.clients['Monetary_Sum'] = self.transactions.loc[:self.start_dt + period / 3].groupby('UID').Price.sum()
        self.clients['Monetary_Mean'] = self.transactions.loc[:self.start_dt + period / 3].groupby('UID').Price.mean()

        self.clients['FavouriteDay'] = self.transactions.groupby(
            ['UID', 'DayOfWeek']).Price.mean().to_frame().reset_index().sort_values(
            by=['UID', 'Price'], ascending=False).groupby('UID').DayOfWeek.first()

        self.transactions['SberSpasibo'] = 0
        self.transactions.loc[
            self.transactions.Price - self.transactions.GainedCoins / self.transactions.Discount > 10,
            'SberSpasibo'] = 1
        self.clients['SberSpasibo'] = self.transactions.groupby('UID').SberSpasibo.any()
        self.clients['SberSpasibo'] = self.clients['SberSpasibo'].astype(int)

        self.numerical_features_names = ['Recency', 'Frequency', 'Monetary_Sum', 'Monetary_Mean', 'SberSpasibo']
        self.categorical_features_names = ['FavouriteDay']

    def make_clients(self):
        """
        Функция для создания датасета клиентов с их характеристиками
        """
        self.clients = self.transactions.groupby('UID').Status.last().to_frame()

        self.set_features()
        self.set_treatment()
        self.set_outcome()

        self.clients.dropna(inplace=True)
        print(f'Количество клиентов {self.clients.shape[0]}')

    def compute_qini(self, cate):
        """
        Функция для вычисления коэффициента Qini
        :param cate - значение cate для клиентов из self.clients
        """
        df = pd.DataFrame([
            cate.ravel(),
            self.treatment.ravel(),
            self.y.ravel()],
            index=['cate', 't', 'y']).T

        sorted_df = df.sort_values('cate', ascending=False).reset_index(drop=True)
        sorted_df.index = sorted_df.index + 1
        sorted_df["cumsum_tr"] = sorted_df['t'].cumsum()
        sorted_df["cumsum_ct"] = sorted_df.index.values - sorted_df["cumsum_tr"]
        sorted_df["cumsum_y_tr"] = (sorted_df['y'] * sorted_df['t']).cumsum()
        sorted_df["cumsum_y_ct"] = (sorted_df['y'] * (1 - sorted_df['t'])).cumsum()
        qini = (sorted_df["cumsum_y_tr"] - sorted_df["cumsum_y_ct"] * sorted_df["cumsum_tr"] / sorted_df["cumsum_ct"])
        return qini.sum()

    def set_model(self):
        """
        Функция для создания оптимальной модели
        """
        if self.validation_type == 'PredefinedHyperparameters':
            self.model = XLearner(
                models=XGBRegressor(n_estimators=self.best_params_default['n_estimators'],
                                    max_depth=self.best_params_default['max_depth'],
                                    learning_rate=self.best_params_default['lr']))
        elif self.validation_type == 'FastValidation':
            best_qini = 0
            for n_estimators in [200, 400, 600]:
                for max_depth in [20, 40, 60]:
                    for lr in [1e-3, 1e-2, 1e-1]:
                        learner = XLearner(models=XGBRegressor(n_estimators=n_estimators,
                                                               max_depth=max_depth,
                                                               learning_rate=lr))
                        learner.fit(X=self.X, T=self.treatment, Y=self.y)
                        cate = learner.effect(self.X)
                        qini = self.compute_qini(cate)
                        if qini > best_qini:
                            best_qini = qini
                            self.model = learner

        elif self.validation_type == 'FullValidation':
            best_qini = 0
            for n_estimators in [200, 400, 600]:
                for max_depth in [20, 40, 60]:
                    for lr in [1e-3, 1e-2, 1e-1]:
                        s_learner = SLearner(overall_model=XGBRegressor(n_estimators=n_estimators,
                                                                        max_depth=max_depth,
                                                                        learning_rate=lr))
                        s_learner.fit(X=self.X, T=self.treatment, Y=self.y)
                        cate = s_learner.effect(self.X)
                        s_qini = self.compute_qini(cate)
                        if s_qini > best_qini:
                            best_qini = s_qini
                            self.model = s_learner

                        t_learner = TLearner(models=XGBRegressor(n_estimators=n_estimators,
                                                                 max_depth=max_depth,
                                                                 learning_rate=lr))
                        t_learner.fit(X=self.X, T=self.treatment, Y=self.y)
                        cate = t_learner.effect(self.X)
                        t_qini = self.compute_qini(cate)
                        if t_qini > best_qini:
                            best_qini = t_qini
                            self.model = t_learner

                        x_learner = XLearner(models=XGBRegressor(n_estimators=n_estimators,
                                                                 max_depth=max_depth,
                                                                 learning_rate=lr))
                        x_learner.fit(X=self.X, T=self.treatment, Y=self.y)
                        cate = x_learner.effect(self.X)
                        x_qini = self.compute_qini(cate)
                        if x_qini > best_qini:
                            best_qini = x_qini
                            self.model = x_learner

    def fit(self):
        """
        Build the design matrix from ``self.clients`` and train the model.

        Categorical features are one-hot encoded and numerical features are standardised;
        the result is stored in ``self.X``. The model is then chosen by ``set_model`` and
        fitted on ``self.X``, ``self.treatment`` and ``self.y``.
        """
        feature_columns = self.numerical_features_names + self.categorical_features_names
        self.X = self.clients[feature_columns]
        self.y = self.clients["outcome"]
        self.treatment = self.clients["TreatmentStatus"]

        encoding_step = ('ohe', OneHotEncoder(handle_unknown='ignore'), self.categorical_features_names)
        scaling_step = ('scaling', StandardScaler(), self.numerical_features_names)
        preprocessor = ColumnTransformer([encoding_step, scaling_step])
        self.X = preprocessor.fit_transform(self.X)

        self.set_model()
        print("Model is set")

        self.model.fit(X=self.X, T=self.treatment, Y=self.y)
        print("Model is fitted")

    def predict(self):
        """
        Формирования итогового списка внушаемых клиентов
        """
        cate = self.model.effect(self.X)
        print('CATE is estimated') 

        #print('Qini: ', self.compute_qini(cate))
        self.clients['CATE'] = cate
        receptive_clients = self.clients[self.clients['CATE'] > self.tol]['CATE']
        if self.max_clients is not None:
          receptive_clients = receptive_clients.iloc[:self.max_clients]
        receptive_clients = receptive_clients.to_frame().reset_index().sort_values(by='CATE', ascending=False)
        #print(receptive_clients.shape)
        receptive_clients.to_csv('receptive_clients.csv', index=False)
        
        return receptive_clients

In [3]:
import numpy as np
import gradio as gr

def fn(dataset_file, treatment_radio, column_text, aggregation_radio, outcome_radio, validation_radio, max_clients, cate_tol_text):
    """
    Submit handler: run the whole pipeline on the uploaded file.

    The UI values are passed to TreatmentEstimator in the positional order it expects;
    the stages run in the order preprocess -> make_clients -> fit -> predict.

    :return: name of the CSV file with the receptive clients
    """
    ui_settings = [
        dataset_file,
        treatment_radio,
        column_text,
        aggregation_radio,
        outcome_radio,
        validation_radio,
        max_clients,
        cate_tol_text,
    ]
    estimator = TreatmentEstimator(ui_settings)
    for stage in (estimator.preprocess, estimator.make_clients, estimator.fit, estimator.predict):
        stage()

    return 'receptive_clients.csv'

"""def fn():
  df = pd.DataFrame(columns=['Client', 'CATE'])
  df.loc[0] = ['e7617eb3-ee56-11ec-80dd-22145dv23e20', 323.515]
  df.loc[1] = ['e2312ty4-rr35-44sf-09gd-12345df56s32', 217.923]
  return df"""

def change_column_and_aggr(choice):
    """Show the target component only when the custom transaction treatment is selected."""
    is_custom_treatment = choice == "ArbitraryTransactionTreatment"
    return gr.update(visible=True) if is_custom_treatment else gr.update(visible=False)

def max_clients_checkbox_show(choice):
    """
    Show the max-clients text box while the "limit" checkbox is ticked.

    When the checkbox is cleared the box is hidden AND emptied: a hidden box still submits
    its value, so a stale number would keep limiting the result although the user has
    switched the limit off.

    :param choice: state of the checkbox
    :return: update for the max-clients text box
    """
    if choice:
        return gr.update(visible=True)
    return gr.update(visible=False, value='')

def update_time_info(choice):
    """Show the estimated running time for the selected validation type."""
    known_estimates = {
        "PredefinedHyperparameters": "Процесс займёт ~10 минут",
        "FastValidation": "Процесс займёт ~2.5 часа",
    }
    # Every other choice gets the estimate of the slowest mode.
    message = known_estimates.get(choice, "Процесс займёт ~7 часов")
    return gr.update(value=message, visible=True)

# UI layout: inputs for the estimator settings, a file output and the submit button.
with gr.Blocks() as demo:
    # The description is written without leading indentation (Markdown renders lines
    # indented by four spaces as a code block) and without the line-break hyphenation
    # left over from the original text.
    gr.Markdown(
        "# ML-based Marketing\n\n"
        "В данной работе наша команда поставила перед собой задачу написать программу, "
        "способную выделять из всех клиентов компании восприимчивую к маркетинговым "
        "продуктам аудиторию, чтобы впоследствии использовать эти данные при разработке "
        "и проведении маркетинговой стратегии"
    )
    dataset_file = gr.File(label='CSV file')
    treatment_radio = gr.Radio(
        ["StatusAscending", "StatusDescending", "ArbitraryTransactionTreatment"], label="Treatment Type"
    )
    # Column name and aggregation method are shown only for the custom treatment.
    column_text = gr.Textbox(label='ColumnName', visible=False, interactive=True)
    aggregation_radio = gr.Radio(
        ["Any", "Mode", "All"], label="Aggregation Method", visible=False, interactive=True
    )
    treatment_radio.change(fn=change_column_and_aggr, inputs=treatment_radio, outputs=column_text)
    treatment_radio.change(fn=change_column_and_aggr, inputs=treatment_radio, outputs=aggregation_radio)
    outcome_radio = gr.Radio(
        ["Conversion", "Frequency", "Monetary"], label="Outcome Type"
    )
    validation_radio = gr.Radio(
        ["PredefinedHyperparameters", "FastValidation", "FullValidation"], label="Validation Type"
    )
    # Markdown is a display-only component, so no `interactive` flag is passed to it.
    time_info = gr.Markdown("", visible=False)
    validation_radio.change(fn=update_time_info, inputs=validation_radio, outputs=time_info)
    max_clients_checkbox = gr.Checkbox(label="Ограничивать число клиентов")
    max_clients = gr.Textbox(label='Max clients', value=None, visible=False)
    max_clients_checkbox.change(fn=max_clients_checkbox_show, inputs=max_clients_checkbox, outputs=max_clients)
    cate_tol_text = gr.Textbox(label='CATE tolerance', value='0')

    #model_output = gr.Dataframe(headers=['Client', 'CATE'])
    model_output = gr.File(label='Output file')

    submit_button = gr.Button("Submit")
    submit_button.click(fn, inputs=[dataset_file, treatment_radio, column_text, aggregation_radio, outcome_radio, validation_radio, max_clients, cate_tol_text], outputs=model_output)

demo.launch(share=False) #debug=True,

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Датасет начинается 15 Января и заканчивается 14 Апреля включительно
В датасете 2267437 строк и 8 столбцов
Из них 4 численных и 4 категориальных
Index(['UID', 'OperationType', 'Price', 'SpentCoins', 'GainedCoins', 'Point',
       'Agent', 'Status'],
      dtype='object')
Найдены пропуски в колонках SpentCoins GainedCoins
Пропуски заполнены нулями
Количество клиентов 256419
Model is set
Model is fitted
CATE is estimated
Qini:  10361587286310.959
(20, 2)


In [None]:
# Stray scratch input: `fsdm` is not defined anywhere in the notebook and raised NameError
# on Restart & Run All, so it is disabled.
# fsdm