In [1]:
import os
import random
import numpy as np
import pandas as pd
import typing as tp
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from catboost import CatBoostClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix,  accuracy_score, roc_auc_score

warnings.filterwarnings("ignore")

def set_seed(seed):
    """Seed the global RNGs used by this notebook.

    Args:
        seed: Value passed to ``np.random.seed`` and ``random.seed`` and
            exported, as a string, through ``PYTHONHASHSEED``.
    """
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})

# Hold-out fraction intended for the train/validation split.
TEST_SIZE = 0.15
# Seed NumPy / random once, before any stochastic step runs.
set_seed(560)



In [2]:
# Input / output locations, relative to the notebook's working directory.
TRAIN_PATH = 'data/train_data.pqt'
TEST_PATH = 'data/test_data.pqt'
SAMPLE_SUBM_PATH = 'data/sample_submission.csv'
RESULT_PATH = 'data/top_g_segment.csv'  # read back in the blending cells near the end
MODEL_PATH = 'data/agg_feats.cbm'  # only referenced from a commented-out load_model call
WEIGHTS_PATH = 'data/cluster_weights.xlsx'

# Former Google Drive locations for cached, preprocessed frames (disabled).
# TRAIN_PROCESS_PATH = 'drive/MyDrive/Contests/IT_Purple_Hack/process_data/train_data.pqt'
# VAL_PROCESS_PATH = 'drive/MyDrive/Contests/IT_Purple_Hack/process_data/val_data.pqt'
# TEST_PROCESS_PATH = 'drive/MyDrive/Contests/IT_Purple_Hack/process_data/test_data.pqt'

# Raw training frame; later cells mutate and rebind this name.
train = pd.read_parquet(TRAIN_PATH)

In [3]:
# Replace every missing value with 0, in place. This covers all columns, so
# missing entries in non-numeric columns also become the integer 0.
train.fillna(0, inplace=True)

In [4]:
def generate_features(
        train_dataset: pd.DataFrame,
        num_features: tp.List[tp.Tuple[str, str]],
        test_dataset = None,
        target_col: str = 'end_cluster',
):
    """Add linear-regression residual features for pairs of numeric columns.

    For each ``(feat_1, feat_2)`` pair a ``LinearRegression`` of ``feat_2`` on
    ``feat_1`` is fit on the training frame. The residual
    ``feat_2 - prediction`` is stored as
    ``reg_dist_{value}_{feat_2}_{feat_1}`` for every unique ``value`` of
    ``target_col``.

    NOTE(review): the regression is fit on all training rows and does not
    depend on ``value``, so the columns produced for different values are
    identical copies. If per-cluster regressions were intended, the fit should
    be restricted to ``train_df[target_col] == value``. The existing output is
    kept here; the fit is only done once per pair instead of once per value.

    Args:
        train_dataset: Training frame; not modified (a copy is returned).
        num_features: ``(feat_1, feat_2)`` column-name pairs.
        test_dataset: Optional frame that receives the same features using the
            regressions fit on the training frame.
        target_col: Column whose unique values name the generated features.

    Returns:
        ``(train_df, test_df)``; ``test_df`` is ``None`` when ``test_dataset``
        is not a DataFrame.
    """
    train_df = train_dataset.copy()
    has_test = isinstance(test_dataset, pd.DataFrame)
    test_df = test_dataset.copy() if has_test else None

    target_values = train_df[target_col].unique()

    # Residuals are loop-invariant with respect to the target value, so they
    # are computed once per pair. Nothing is fit when there are no values.
    residuals = []
    for feat_1, feat_2 in (num_features if len(target_values) else []):
        reg = LinearRegression().fit(train_df[[feat_1]], train_df[[feat_2]])
        train_resid = train_df[feat_2].values - reg.predict(train_df[[feat_1]]).reshape(-1)
        test_resid = None
        if has_test:
            test_resid = test_df[feat_2] - reg.predict(test_df[[feat_1]]).reshape(-1)
        residuals.append((feat_1, feat_2, train_resid, test_resid))

    # Same column names and insertion order as before: value-major, pair-minor.
    for value in target_values:
        for feat_1, feat_2, train_resid, test_resid in residuals:
            name = f'reg_dist_{value}_{feat_2}_{feat_1}'
            train_df[name] = train_resid
            if has_test:
                test_df[name] = test_resid

    return train_df, test_df

In [5]:
import typing as tp

def generate_agg_features(
    dataset: pd.DataFrame,
    feats_to_agg_list: tp.List[str],
    feats_by_agg_list: tp.List[str],
    build_for_train: bool = True,
    agg_by_3rd_month: bool = True,
    agg_func_list: tp.Optional[tp.List[str]] = None,
    test_dataset= None
) -> pd.DataFrame:
    """Attach group-level aggregates of numeric features to a frame.

    Train mode (``build_for_train=True``): for every grouping column in
    ``feats_by_agg_list`` the columns in ``feats_to_agg_list`` are aggregated
    with ``agg_func_list`` and left-merged onto a copy of ``dataset`` as
    ``agg_{group}_{feature}_{func}``. With ``agg_by_3rd_month`` only rows with
    ``date == 'month_3'`` feed the aggregation.

    Test mode (``build_for_train=False``): ``dataset`` must already carry the
    ``agg_*`` columns. They are collapsed to one row per group with ``max``
    and left-merged onto a copy of ``test_dataset``. ``feats_to_agg_list`` and
    ``agg_func_list`` are not used in this mode.

    Args:
        dataset: Source frame for the aggregates.
        feats_to_agg_list: Columns to aggregate (train mode).
        feats_by_agg_list: Grouping columns.
        build_for_train: Selects train or test mode.
        agg_by_3rd_month: Restrict the aggregation to month_3 rows.
        agg_func_list: Aggregation names; defaults to ``['max', 'mean']``.
        test_dataset: Frame to enrich in test mode.

    Returns:
        The enriched copy of ``dataset`` (train mode) or ``test_dataset``.

    Raises:
        ValueError: In test mode when ``test_dataset`` is not provided.
    """
    if build_for_train:
        # None stands in for the former mutable list default.
        agg_funcs = ['max', 'mean'] if agg_func_list is None else agg_func_list
        to_agg = dataset[dataset.date == 'month_3'] if agg_by_3rd_month else dataset
        train_df = dataset.copy()
        for agg_feat in feats_by_agg_list:
            grouped = to_agg.groupby(agg_feat)[feats_to_agg_list].agg(agg_funcs).reset_index()
            # Flatten the (feature, func) MultiIndex; the key column keeps its name.
            grouped.columns = [
                col[0] if col[1] == '' else 'agg_{}_{}_{}'.format(agg_feat, col[0], col[1])
                for col in grouped.columns
            ]
            train_df = pd.merge(train_df, grouped, on=agg_feat, how='left')
        return train_df

    if test_dataset is None:
        raise ValueError("test_dataset is required when build_for_train=False")

    test_df = test_dataset.copy()
    for agg_feat in feats_by_agg_list:
        # `like` is a substring match, so one grouping name that is a prefix of
        # another (e.g. 'city' / 'city_type') would pick up both sets of columns.
        subset_cols = list(dataset.filter(like=('agg_' + agg_feat)).columns) + [agg_feat]
        # The aggregates are constant within a group, so max() only de-duplicates.
        grouped = dataset[subset_cols].groupby(agg_feat).max()
        test_df = pd.merge(test_df, grouped, on=agg_feat, how='left')
    return test_df


In [6]:
def ohe_start_cluster(train_dataset: pd.DataFrame, test_dataset: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the products listed in ``start_cluster``.

    The binarizer vocabulary is learned from ``train_dataset`` and applied to
    a copy of ``test_dataset``. One ``start_cluster_product_<name>`` column is
    added per learned product.

    Returns:
        Copy of ``test_dataset`` with the indicator columns appended.
    """

    def _to_product_lists(clusters: pd.Series) -> pd.Series:
        # "{a, b}" -> ["a", "b"]: strip the braces, drop spaces, split on commas.
        unbraced = clusters.str.lstrip("{").str.rstrip("}")
        return unbraced.str.replace(" ", "").str.split(',')

    encoded = test_dataset.copy()

    # Set up the MultiLabelBinarizer and learn the product vocabulary.
    binarizer = MultiLabelBinarizer()
    binarizer.fit(_to_product_lists(train_dataset['start_cluster']))

    indicator_matrix = binarizer.transform(_to_product_lists(encoded['start_cluster']))
    for position, product_name in tqdm(enumerate(binarizer.classes_)):
        encoded['start_cluster_product_' + product_name] = indicator_matrix[:, position]

    return encoded

In [7]:
def make_lags(dataset: pd.DataFrame,
              columns_not_to_use,
              drop_months: bool=False,
              lags: bool=True) -> pd.DataFrame:
    """Add per-``id`` lag features and optionally keep only the target months.

    For every column not listed in ``columns_not_to_use`` two columns are
    added: ``<col>_lag1`` and ``<col>_lag2``, i.e. the value one and two rows
    earlier within the same ``id`` group.

    NOTE(review): ``shift`` follows the existing row order inside each ``id``;
    this assumes rows are already in chronological order — TODO confirm.

    Args:
        dataset: Input frame with ``id`` and ``date`` columns; not modified.
        columns_not_to_use: Columns that must not be lagged.
        drop_months: When True, rows for month_1, month_2, month_4 and month_5
            are removed after the lags are built.
        lags: When False, no lag columns are created.

    Returns:
        A new frame with the lag columns and/or the reduced set of rows.
    """
    result = dataset.copy()

    if lags:
        # Group once on the untouched input rather than twice per column on the
        # growing result; the lagged values are the same.
        by_id = dataset.groupby('id')
        for column in tqdm(dataset.columns):
            if column not in columns_not_to_use:
                result[column + '_lag1'] = by_id[column].shift(1)
                result[column + '_lag2'] = by_id[column].shift(2)

    if drop_months:
        # Filter with a positional boolean mask. Dropping by index label would
        # also remove wanted rows whenever index labels repeat.
        history_months = ['month_1', 'month_2', 'month_4', 'month_5']
        keep_mask = ~result['date'].isin(history_months).to_numpy()
        result = result.loc[keep_mask].copy()

    return result

### Split

In [8]:
# Numeric columns to aggregate, the grouping columns, and the statistics to compute.
feats_to_agg = ['balance_amt_min', 'balance_amt_max', 'sum_cred_e_oper_3m', 'sum_deb_h_oper_3m', 'sum_of_paym_2m', 'sum_of_paym_1y']
feats_by_agg = ['okved', 'city', 'channel_code']
agg_func = ['max', 'mean', 'min', 'median', 'count']
# The aggregates are built on the full `train` frame, before the train/validation
# split in the next cell, so validation rows contribute to them.
# This rebinds `train`: re-running the cell merges the aggregates a second time.
train = generate_agg_features(
    train,
    feats_to_agg,
    feats_by_agg,
    agg_func_list=agg_func,
)

In [9]:
from sklearn.model_selection import train_test_split

# One (id, end_cluster) row per client, taken from month_3, so the split is
# done on clients and stratified by their target cluster.
ids_and_clusters = train[train['date'] == 'month_3'][['id', 'end_cluster']].drop_duplicates()
train_ids, test_ids = train_test_split(
    ids_and_clusters['id'],
    stratify=ids_and_clusters['end_cluster'],
    test_size=TEST_SIZE,  # configured once in the setup cell instead of repeating 0.15
    random_state=560,
    shuffle=True,
)

# Independent copies: later cells can modify them without touching `train`
# or triggering chained-assignment warnings.
dataset_train = train[train['id'].isin(train_ids)].copy()
dataset_val = train[train['id'].isin(test_ids)].copy()

In [10]:
# One-hot encode start_cluster on the full training frame.
train_ohe = ohe_start_cluster(train, train)
# Columns excluded from lagging: the group aggregates plus the key and target columns.
# `not_to_lag` is reused by the train / validation / test cells below.
not_to_lag = list(train.filter(like='agg_').columns) + ['id', 'date', 'end_cluster']
# NOTE(review): train_lagged is not referenced again in the visible notebook.
train_lagged = make_lags(train_ohe, columns_not_to_use=not_to_lag, drop_months=True)

0it [00:00, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

### Train dataset

In [11]:
# Training split: learn the product vocabulary from, and encode, the same frame.
X_train_ohe = ohe_start_cluster(dataset_train, dataset_train)
# Add lag1/lag2 columns and drop the month_1/2/4/5 rows.
X_train_lagged = make_lags(X_train_ohe, columns_not_to_use=not_to_lag, drop_months=True, lags=True)

0it [00:00, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

In [12]:
# Shapes after each stage: split frame -> one-hot encoded -> lagged.
for stage_frame in (dataset_train, X_train_ohe, X_train_lagged):
    print(stage_frame.shape)

(510000, 183)
(510000, 196)
(170000, 402)


In [13]:
# Spot-check start_cluster, one of its one-hot columns and their lags on the first rows.
X_train_lagged[['id', 'date', 'start_cluster', 'start_cluster_lag1', 'start_cluster_lag2', 'start_cluster_product_α', 'start_cluster_product_α_lag1', 'start_cluster_product_α_lag2']].head(10)

Unnamed: 0,id,date,start_cluster,start_cluster_lag1,start_cluster_lag2,start_cluster_product_α,start_cluster_product_α_lag1,start_cluster_product_α_lag2
2,0,month_3,"{α, γ}","{α, γ}","{α, γ}",1,1.0,1.0
5,1,month_3,{other},{other},{other},0,0.0,0.0
8,2,month_3,{α},{α},{α},1,1.0,1.0
11,3,month_3,{α},{α},{α},1,1.0,1.0
14,4,month_3,{α},{α},{α},1,1.0,1.0
17,5,month_3,{α},{},{},1,0.0,0.0
20,6,month_3,"{α, γ}","{α, γ}","{α, γ}",1,1.0,1.0
23,7,month_3,{α},{α},{α},1,1.0,1.0
26,8,month_3,"{α, γ}","{α, γ}","{α, γ}",1,1.0,1.0
29,9,month_3,{α},{α},{α},1,1.0,1.0


In [15]:
# Separate the target column from the feature matrix.
y_train = X_train_lagged['end_cluster']
X_train = X_train_lagged.drop(columns=['end_cluster'])

In [16]:
# 0/1 flags for whether start_cluster differs between two of the three months.
# The *_lag2 / *_lag1 / unlagged columns hold the 1st / 2nd / 3rd month value.
cluster_change_pairs = {
    'changed_m1_m2': ('start_cluster_lag2', 'start_cluster_lag1'),
    'changed_m2_m3': ('start_cluster_lag1', 'start_cluster'),
    'changed_m1_m3': ('start_cluster_lag2', 'start_cluster'),
}
for flag_name, (earlier_col, later_col) in cluster_change_pairs.items():
    X_train[flag_name] = (X_train[earlier_col] != X_train[later_col]) * 1

In [17]:
# Number of training rows for which each change flag is set.
X_train[['changed_m1_m2', 'changed_m2_m3', 'changed_m1_m3']].sum()

changed_m1_m2    15426
changed_m2_m3    20185
changed_m1_m3    33163
dtype: int64

### Val dataset

In [18]:
# dataset_val.loc[dataset_val['date'] == 'month_3', 'start_cluster'] = np.nan
# Validation split: the product vocabulary comes from the training split only.
X_val_ohe = ohe_start_cluster(dataset_train, dataset_val)
# Same lagging / month filtering as for the training split.
X_val_lagged = make_lags(X_val_ohe, columns_not_to_use=not_to_lag, drop_months=True, lags=True)

0it [00:00, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

In [19]:
# Shapes after each stage: split frame -> one-hot encoded -> lagged.
for stage_frame in (dataset_val, X_val_ohe, X_val_lagged):
    print(stage_frame.shape)

(90000, 183)
(90000, 196)
(30000, 402)


In [None]:
# X_val_lagged.to_parquet(VAL_PROCESS_PATH)

In [20]:
# Separate the target column from the feature matrix.
y_val = X_val_lagged['end_cluster']
X_val = X_val_lagged.drop(columns=['end_cluster'])

In [21]:
# 0/1 flags for whether start_cluster differs between two of the three months.
# The *_lag2 / *_lag1 / unlagged columns hold the 1st / 2nd / 3rd month value.
cluster_change_pairs = {
    'changed_m1_m2': ('start_cluster_lag2', 'start_cluster_lag1'),
    'changed_m2_m3': ('start_cluster_lag1', 'start_cluster'),
    'changed_m1_m3': ('start_cluster_lag2', 'start_cluster'),
}
for flag_name, (earlier_col, later_col) in cluster_change_pairs.items():
    X_val[flag_name] = (X_val[earlier_col] != X_val[later_col]) * 1

### Train baseline

In [22]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

# Every categorical column followed by its two lagged copies.
cat_cols_lagged = [col + suffix for col in cat_cols for suffix in ('', '_lag1', '_lag2')]

# Only the categorical columns get the 'none' placeholder. Filling the whole
# frame would write a string into any numeric column that has a gap and turn
# it into an object column that cannot be used as a numeric feature.
X_train[cat_cols_lagged] = X_train[cat_cols_lagged].fillna('none').astype("category")
X_val[cat_cols_lagged] = X_val[cat_cols_lagged].fillna('none').astype("category")

In [23]:
# Feature matrix size: target removed, three change flags added.
X_train.shape

(170000, 404)

In [24]:
# Should have the same number of columns as X_train.
X_val.shape

(30000, 404)

In [25]:
# The row key and the month label are not model features.
X_train, X_val = (
    features.drop(columns=["id", "date"]) for features in (X_train, X_val)
)

In [26]:
# Positions of the categorical columns (used by SMOTENC below).
categorical_features_indices = [
    position for position, dtype in enumerate(X_train.dtypes) if dtype == 'category'
]
categorical_columns = X_train.columns[categorical_features_indices]

# Assign by column label so each column is replaced outright. Writing through
# `.iloc[:, idx] = ...` sets values in place and, depending on the pandas
# version, can leave the category dtype unchanged.
X_train[categorical_columns] = X_train[categorical_columns].astype(str)
# Keep the validation frame in the same representation as the training frame.
X_val[categorical_columns] = X_val[categorical_columns].astype(str)

In [29]:
# Target length before oversampling.
y_train.shape

(170000,)

In [None]:
from imblearn.over_sampling import SMOTENC
# SMOTENC is given the categorical column positions computed above.
# Only the training data is resampled; X_val / y_val are left untouched.
sm = SMOTENC(random_state=560, categorical_features=categorical_features_indices,
            sampling_strategy='minority', k_neighbors=3)
# Rebinds X_train / y_train to the resampled data (re-running resamples again).
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# catboost
# Classifier configuration: GPU training with the lagged categorical columns.
params_cat = {
    'cat_features': cat_cols_lagged,
    # 'n_estimators': 5000,
    'iterations': 3000,
    'task_type':'GPU',
    'devices': '3'  # hard-coded GPU device id; adjust for the machine in use
}
model = CatBoostClassifier(**params_cat)
# cat_features is supplied both through the constructor params and here, with the same list.
# The validation split is passed as eval_set.
model.fit(X_train, y_train, cat_features=cat_cols_lagged, eval_set=(X_val, y_val))

In [None]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted mean of per-class one-vs-rest ROC AUC.

    Args:
        y_true: True class labels.
        y_pred: Per-class scores, with columns ordered like ``labels``.
        labels: Class labels; each must be a key of ``weights_dict``.
        weights_dict: Unnormalized weight per class label.

    Returns:
        ``(weighted_auc, per_class_auc)``: the weights are normalized to sum
        to 1 before being applied to the per-class AUC values.
    """
    raw_weights = np.array(list(map(weights_dict.__getitem__, labels)))
    normalized = raw_weights / raw_weights.sum()
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    weighted_auc = sum(normalized * per_class_auc)
    return weighted_auc, per_class_auc


# Prefer the configured WEIGHTS_PATH; fall back to the file in the working
# directory, which is what this cell read before, if that path is absent.
weights_file = WEIGHTS_PATH if os.path.exists(WEIGHTS_PATH) else "cluster_weights.xlsx"
cluster_weights = pd.read_excel(weights_file).set_index("cluster")
# Mapping cluster label -> unnormalized metric weight.
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [None]:
# Class probabilities for the validation split.
y_pred_proba = model.predict_proba(X_val)

# Returns (weighted ROC AUC, per-class ROC AUC) with classes in model.classes_ order.
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

In [None]:
# 0.909668 0.9103184735953871

In [None]:
# Hard class predictions for the validation split.
y_pred = model.predict(X_val)

In [None]:
# Plain accuracy on the validation split.
accuracy_score(y_val, y_pred)

In [None]:
# 0.731066

### Confusion matrix

In [None]:
# Use one explicit label list for both the matrix and the tick labels, so the
# ticks stay correct even if a class occurs only in the predictions.
class_labels = np.unique(
    np.concatenate([np.asarray(y_val).ravel(), np.asarray(y_pred).ravel()])
)
cm = confusion_matrix(np.asarray(y_val).ravel(), np.asarray(y_pred).ravel(), labels=class_labels)

plt.figure(figsize=(17, 17))
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=class_labels,
            yticklabels=class_labels,
            vmin=0, vmax=2000)
plt.xticks(rotation=0)
plt.yticks(rotation=0)
# confusion_matrix puts true labels on rows (y axis) and predictions on
# columns (x axis); the axis titles were previously the other way round.
plt.ylabel('Actual', fontsize=11, rotation=0)
plt.xlabel('Prediction', fontsize=11)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
def plot_feature_importance(importance, names, model_type, top_n=30):
    """Draw a horizontal bar chart of the most important features.

    Args:
        importance: Importance value per feature.
        names: Feature names, in the same order as ``importance``.
        model_type: Label prepended to the chart title.
        top_n: Number of top features to show (default 30, as before);
            ``None`` shows all of them.
    """
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Most important features first, truncated to the requested count.
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False).iloc[:top_n]

    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # A space separates the model label from the caption; it was missing, which
    # rendered e.g. "CATBOOSTFEATURE IMPORTANCE".
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Plot the CatBoost feature importances, labelled with the validation frame's column names.
plot_feature_importance(model.get_feature_importance(), X_val.columns, 'CATBOOST')

In [None]:
# Persist the trained model in CatBoost's binary format under the file name 'smote'.
model.save_model('smote', format="cbm")

In [None]:
# model1 = CatBoostClassifier()
# model1.load_model(MODEL_PATH)

In [None]:
# Inspect the okved column of the training frame.
train['okved']

### Test dataset

In [None]:
# Raw test frame; later cells mutate and rebind this name.
test = pd.read_parquet(TEST_PATH)

In [None]:
# Grouping columns used for the aggregate features.
feats_by_agg

In [None]:
# Training frame, including the agg_* columns that are copied onto the test frame below.
train

In [None]:
# Fill gaps with 0, then copy the agg_* columns already present in `train`
# onto the test frame (test mode of generate_agg_features).
test = generate_agg_features(
    dataset=train,
    feats_to_agg_list=feats_to_agg,
    feats_by_agg_list=feats_by_agg,
    build_for_train=False,
    agg_func_list=agg_func,
    test_dataset=test.fillna(0),
)

In [None]:
# Check the aggregate columns that were merged onto the test frame.
test.filter(like='agg')

In [None]:
def predict_start(
    dataset_df: pd.DataFrame,
    cat_features: tp.List[str],
    task_type: str = 'GPU',
    model = None,
    params = None,
):
    """
    Fill in ``start_cluster`` for month_6 rows.

    A classifier is trained to predict the month_5 ``start_cluster`` from the
    month_5 features plus the month_4 ``start_cluster`` (``prev_month``). It is
    then applied to the month_6 features plus the month_5 ``start_cluster``.

    Args:
        dataset_df: Frame with ``id``, ``date`` and ``start_cluster`` columns
            covering month_4..month_6; not modified.
        cat_features: Categorical feature names. ``start_cluster`` is removed
            and ``prev_month`` appended.
        task_type: CatBoost ``task_type`` for the default parameters.
        model: Optional unfitted classifier to use instead of building one;
            it must be configured for the categorical columns itself.
        params: Optional CatBoost parameters overriding the defaults.

    Returns:
        ``(dataset, model)``: a copy of the input with missing values filled
        with 0 and the month_6 ``start_cluster`` replaced by predictions, and
        the fitted classifier.
    """
    dataset = dataset_df.fillna(0)
    cat_features = [feat for feat in cat_features if feat != 'start_cluster'] + ['prev_month']

    if model is None:
        if not params:
            params = {
                'random_state': 560,
                'iterations': 3000,
                'task_type': task_type,  # was hard-coded to 'GPU', ignoring the argument
                'cat_features': cat_features,
            }
            if task_type == 'GPU':
                params['devices'] = '3'
        model = CatBoostClassifier(**params)

    renames = {'start_cluster_x': 'target', 'start_cluster_y': 'prev_month'}
    prev_cols = ['id', 'start_cluster']
    month_4 = dataset[dataset.date == 'month_4']
    month_5 = dataset[dataset.date == 'month_5']
    month_6 = dataset[dataset.date == 'month_6']

    train_dataset = month_5.merge(month_4[prev_cols], on='id', how='right').rename(columns=renames)
    # Ids present in month_4 but not in month_5 have no target; skip them.
    train_dataset = train_dataset[train_dataset['target'].notna()]

    # A left join keeps exactly the month_6 rows, in their original order, so
    # the predictions line up with the rows they are written back to.
    pred_dataset = month_6.merge(month_5[prev_cols], on='id', how='left').rename(columns=renames)
    # Ids without a month_5 row get the same 0 placeholder as other gaps.
    pred_dataset['prev_month'] = pred_dataset['prev_month'].fillna(0)

    X = train_dataset.drop(["id", "date", "target"], axis=1)
    y = train_dataset["target"]
    model.fit(X, y, verbose=False)

    # Flatten the predictions to one label per row before assigning.
    predicted = np.asarray(model.predict(pred_dataset.drop(["id", "date", "target"], axis=1))).ravel()
    dataset.loc[dataset.date == 'month_6', 'start_cluster'] = predicted
    return dataset, model

In [None]:
# Check which test columns contain 'city_type' in their name.
test.filter(like='city_type')

In [None]:
# Same categorical column list as in the training section, redefined here.
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

# Predict the month_6 start_cluster; the second return value is discarded.
dataset_test, _ = predict_start(test, cat_cols)

In [None]:
# Take over start_cluster (with the month_6 predictions) from the frame returned
# by predict_start; the assignment aligns on the index.
test['start_cluster'] = dataset_test['start_cluster'].copy()

In [None]:
# Fill any remaining gaps with 0, in place.
test.fillna(0, inplace=True)

# Disabled experiment: regression-residual features from generate_features.
# pairs = [('cnt_cred_h_oper_3m', 'sum_cred_h_oper_3m')]
# dataset_train, test = generate_features(train_dataset=dataset_train,
#                                         num_features=pairs,
#                                         test_dataset=test)

In [None]:
# Test frame: the product vocabulary is learned from the full `train` frame here.
X_test_ohe = ohe_start_cluster(train, test)
# Build lags and drop the month_1/2/4/5 rows.
X_test = make_lags(X_test_ohe, columns_not_to_use=not_to_lag, drop_months=True)

In [None]:
# X_test.to_parquet(TEST_PROCESS_PATH)

In [None]:
# Spot-check start_cluster and its lags on the first test rows.
X_test[['id', 'date', 'start_cluster', 'start_cluster_lag1', 'start_cluster_lag2']].iloc[:15]

In [None]:
# Shapes after each stage: test frame -> one-hot encoded -> lagged.
for stage_frame in (test, X_test_ohe, X_test):
    print(stage_frame.shape)

In [None]:
# 0/1 flags for whether start_cluster differs between two of the three months.
# Column names match the ones created for the training and validation frames.
cluster_change_pairs = {
    'changed_m1_m2': ('start_cluster_lag2', 'start_cluster_lag1'),
    'changed_m2_m3': ('start_cluster_lag1', 'start_cluster'),
    'changed_m1_m3': ('start_cluster_lag2', 'start_cluster'),
}
for flag_name, (earlier_col, later_col) in cluster_change_pairs.items():
    X_test[flag_name] = (X_test[earlier_col] != X_test[later_col]) * 1

In [None]:
# Number of test rows for which each change flag is set.
X_test[['changed_m1_m2', 'changed_m2_m3', 'changed_m1_m3']].sum()

In [None]:
# Only the categorical columns get the 'none' placeholder. Filling the whole
# frame would write a string into any numeric column that has a gap and turn
# it into an object column the model cannot read as a number.
X_test[cat_cols_lagged] = X_test[cat_cols_lagged].fillna('none').astype("category")
# The row key and the month label are not model features.
X_test = X_test.drop(columns=["id", "date"])

In [None]:
# Categorical columns (with their lagged copies) passed to the model.
cat_cols_lagged

In [None]:
# Class probabilities for the test rows, one column per class.
test_pred_proba = model.predict_proba(X_test)
test_pred_proba_df = pd.DataFrame(data=test_pred_proba, columns=model.classes_)
# Put the class columns in sorted order; `sorted_classes` is reused when blending.
sorted_classes = sorted(test_pred_proba_df.columns.tolist())
test_pred_proba_df = test_pred_proba_df.loc[:, sorted_classes]

In [None]:
# Save this model's raw class probabilities.
test_pred_proba_df.to_csv('IPA.csv', index=False)

### Submission

In [None]:
# Submission template, read from the path configured at the top of the notebook.
sample_submission_df = pd.read_csv(SAMPLE_SUBM_PATH)

In [None]:
# Inspect the submission template.
sample_submission_df

In [None]:
# Inspect this model's class probabilities.
test_pred_proba_df

In [None]:
# Another set of predictions; in the visible notebook it is only displayed, not blended.
chopa = pd.read_csv('data/gpt_classif.csv')

In [None]:
# Shown again for side-by-side comparison with the frame loaded above.
test_pred_proba_df

In [None]:
# The same class columns, in the same order, from the other prediction file.
chopa[sorted_classes]

In [None]:
# Two further prediction files that are averaged with this model's output below.
top_g_model = pd.read_csv(RESULT_PATH)
top_g_blend = pd.read_csv('data/top_g_blend.csv')

In [None]:
# Equal-weight average of the three prediction sets, written into the template.
# NOTE(review): the frames are combined by index; assumes all of them list the
# same clients in the same row order — confirm.
sample_submission_df[sorted_classes] = (test_pred_proba_df[sorted_classes] + top_g_blend[sorted_classes] + top_g_model[sorted_classes]) / 3
sample_submission_df.to_csv('йорш.csv', index=False)

In [None]:
# Overwrite the blended values in the template with this model's predictions only.
sample_submission_df[sorted_classes] = test_pred_proba_df[sorted_classes]

In [None]:
# Save the current contents of the submission template.
sample_submission_df.to_csv('Cider.csv', index=False)