In [3]:
import datetime
import pandas as pd
import itertools

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from scipy.stats import entropy
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")
from typing import List, NewType
EntropyCategoricalEmbedderObject = NewType('EntropyCategoricalEmbedderObject', object)

In [13]:
def missing_values(df_old):
    missing_values = ((df_old.isna().sum() / len(df_old)) * 100).sort_values(ascending=False)
    missing_values = pd.DataFrame(missing_values)  # датафрейм процент пропущенных значений для каждого признака
    df_missing_drop = missing_values[missing_values[0] > 80]  # датафрейм признаков которые подлежат удалению
    df_missing_mode = missing_values[
        (missing_values[0] < 20) & (
        (missing_values[0] > 0))]  # датафрейм признаков которые подлежат заполнению модой
    df_missing_other = missing_values[
        (missing_values[0] > 20) & ((missing_values[0] < 80))]  # датафрейм признаков кот.
    # подлежат заполению other
    missing_drop = df_missing_drop.index.to_list()
    missing_mode = df_missing_mode.index.to_list()
    missing_other = df_missing_other.index.to_list()

    # списки колонок с пропущенными значениями
    df_old = df_old.drop(columns=missing_drop)

    for column in missing_mode:
        df_old[column] = df_old[column].fillna(df_old[column].mode()[0])

    for column in missing_other:
        df_old[column] = df_old[column].fillna('other')

    return df_old.copy()

# в датасете имеются скрытые пропуски. так как их незначительное количество для упрощения заполним их other

def hidden_gaps(df_gaps):
    list_columns = df_gaps.columns
    for column in list_columns:
        df_gaps.loc[df_gaps[column] == '(not set)', column] = 'other'
        df_gaps.loc[df_gaps[column] == '(none)', column] = 'other'

    return df_gaps.copy()

def normal_device_browser(df_db):

    def normal_str(str_list):
        if str_list[0:9] == 'Instagram':
            str_list = 'Instagram'
        if str_list[0:7] == 'Threads':
            str_list = 'Threads'
        if str_list[0:5] == '[FBAN':
            str_list = 'Facebook'
        if str_list[0:10] == 'helloworld':
            str_list = 'other'

        return str_list

    df_db.device_browser = df_db.device_browser.apply(normal_str)

    return df_db.copy()

def normal_screen_resolution(df_sr):

    def pixels(screen_resolution):
        screen_list = screen_resolution.split('x')
        pixels = int(screen_list[0]) * int(screen_list[1])

        return pixels

    def pixels_range(pixel):
        if pixel == 0:
            resolution = 'other'
        elif (pixel > 0) & (pixel < 300000):
            resolution = 'low'
        elif pixel > 375000:
            resolution = 'high'
        else:
            resolution = 'medium'

        return resolution

    df_sr.loc[df_sr['device_screen_resolution'] == '(not set)', 'device_screen_resolution'] = '0x0'
    df_sr.device_screen_resolution = df_sr.device_screen_resolution.apply(pixels)
    df_sr.device_screen_resolution = df_sr.device_screen_resolution.apply(pixels_range)

    return df_sr.copy()

class EntropyCategoricalEmbedder:
    """Unsupervised categorical embedder based on group counts and entropy calculation

    fit - get dictionary for the transformation of categorical objects into embeddings
    transform - map the dictionary onto your categorical dataset to get the embeddings
    """

    def __init__(self):
        self.substitute_dict = {}  # resulting dictionary to transform the objects into embs

    def __repr__(self):
        return self.__class__.__name__ + "()"

    @staticmethod
    def cat_prep(data: pd.DataFrame) -> pd.DataFrame:

        """change category names for simplification

        format -> category-name_category-name
        """

        data_new = data.copy()
        for col in data.columns:
            data_new[col] = data[col].apply(lambda x: col + '_' + str(x))
        return data_new

    def fit(self, df_train: pd.DataFrame,
            verbose: bool = True) -> EntropyCategoricalEmbedderObject:  # we created this custom type earlier
        """Create dictionary to map on the dataset

        !!!Works only with categorical datasets!!!
        dataset - pandas DataFrame with only categorical columns in str format (after cat_prep)
        (each row is our object to get an embedding for)
        """

        feature_list = list(df_train.columns)
        df = df_train.copy()
        df['id'] = df.index
        for group_key in feature_list:
            passive_keys = feature_list[:]
            passive_keys.remove(group_key)

            category_embedding_mapping = {}
            for passive_key in passive_keys:
                if verbose:
                    print('--- groupby: group_key - ', group_key, '### passive_key - ', passive_key, '---')
                group = df.groupby([group_key, passive_key])['id'].count()
                group = group.unstack().fillna(0)
                entropy_values = group.apply(entropy, axis=1)
                for cat, entropy_value in entropy_values.to_dict().items():
                    if cat in category_embedding_mapping:
                        category_embedding_mapping[cat].extend([entropy_value])
                    else:
                        category_embedding_mapping[cat] = [entropy_value]

            self.substitute_dict[group_key] = category_embedding_mapping
        return self

    def transform(self, dataset: pd.DataFrame,
                  fill_unknown_cat_value: int = 0,
                    verbose: bool = False) -> List[list]:
        """Get embedding for each categorical row of the dataset

        !!!Works only with categorical datasets!!!
        dataset - pandas DataFrame with only categorical columns in str format (after cat_prep)
        (each row is our object to get an embedding for)
        fill_unknown_cat_value - the value to fill embedding vector for unknown categories
        """

        dataset = dataset.copy()
        feature_list = list(dataset.columns)
        emb_size = len(feature_list) - 1
        if verbose:
            print("Mapping vectors to categories...")
        for f in feature_list:
            dataset[f] = dataset[f].map(self.substitute_dict[f])
            dataset[f] = dataset[f].fillna('empty')
            dataset[f] = dataset[f].apply(lambda x: [fill_unknown_cat_value] * emb_size if x == 'empty' else x)

        embeddings_list = []
        if verbose:
            print("Creating an embedding for each row...")
        for row in dataset[feature_list].itertuples():
            embeddings_list.append(list(itertools.chain(*row[1:])))

        return embeddings_list

def encoder_cat(df_enc):
    df_enc = EntropyCategoricalEmbedder.cat_prep(df_enc)
    embedder = EntropyCategoricalEmbedder()
    embedder.fit(df_enc, verbose=False)
    df_feat = embedder.transform(df_enc)

    return df_feat.copy()

def gen_new_feat(df_old):

    def hit_len_path(df_1):
        df_1['hit_len_path'] = df_1['hit_page_path'].apply(lambda x: len(x))
        df_1 = df_1.drop(columns='hit_page_path')

        return df_1.copy()

    def hit_day_hour(df_2):
        df_2['hit_time'] = df_2['hit_time'].fillna(0)
        df_2['hit_time'] = df_2['hit_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
        df_2['hit_day'] = df_2['hit_date'].astype('datetime64').apply(lambda x: x.weekday())
        df_2['hit_hour'] = df_2['hit_time'].apply(lambda x: x.hour)
        df_2 = df_2.drop(columns=['hit_date', 'hit_time'])

        return df_2.copy()

    def visit_hour(df_3):
        df_3['visit_hour'] = df_3['visit_time'].astype('datetime64').apply(lambda x: x.hour)
        df_3 = df_3.drop(columns='visit_time')

        return df_3.copy()

    df_hit_len_path = hit_len_path(df_old)
    df_hit_day_hour = hit_day_hour(df_hit_len_path)
    df_new = visit_hour(df_hit_day_hour)

    return df_new.copy()

def normal_types(df_n):
    df_n['hit_number'] = df_n['hit_number'].astype('int')
    
    return df_n.copy()

In [8]:
%%time
df = pd.read_csv('data/df_key_action.csv')

CPU times: user 7.82 s, sys: 604 ms, total: 8.42 s
Wall time: 9.44 s


In [10]:
df.head()

Unnamed: 0,client_id,visit_time,visit_number,hit_number,hit_page_path,hit_date,hit_time,utm_source,utm_medium,utm_campaign,...,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y
0,1573992000.0,10:00:00,1,4.0,podpiska.sberauto.com/?yclid=5850893405143489196,2021-10-05,,BHcvLfOaCWvWTykYqHVe,cpc,,...,,mobile,,Huawei,,360x780,YaBrowser,Russia,Saint Petersburg,0.0
1,1213626000.0,20:17:53,1,3.0,podpiska.sberauto.com/,2021-10-31,139.0,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,...,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,375x812,Safari,Russia,Nalchik,0.0
2,1436292000.0,18:00:00,1,3.0,podpiska.sberauto.com/,2021-07-12,,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,...,,mobile,,Huawei,,360x720,Android Webview,Russia,Chelyabinsk,0.0
3,1600085000.0,14:38:49,2,23.0,sberauto.com/cars?utm_source_initial=yandex&ut...,2021-11-21,85019.0,BHcvLfOaCWvWTykYqHVe,cpc,,...,ITfrGJNwVsHBGJMAKoct,desktop,Windows,,,1920x1080,Chrome,Russia,Moscow,0.0
4,1286104000.0,21:00:00,8,5.0,sberauto.com/cars/all/mercedes-benz/e-klasse/d...,2021-10-15,,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,...,,mobile,,Apple,,390x844,Safari,Russia,Moscow,0.0


In [11]:
%%time
df_1 = gen_new_feat(df)

CPU times: user 18.4 s, sys: 624 ms, total: 19 s
Wall time: 19 s


In [12]:
df_1.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,...,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573992000.0,1,4.0,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,...,,360x780,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626000.0,1,3.0,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,...,,375x812,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292000.0,1,3.0,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,,mobile,,...,,360x720,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085000.0,2,23.0,BHcvLfOaCWvWTykYqHVe,cpc,,,ITfrGJNwVsHBGJMAKoct,desktop,Windows,...,,1920x1080,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104000.0,8,5.0,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,...,,390x844,Safari,Russia,Moscow,0.0,76,4,19,21


In [14]:
%%time
df_2 = normal_types(df_1) #Добавить!!!!!!!!!!!!!!!!!!!!!!

CPU times: user 141 ms, sys: 43.7 ms, total: 185 ms
Wall time: 182 ms


In [15]:
df_2.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,...,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573992000.0,1,4,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,...,,360x780,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626000.0,1,3,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,...,,375x812,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292000.0,1,3,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,,mobile,,...,,360x720,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085000.0,2,23,BHcvLfOaCWvWTykYqHVe,cpc,,,ITfrGJNwVsHBGJMAKoct,desktop,Windows,...,,1920x1080,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104000.0,8,5,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,...,,390x844,Safari,Russia,Moscow,0.0,76,4,19,21


In [16]:
%%time
df_3 = normal_device_browser(df_2)

CPU times: user 1.12 s, sys: 116 ms, total: 1.24 s
Wall time: 1.24 s


In [17]:
df_3.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,...,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573992000.0,1,4,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,...,,360x780,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626000.0,1,3,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,...,,375x812,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292000.0,1,3,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,,mobile,,...,,360x720,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085000.0,2,23,BHcvLfOaCWvWTykYqHVe,cpc,,,ITfrGJNwVsHBGJMAKoct,desktop,Windows,...,,1920x1080,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104000.0,8,5,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,...,,390x844,Safari,Russia,Moscow,0.0,76,4,19,21


In [18]:
%%time
df_4 = normal_screen_resolution(df_3)

CPU times: user 2.12 s, sys: 176 ms, total: 2.3 s
Wall time: 2.29 s


In [19]:
df_4.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,...,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573992000.0,1,4,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,...,,low,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626000.0,1,3,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,...,,medium,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292000.0,1,3,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,,mobile,,...,,low,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085000.0,2,23,BHcvLfOaCWvWTykYqHVe,cpc,,,ITfrGJNwVsHBGJMAKoct,desktop,Windows,...,,high,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104000.0,8,5,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,...,,medium,Safari,Russia,Moscow,0.0,76,4,19,21


In [20]:
%%time
df_5 = hidden_gaps(df_4)

CPU times: user 3.98 s, sys: 232 ms, total: 4.21 s
Wall time: 4.21 s


In [21]:
df_5.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,...,device_model,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573991529.163342,1,4,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,...,,low,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626005.16357,1,3,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,...,,medium,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292332.16261,1,3,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,,mobile,,...,,low,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085142.163691,2,23,BHcvLfOaCWvWTykYqHVe,cpc,,,ITfrGJNwVsHBGJMAKoct,desktop,Windows,...,,high,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104109.163309,8,5,fDLlAcSmythWSCVMvqvL,other,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,...,,medium,Safari,Russia,Moscow,0.0,76,4,19,21


In [22]:
%%time
df_6 = missing_values(df_5)

CPU times: user 7.13 s, sys: 405 ms, total: 7.53 s
Wall time: 7.51 s


In [23]:
df_6.head()

Unnamed: 0,client_id,visit_number,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_screen_resolution,device_browser,geo_country,geo_city,y,hit_len_path,hit_day,hit_hour,visit_hour
0,1573991529.163342,1,4,BHcvLfOaCWvWTykYqHVe,cpc,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,other,mobile,other,Huawei,low,YaBrowser,Russia,Saint Petersburg,0.0,48,1,19,10
1,1213626005.16357,1,3,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,medium,Safari,Russia,Nalchik,0.0,22,6,19,20
2,1436292332.16261,1,3,geDcueAOghDzHkGMmdOq,cpm,FTjNLDyTrXaWYgZymFkV,WYLajZgbUhGimwBKDZUH,other,mobile,other,Huawei,low,Android Webview,Russia,Chelyabinsk,0.0,22,0,19,18
3,1600085142.163691,2,23,BHcvLfOaCWvWTykYqHVe,cpc,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,ITfrGJNwVsHBGJMAKoct,desktop,Windows,Apple,high,Chrome,Russia,Moscow,0.0,564,6,18,14
4,1286104109.163309,8,5,fDLlAcSmythWSCVMvqvL,other,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,other,mobile,other,Apple,medium,Safari,Russia,Moscow,0.0,76,4,19,21


In [26]:
## УДАЛИТЬ!!! client_id
df_6 = df_6.drop(columns='client_id')

In [29]:
%%time
X = df_6.drop(['y'], axis=1)
y = df_6['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

CPU times: user 2.33 s, sys: 112 ms, total: 2.44 s
Wall time: 2.44 s


In [30]:
%%time
X_train_enc = encoder_cat(X_train)

CPU times: user 2min 26s, sys: 5.73 s, total: 2min 32s
Wall time: 3min 29s


In [31]:
%%time
X_test_enc = encoder_cat(X_test)

CPU times: user 1min 8s, sys: 1.22 s, total: 1min 9s
Wall time: 1min 12s


In [35]:
import numpy as np

In [36]:
np.array(X_train_enc).shape

(924491, 306)

In [37]:
np.array(X_test_enc).shape

(396211, 306)

In [38]:
pipe = Pipeline([('scaler', StandardScaler()),
                ('classifier', MLPClassifier())])

In [39]:
%%time
pipe.fit(X_train_enc, y_train)

ValueError: Unknown label type: (array([0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0], dtype=object),)

In [None]:
%%time
prob = pipe.predict_proba(X_test_enc)
score = roc_auc_score(y_test, prob[:,1])

In [4]:
df = pd.read_csv("data/df_key_action.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320702 entries, 0 to 1320701
Data columns (total 21 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   client_id                 1320702 non-null  float64
 1   visit_time                1320702 non-null  object 
 2   visit_number              1320702 non-null  int64  
 3   hit_number                1320702 non-null  float64
 4   hit_page_path             1320702 non-null  object 
 5   hit_date                  1320702 non-null  object 
 6   hit_time                  569330 non-null   float64
 7   utm_source                1320679 non-null  object 
 8   utm_medium                1320702 non-null  object 
 9   utm_campaign              1197191 non-null  object 
 10  utm_adcontent             1129907 non-null  object 
 11  utm_keyword               575040 non-null   object 
 12  device_category           1320702 non-null  object 
 13  device_os                 5