In [1]:
!pip install pyarrow
!pip install fastparquet



In [2]:
import datetime
import warnings

import numpy as np
import pandas as pd


def set_up_printing():
    """Configure pandas display limits and silence Python warnings.

    Raises the row/column limits used when rendering frames and
    ``DataFrame.info()`` to 1000, removes the column-width and line-width
    caps, and installs a global ``'ignore'`` warnings filter.
    """
    limit = 1000

    display_options = {
        'display.max_columns': limit,
        'display.max_colwidth': None,
        'display.max_rows': limit,
        'display.width': None,
        'display.max_info_columns': limit,
        'display.max_info_rows': limit,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

    # Applies process-wide: every warning category is suppressed from here on.
    warnings.filterwarnings('ignore')


def log_status(msg, is_log_enabled):
    """Print ``msg`` prefixed with the current local time, if logging is on.

    Args:
        msg: Value to print after the timestamp.
        is_log_enabled: When falsy, nothing is printed.
    """
    if not is_log_enabled:
        return

    timestamp = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(timestamp, msg)


def _split_rows(dataframe, sections):
    """Cut ``dataframe`` into consecutive row slices, as ``np.array_split`` would."""
    # Split the row positions rather than the frame itself:
    # ``np.array_split(DataFrame, n)`` relies on ``DataFrame.swapaxes``,
    # which recent pandas versions deprecate.
    chunks = np.array_split(np.arange(len(dataframe)), sections)
    return [
        dataframe.iloc[chunk[0]:chunk[-1] + 1] if len(chunk) else dataframe.iloc[:0]
        for chunk in chunks
    ]


def groupby_then_apply(dataframe, key_col_name, func, part_size=100, encode_columns=('activity',)):
    """One-hot encode and aggregate ``dataframe`` part by part.

    The frame is cut into consecutive row slices. In every slice the rows that
    share the key of the slice's last row are held back and prepended to the
    next slice, so a key group lying on a slice boundary is handed to ``func``
    in one piece. Only the trailing key is carried over, so rows of one key
    must be adjacent in ``dataframe`` (for example, sorted by the key column);
    otherwise the same key is passed to ``func`` more than once.

    Args:
        dataframe: Input frame containing ``key_col_name`` and every column
            listed in ``encode_columns``.
        key_col_name: Name of the grouping key column.
        func: Callable that receives an encoded part and returns a frame.
        part_size: Despite the name, this is the NUMBER of parts the frame is
            split into (it is forwarded to ``np.array_split`` as the section
            count), not the number of rows per part.
        encode_columns: Column name(s) to one-hot encode with
            ``pd.get_dummies(..., sparse=True)``.

    Returns:
        ``pd.concat`` of all ``func`` results, or an empty ``DataFrame`` when
        there was nothing to process.

    Raises:
        ValueError: If ``part_size`` is not positive (raised by
            ``np.array_split``).
    """
    if isinstance(encode_columns, str):
        encode_columns = [encode_columns]
    else:
        encode_columns = list(encode_columns)

    parts = _split_rows(dataframe, part_size)
    total = len(parts)

    log_status('всего задач для процессинга:  ' + str(total), True)

    results = []
    # Rows of the last key seen so far; its group may continue in the next part.
    carry = dataframe.iloc[:0]

    log_status('запуск цикла обработки по частям', True)

    # Per-task logging is re-enabled once more than this share of all tasks
    # has run since the previous logged task.
    log_fraction = 0.003
    since_last_log = 0
    verbose = True

    for task_no, part in enumerate(parts, start=1):

        if not verbose and since_last_log / total > log_fraction:
            verbose = True
            # Task ``task_no`` is only about to start: task_no - 1 are finished.
            log_status('.. выполнено ' + str(task_no - 1) + ' из ' + str(total) + ' задач', verbose)
            since_last_log = 0

        log_status('обработка задачи ' + str(task_no), verbose)

        if len(carry):
            part = pd.concat((carry, part))

        if len(part):
            last_key = part[key_col_name].iloc[-1]
            is_tail = part[key_col_name] == last_key
            part, carry = part[~is_tail], part[is_tail]

        # A part can be empty when the slice was empty or when every row
        # belongs to the carried-over key; there is nothing to hand to func.
        if len(part):
            log_status('размерность задачи до one hot encoding: ' + str(part.shape), verbose)
            part = pd.get_dummies(part, columns=encode_columns, sparse=True)
            log_status('размерность задачи после one hot encoding: ' + str(part.shape), verbose)

            results.append(func(part))

        since_last_log += 1
        verbose = False

    # The group of the very last key is never emitted inside the loop.
    if len(carry):
        log_status('обработка последней задачи вне тела цикла', True)
        carry = pd.get_dummies(carry, columns=encode_columns, sparse=True)
        results.append(func(carry))

    if not results:
        return pd.DataFrame()

    return pd.concat(results)


def agg(part):
    """Sum every non-key column of ``part`` per ``user_id``.

    The value columns are selected by name, so the result does not depend on
    where ``user_id`` sits among the columns.

    Args:
        part: Frame with a ``user_id`` column plus the columns to sum.

    Returns:
        Frame indexed by ``user_id`` (in order of first appearance, since
        ``sort=False``) holding the per-user sums.
    """
    value_columns = [name for name in part.columns if name != 'user_id']
    return part.groupby('user_id', sort=False)[value_columns].sum()


# Configure pandas display options for console output.
set_up_printing()

# Locations of the raw events and of the resulting user vectors.
SOURCE_PATH = r'C:\Users\M_N_K\kosta_2023_dataton\train_mfti.parquet'
VECTORS_PATH = r'C:\Users\M_N_K\kosta_2023_dataton\KNN_users_vectors_min.parquet'
# A user is kept only with strictly more than this many events.
MIN_EVENTS_PER_USER = 30
# Approximate number of rows handed to one processing task.
ROWS_PER_PART = 1000

log_status('чтение и очистка данных', True)

# Read the raw events and drop exact duplicate rows.
dataframe = pd.read_parquet(SOURCE_PATH)

dataframe = dataframe.drop_duplicates()

dataframe = dataframe[dataframe['event_type'].isin(['click_phone', 'click_response'])]

# Fill missing user ids from the cookie id BEFORE counting events per user.
# Grouping skips rows whose key is NaN, so filtering first would already have
# discarded every row the fill is meant to rescue.
dataframe = dataframe.assign(user_id=dataframe['user_id'].fillna(dataframe['cookie_id']))

# Keep users with enough events. Rows whose id is still missing map to NaN
# and are dropped by the comparison, as the group-wise filter did before.
events_per_user = dataframe['user_id'].map(dataframe['user_id'].value_counts())
dataframe = dataframe[events_per_user > MIN_EVENTS_PER_USER]

dataframe = dataframe.assign(
    activity=dataframe['vacancy_id_'].astype(str) + '_' + dataframe['event_type'].astype(str)
)
dataframe = dataframe[['user_id', 'activity']]

# groupby_then_apply needs the rows of one user to be adjacent.
dataframe = dataframe.sort_values(by='user_id')

log_status('обработка по частям', True)

print('\n')
dataframe.info()
print('\n')

results = groupby_then_apply(
    dataframe,
    key_col_name='user_id',
    func=agg,
    # part_size is the number of parts; never request zero parts for small inputs.
    part_size=max(1, dataframe.shape[0] // ROWS_PER_PART)
)

# Activities a user never had are NaN after concatenating the parts.
results = results.sparse.to_dense().fillna(0)

log_status('размерность итогового набора после векторизации: ' + str(results.shape), True)

log_status('запись в файл', True)

results.to_parquet(VECTORS_PATH, index=True)

log_status('результат записан в файловую систему', True)

30/04/2023 01:40:13 чтение и очистка данных
30/04/2023 01:40:27 обработка по частям


<class 'pandas.core.frame.DataFrame'>
Int64Index: 27310 entries, 11715796 to 11775669
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   object
 1   activity  object
dtypes: object(2)
memory usage: 640.1+ KB


30/04/2023 01:40:27 всего задач для процессинга:  27
30/04/2023 01:40:27 запуск цикла обработки по частям
30/04/2023 01:40:27 обработка задачи 1
30/04/2023 01:40:27 размерность задачи до one hot encoding: (1006, 2)
30/04/2023 01:40:27 размерность задачи после one hot encoding: (1006, 789)
30/04/2023 01:40:29 .. выполнено 2 из 27 задач
30/04/2023 01:40:29 обработка задачи 2
30/04/2023 01:40:29 размерность задачи до one hot encoding: (989, 2)
30/04/2023 01:40:29 размерность задачи после one hot encoding: (989, 779)
30/04/2023 01:40:31 .. выполнено 3 из 27 задач
30/04/2023 01:40:31 обработка задачи 3
30/04/2023 01:40:31 размерность задачи до one hot encoding:

In [3]:
# Reload the saved user vectors to check what was written to disk.
vectors_path = r'C:\Users\M_N_K\kosta_2023_dataton\KNN_users_vectors_min.parquet'
check_df = pd.read_parquet(vectors_path)
check_df.info(max_cols=100)

# Preview the first ten rows of the first ten columns.
preview_columns = check_df.columns.tolist()[:10]
check_df[preview_columns].head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 582 entries, 0066831bd3cf418ba2714b09de355389 to ff86f10a411d478da7e0653319c747b0
Columns: 14698 entries, activity_100285_click_response to activity_258325_click_phone
dtypes: float64(14698)
memory usage: 65.3+ MB


Unnamed: 0_level_0,activity_100285_click_response,activity_100501_click_response,activity_101097_click_phone,activity_101097_click_response,activity_101462_click_response,activity_101487_click_response,activity_101650_click_response,activity_101687_click_response,activity_101822_click_phone,activity_101943_click_response
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0066831bd3cf418ba2714b09de355389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00c841230129452eada20720659ae3c0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
010e886447c14fb191a497df20713932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0138f1289dcc442eb37624a6950d6178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01860ee97b034f9bae74ed183110bc2f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
01b6c1228ed645faa5145785392e8296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01df285b34cb439eac90d0fee9c0b60f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
02911d50ee4b4f938daa378ec05128d3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
048d3eb16c2e41379d42376565b8e7b4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
049235e0c99f49ed8043bdbca8c802e1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
