In [1]:
import pandas as pd
import numpy as np

In [2]:
# Загрузим данные, которые нужно будет объединить.
df_train = pd.read_csv('train', header = None)
df_test = pd.read_csv('test', header = None)

In [3]:
# Посмотрим на train
df_train.head()

Unnamed: 0,0
0,333396 267089 155959 353335 414000 339989 2741...
1,174197 335779 141676 119856 376664 311755 3881...
2,254861 213397 462645 403619 446211 221833 1572...
3,132371 199878 357751 92032 94315 435049 260359...
4,408028 326575 425054 152422 387296 382570 3505...


In [4]:
# Посмотрим на test
df_test.head()

Unnamed: 0,0
0,454758 382341 240893 280388 362253 436737 2576...
1,294661 374689 182272 102025 70927 86844 280488...
2,15296 384304 138151 474141 182935 262020 21145...
3,166224 30240 75272 291190 362083 443361 451254...
4,212560 474729 466936 196474 86487 375368 30801...


In [5]:
# Прочитаем строки из обоих файлов и объединим данные в один файл.
train_data = []
test_data = []

with open('train', 'r') as file:
    for i, str in enumerate(file.readlines()):
        item_id = [int(id) for id in str.split()]
        order = [k for k in range(len(item_id))]
        user_item_order = np.zeros((len(item_id), 3), dtype=np.int32)
        user_item_order[:, 0] = i
        user_item_order[:, 1] = item_id[::-1]
        user_item_order[:, 2] = order
        train_data.append(user_item_order)
                
with open('test', 'r') as file:
    for i, str in enumerate(file.readlines()):
        item_id = [int(id) for id in str.split()]
        order = [k for k in range(len(item_id))]
        user_item_order = np.zeros((len(item_id), 3), dtype=np.int32)
        user_item_order[:, 0] = i + len(train_data)
        user_item_order[:, 1] = item_id[::-1]
        user_item_order[:, 2] = order
        test_data.append(user_item_order)
        
all_data = train_data + test_data       

In [6]:
# Сделаем массив. 
data = np.vstack(all_data)

In [7]:
# Создадим pandas датафрейм.
to_df = {'user': data[:, 0], 'item': data[:, 1], 'order': data[:, 2]}
target_df = pd.DataFrame(data=to_df)

In [8]:
# Посмотрим на первые 60 элементов датафрейма.
target_df.head(60)

Unnamed: 0,user,item,order
0,0,388242,0
1,0,278503,1
2,0,102795,2
3,0,470957,3
4,0,159637,4
5,0,95202,5
6,0,196872,6
7,0,352770,7
8,0,319811,8
9,0,122447,9


In [9]:
# Класс для создания фолдов.

class UsersKFoldPOut():
    def __init__(self, n_folds, p, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed
    
    def split(self, df):
        users = df['user'].unique()
        users_count = len(users)
        print(users_count)
        np.random.seed(self.random_seed)
        np.random.shuffle(users)
        
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=np.int32)
        fold_sizes[: users_count % self.n_folds] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_fold_users = users[start:stop]
            test_mask = df['user'].isin(test_fold_users)
            train_mask = ~test_mask
            del test_mask
            test_mask = (df['user'].isin(test_fold_users)) & (df.groupby('user').cumcount() < self.p)
            yield train_mask, test_mask

In [10]:
# Сделаем 3 фолда.
cv = UsersKFoldPOut(n_folds=3, p=1)

for i, (train_mask, test_mask) in enumerate(cv.split(target_df)):
    train = target_df[train_mask]
    test = target_df[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')

1449998
Fold#0 | Train: 78277333, Test: 483333
Fold#1 | Train: 78277333, Test: 483333
Fold#2 | Train: 78277584, Test: 483332
