In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

from src.prep.prep_edges import create_product_pair
from src.utils.logger import logger

In [3]:
df = pd.read_csv('../data/books_edges.csv')
logger.info('DF shape: {}'.format(df.shape))

2019-11-13 11:16:55,976 - DF shape: (4824206, 4)


In [4]:
df.head()

Unnamed: 0,product1,product2,weight,product_pair
0,0000013714,5064341,1.2,0000013714|0005064341
1,0000013714,5476798,1.2,0000013714|0005476798
2,0000013714,6180116,1.2,0000013714|0006180116
3,0000013714,871482215,1.0,0000013714|0871482215
4,000014357X,307346595,1.0,000014357X|0307346595


### Train val split

In [5]:
n_val_samples = int(0.2 * df.shape[0])
logger.info('Eventual required val samples: {:,}'.format(n_val_samples))

2019-11-13 11:16:56,009 - Eventual required val samples: 964,841


In [6]:
# First split to get some test samples
train, val = train_test_split(df, test_size=0.3, random_state=42)
logger.info('Train shape: {}, val shape: {}'.format(train.shape, val.shape))

2019-11-13 11:17:00,231 - Train shape: (3376944, 4), val shape: (1447262, 4)


In [7]:
# Get set of products in train
train_product_set = set(train['product1']).union(set(train['product2']))
logger.info('No. of unique products in train: {:,}'.format(len(train_product_set)))

2019-11-13 11:17:03,134 - No. of unique products in train: 1,111,436


In [8]:
# Only keep val where both products are in train product set
val = val[(val['product1'].isin(train_product_set)) & (val['product2'].isin(train_product_set))]
logger.info('Updated val shape: {}'.format(val.shape))

2019-11-13 11:17:05,396 - Updated val shape: (1302669, 4)


In [9]:
# Split again to only get n_val_samples
_, val = train_test_split(val, test_size=n_val_samples, random_state=42)
logger.info('Final val shape: {}'.format(val.shape))

2019-11-13 11:17:06,471 - Final val shape: (964841, 4)


In [10]:
# Get train set
train = df[~df.index.isin(set(val.index))].copy()
logger.info('Final train shape: {}'.format(train.shape))

2019-11-13 11:17:08,120 - Final train shape: (3859365, 4)


### Create negative samples

In [11]:
valid_product_pairs = set(df['product_pair'])
logger.info('No. of valid product pairs: {:,}'.format(len(valid_product_pairs)))

2019-11-13 11:17:09,460 - No. of valid product pairs: 4,824,206


In [12]:
val_product_set = set(val['product1']).union(set(val['product2']))
logger.info('No. of unique products in val: {:,}'.format(len(val_product_set)))

2019-11-13 11:17:10,323 - No. of unique products in val: 596,664


In [13]:
def get_sample(item_array, n_iter=None, sample_size=2):
    np.random.seed(42)
    n = len(item_array)
    
    # find the index we last sampled from
    start_idx = (n_iter * sample_size) % n
    if (start_idx + sample_size >= n) or (start_idx <= sample_size):
        
        # shuffle array if we have reached the end and repeat again
        np.random.shuffle(item_array)

    return item_array[start_idx:start_idx+sample_size] 
    
def collect_samples(item_array, sample_size, n_samples):
    samples = []
    
    for i in range(0, n_samples):
        if i % 100000 == 0:
            logger.info('Neg sample: {:,}'.format(i))
            
        sample = get_sample(item_array, n_iter=i, sample_size=sample_size)
        samples.append(sample)
        
    return samples

In [14]:
val_product_arr = np.array(list(val_product_set))
logger.info('No. of unique products in val: {:,}'.format(val_product_arr.shape[0]))

2019-11-13 11:17:11,065 - No. of unique products in val: 596,664


In [15]:
neg_samples = collect_samples(val_product_arr, sample_size=2, n_samples=int(1.1*n_val_samples))

2019-11-13 11:17:11,072 - Neg sample: 0
2019-11-13 11:17:11,499 - Neg sample: 100,000
2019-11-13 11:17:11,819 - Neg sample: 200,000
2019-11-13 11:17:12,253 - Neg sample: 300,000
2019-11-13 11:17:12,568 - Neg sample: 400,000
2019-11-13 11:17:12,878 - Neg sample: 500,000
2019-11-13 11:17:13,337 - Neg sample: 600,000
2019-11-13 11:17:13,661 - Neg sample: 700,000
2019-11-13 11:17:13,982 - Neg sample: 800,000
2019-11-13 11:17:14,472 - Neg sample: 900,000
2019-11-13 11:17:14,789 - Neg sample: 1,000,000


In [16]:
neg_samples_df = pd.DataFrame(neg_samples, columns=['product1', 'product2'])
neg_samples_df.dropna(inplace=True)
neg_samples_df = create_product_pair(neg_samples_df, col_list=['product1', 'product2'])
logger.info('No. of negative samples: {:,}'.format(neg_samples_df.shape[0]))

2019-11-13 11:17:18,810 - No. of negative samples: 1,061,325


In [17]:
# Exclude neg samples that are valid pairs
neg_samples_df = neg_samples_df[~neg_samples_df['product_pair'].isin(valid_product_pairs)].copy()
logger.info('Updated no. of negative samples: {:,}'.format(neg_samples_df.shape[0]))

2019-11-13 11:17:22,027 - Updated no. of negative samples: 1,061,291


In [18]:
# Only keep no. of val samples required
neg_samples_df = neg_samples_df.iloc[:n_val_samples].copy()
logger.info('Final no. of negative samples: {:,}'.format(neg_samples_df.shape[0]))

2019-11-13 11:17:22,112 - Final no. of negative samples: 964,841


In [19]:
neg_samples_df['edge'] = 0
val['edge'] = 1

### Only keep required cols

In [20]:
VAL_COLS = ['product1', 'product2', 'edge']
neg_samples_df = neg_samples_df[VAL_COLS].copy()
val = val[VAL_COLS].copy()
logger.info('Val shape: {}, Neg samples shape: {}, Pos/Total ratio: {}'.format(val.shape, neg_samples_df.shape, val.shape[0]/(val.shape[0] + neg_samples_df.shape[0])))

2019-11-13 11:17:22,486 - Val shape: (964841, 3), Neg samples shape: (964841, 3), Pos/Total ratio: 0.5


In [21]:
val = pd.concat([val, neg_samples_df])
logger.info('Final val shape: {}'.format(val.shape))

2019-11-13 11:17:22,639 - Final val shape: (1929682, 3)


In [22]:
train = train[['product1', 'product2', 'weight']].copy()

### Save

In [23]:
path = 'data/books_edges.csv'

In [24]:
from pathlib import Path

In [25]:
input_filename = Path(path).resolve().stem

In [26]:
train_path = 'data/{}_train.csv'.format(input_filename)
val_path = 'data/{}_val.csv'.format(input_filename)

In [27]:
train_path

'data/books_edges_train.csv'

In [28]:
val_path

'data/books_edges_val.csv'