In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import numpy as np
import pandas as pd

from src.utils.logger import logger

In [3]:
dataset = 'electronics'

In [6]:
df = pd.read_csv('../data/{}.csv'.format(dataset), error_bad_lines=False, warn_bad_lines=True, 
                 dtype={'title': 'str', 'brand': 'str'})
logger.info('DF shape: {}'.format(df.shape))

2019-12-05 12:36:11,918 - DF shape: (498196, 9)


In [8]:
df['related']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
498191   NaN
498192   NaN
498193   NaN
498194   NaN
498195   NaN
Name: related, Length: 498196, dtype: float64

### Filter rows and columns 

In [5]:
# Keep only rows with related data
df = df[~df['related'].isnull()].copy()
logger.info('DF shape: {}'.format(df.shape))

2019-12-05 12:36:02,284 - DF shape: (0, 9)


### Exclusions

In [14]:
# Exclude if title is missing
df = df[~df['title'].isnull()].copy()
logger.info('DF shape: {}'.format(df.shape))

In [15]:
df = df[['asin', 'related']].copy()

### Eval on related column

In [20]:
df['related'] = df['related'].apply(eval)

TypeError: eval() arg 1 must be a string, bytes or code object

In [21]:
def get_also_bought_count(related):
    try:
        return len(related['also_bought'])
    except KeyError:
        return -1 

In [22]:
df['also_bought_count'] = df['related'].apply(get_also_bought_count)

In [23]:
pd.set_option('display.max_colwidth', 1000)

In [24]:
df = df[df['also_bought_count'] < 2]

In [25]:
df.shape

(376892, 3)

### Expand on related column

In [None]:
def explode_on_related(df, relationship):
    # Filter on relationship
    df = df[df['related'].apply(lambda x: relationship in x.keys())].copy()
    
    # Get value (list) from relationship dict
    df['related'] = df['related'].apply(lambda x: x[relationship])
    
    # Explode efficiently using numpy
    vals = df['related'].values.tolist()
    lens = [len(val_list) for val_list in vals]
    vals_array = np.repeat(df['asin'], lens)
    exploded_df = pd.DataFrame(np.column_stack((vals_array, np.concatenate(vals))), columns=df.columns)
    
    # Add relationship
    exploded_df['relationship'] = relationship
    
    return exploded_df

In [None]:
also_bought_df = explode_on_related(df, relationship='also_bought')

In [None]:
bought_together_df = explode_on_related(df, relationship='bought_together')

In [None]:
bought_together_df

In [None]:
combined = pd.concat([also_bought_df, bought_together_df], axis=0)

In [None]:
combined['relationship'].value_counts()