In [None]:
import pandas as pd
import gzip

In [None]:
pd.set_option('max_colwidth', 1000)

### Read full file and convert to pandas DF

In [None]:
# Functions to parse zipped json as dataframe
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [None]:
# Get pandas dataframe from metadata json zip file
df = getDF('../data/meta_Electronics.json.gz')

In [None]:
df

### Read and save to csv row by row

In [None]:
import json
import gzip
import csv

# Functions to read json and write to csv row by row
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def json_to_csv(read_path, write_path): 
    csv_writer = csv.writer(open(write_path, 'w')) 
    i = 0
    for d in parse(read_path):
        if i == 0:
            header = d.keys()
            csv_writer.writerow(header)
            i += 1    
        csv_writer.writerow(d.values())

In [None]:
json_to_csv('../data/meta_Electronics.json.gz', '../data/meta_Electronics.csv')

### Formatting and cleaning titles

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/metadata_categories_only.csv')

In [3]:
df.shape

(9430088, 3)

In [4]:
def get_category_lvl1(category_path_list):
    """
    (Str of list of list(s)) -> str

    Returns the top level category given a string of a list of lists of categories.
    If there are more than one list of categories provided, returns the top level category from the first list.

    >>> get_category_lvl1("[['A', 'B', 'C'], ['D', 'E', 'F', 'G']]")
    'A'
    >>> get_category_lvl1("[['P1', 'P2', 'P3', 'P4']]")
    'P1'
    >>> get_category_lvl1("[['']]")
    ''

    :type category_path_list: str
    :param category_path_list: A string containing a list of at least one list of categories
    :return: A string showing the full category path of the FIRST category in the list (assumed to be primary category)
    """
    try:
        return eval(category_path_list)[0][0]
    except IndexError:
        return 'no_category'
    except TypeError:
        return 'no_category'


def get_category_path(category_path_list):
    """
    (Str of list of list(s)) -> str

    Returns the category path given a string of list of lists of categories.
    If there are more than one list of categories provided, returns the category path from the first list.

    >>> get_category_path("[['A', 'B', 'C'], ['D', 'E', 'F', 'G']]")
    'A -> B -> C'
    >>> get_category_path("[['P1', 'P2', 'P3', 'P4']]")
    'P1 -> P2 -> P3 -> P4'

    :type category_path_list: str
    :param category_path_list: A string containing a list of at least one list of categories
    :return: A string showing the full category path of the FIRST category in the list (assumed to be primary category)
    """
    try:
        return ' -> '.join(eval(category_path_list)[0])
    except IndexError:  # Error if the outer list is empty
        return 'no_category'
    except TypeError:  # Error if the outer list is missing
        return 'no_category'

In [5]:
# Create column for category
df['category_lvl1'] = df['categories'].apply(get_category_lvl1)

In [9]:
df.dropna(subset=['title'], inplace=True)

In [10]:
df.shape

(7997355, 4)

In [13]:
df = df[df['category_lvl1'] != 'no_category']

In [15]:
df = df[df['category_lvl1'] != 'Books']
df = df[df['category_lvl1'] != 'CDs & Vinyl']
df = df[df['category_lvl1'] != 'Movies & TV']

In [16]:
df.shape

(5589903, 4)

In [18]:
df['category_path'] = df['categories'].apply(get_category_path)

In [19]:
# Create df of category path counts
category_path_df = df.groupby('category_path').agg({'title': 'count'})\
    .sort_values(by='title', ascending=False).reset_index()

In [21]:
category_path_df.shape

(17608, 2)

In [37]:
# Exclude categories that are not deepest category
category_path_df.sort_values(by='category_path', inplace=True)
category_path_df['category_path_next'] = category_path_df['category_path'].shift(-1)
category_path_df.fillna('no_comparison', inplace=True)

In [38]:
# Create list of category_paths which are deepest category
category_path_list = []
for i, value in category_path_df.iterrows():
    category_path = value['category_path']
    category_path_next = value['category_path_next']
    if category_path not in category_path_next:
        category_path_list.append(category_path)

In [39]:
len(category_path_list)

11271

In [27]:
# Create df of category_path
category_path_df = pd.DataFrame(category_path_list, columns=['category_path'])

# Keep only rows where the category is in category_df
df = df[df['category_path'].isin(category_path_df['category_path'])]

In [28]:
df.shape

(4612884, 5)

In [30]:
category_path_df = df.groupby('category_path').agg({'title': 'count'}) \
    .sort_values(by='title', ascending=False).reset_index()

# Drop category_paths where the count of titles < 10
category_path_df = category_path_df[category_path_df['title'] >= 10]

In [31]:
# Keep only rows where the category is in category_df
df = df[df['category_path'].isin(category_path_df['category_path'])]

In [32]:
df.shape

(4598807, 5)