In [233]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from itertools import product
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, HTML

import time
import sys
import gc
import pickle

# Keep DataFrame reprs compact: at most 10 rows and 100 columns are rendered.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)

##

import sys
# Put the parent directory on the import path.
# NOTE(review): presumably this is what lets the later `from src import ...` resolve -- confirm.
sys.path.append("..")

##

import asyncio
import time
from dotenv import load_dotenv, find_dotenv
import logging
import backtrace
from datetime import datetime
import json
import os
import nest_asyncio

# Must run before the `brain` imports below.
# NOTE(review): presumably the `brain` package lives under ../../settler -- confirm.
sys.path.append("../../settler")
from brain.src.lib.postgres import connect as connectDatabase
from brain.src.lib.s3 import connect as connectS3
from brain.src.lib.postgres import getConnection

# NOTE(review): presumably needed so asyncio code can run inside the notebook's
# already-running event loop -- confirm against the nest_asyncio docs.
nest_asyncio.apply()

# Logger for this notebook.
LOGGER = logging.getLogger(__name__)

# Install the `backtrace` hook (NOTE(review): presumably for nicer tracebacks -- confirm).
backtrace.hook()
# Load settings from the .env file located by find_dotenv(); the `True` shown in
# this cell's output is the return value of this call.
load_dotenv(find_dotenv(), verbose=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [234]:
def delRowWithOddValue(df, col, num):
    """Keep only the rows of `df` whose `col` value occurs more than `num` times.

    Frequencies are counted on `df` itself. The previous version counted them
    on the notebook-global `items`, which only worked because every call site
    happened to pass `items` as `df`.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the column whose value frequencies are inspected.
        num: Threshold; a value is kept only if its count is strictly
            greater than `num` (a count equal to `num` is dropped).

    Returns:
        The subset of `df` restricted to rows with a frequent-enough `col` value.
    """
    freq = df[col].value_counts()
    frequent_values = freq[freq > num].index
    return df[df[col].isin(frequent_values)]

def changeRowWithOddValue(old, col, num, replace):
    """Return a copy of `old` where rare values of `col` are set to `replace`.

    A value is rare when it occurs fewer than `num` times in `old[col]`.
    Frequencies are counted on `old` itself; the previous version read the
    notebook-global `items` instead of its own argument.

    Args:
        old: Source DataFrame; it is left untouched.
        col: Name of the column to rewrite.
        num: Values with a count strictly below this are replaced.
        replace: Replacement written into the rare cells (e.g. `np.nan`).

    Returns:
        A new DataFrame with the rare `col` values replaced.
    """
    freq = old[col].value_counts()
    rare_values = freq[freq < num].index
    df = old.copy()
    # Single .loc assignment instead of chained `df[col][mask] = ...`, which
    # raised SettingWithCopyWarning and is not guaranteed to write into `df`.
    df.loc[df[col].isin(rare_values), col] = replace
    return df

def cleanUpOunces(row):
    """Convert one raw `ounces` cell into a whole number of ounces.

    Args:
        row: A single cell value: a missing value, a number, or a string such
            as '12 oz.' or '16OZ'.

    Returns:
        `np.nan` for missing input, otherwise the value truncated to `int`.

    Raises:
        ValueError: If a string still is not numeric after the unit suffixes
            have been stripped.
    """
    if pd.isnull(row):
        return np.nan
    # isinstance rather than `type(row) == float`: NumPy scalars and plain ints
    # failed the exact-type test and then crashed on `.replace` below.
    if isinstance(row, (int, float, np.integer, np.floating)):
        return int(row)
    # Strip the unit spellings in the original fixed order.
    # NOTE(review): '0z' presumably covers a zero typed instead of the letter 'o' -- confirm.
    for unit in ('oz.', 'oz', 'OZ', 'Z', '0z'):
        row = row.replace(unit, '')
    return int(float(row))

In [235]:
%%time

items = pd.read_csv('../../../Downloads/BlackHawk/AB_2018.csv', error_bad_lines=False, warn_bad_lines=True)

items.rename(columns={col:col.lower() for col in items.columns}, inplace=True)

# Dropping upc
# items.drop(['city', 'upc'], inplace=True, axis=1) # Zip will take care of this
items.drop(['date_of_originating_submission'], inplace=True, axis=1)

items['state'] = items['state'].astype(str)
items['package_type'] = items['package_type'].astype(str)
items['product_name'] = items['product_name'].astype(str)
items['brand_family'] = items['brand_family'].astype(str)
items['purchase_location'] = items['purchase_location'].astype(str)
items['brand_name'] = items['brand_name'].astype(str)
items['gender'] = items['gender'].astype(str)
items['zip'] = items['zip'].astype(str)

items['customer_age'] = pd.to_numeric(items['customer_age'], errors='coerce')
items['pack_size'] = pd.to_numeric(items['pack_size'], errors='coerce')
items['customer_number'] = pd.to_numeric(items['customer_number'], errors='coerce')

# Get rid of Canada zipcodes for now
items = items[~items.zip.str.contains('[A-Z]',na=False)]
items['zip'] = items['zip'].astype(int)
items['purchase_price'] = pd.to_numeric(items['purchase_price'], errors='coerce')

items.package_type = items.package_type.str.lower()

# items.set_index('tracking_number')
# shops['id'] = shops['id'].astype(np.int16)

# # Drop these NaN columns
items.drop(['brand_family', 'gender', 'ab_brand_code'], inplace=True, axis=1)

b'Skipping line 309130: expected 20 fields, saw 28\n'


CPU times: user 4.57 s, sys: 715 ms, total: 5.28 s
Wall time: 5.85 s


In [236]:
%%time


# PACK SIZE
items = items[~items.pack_size.isnull()]
assert items.pack_size.isna().sum() < 50, "Expected this error to be small"

# OUNCES
items = items[~items.ounces.isnull()]
items.ounces = items.ounces.apply(cleanUpOunces).astype(np.int)
items = delRowWithOddValue(items, 'ounces', 10)
items['ounces'] = items['ounces'].astype(int)

# def getTotalBottles(row):
#     return row
# items['total_bottles'] = items.apply(getTotalBottles, axis=1)

# STORE
items['store'] = items['zip'].map(str) + ':' +\
    items['purchase_location'].map(str) + 'in ' +\
    items['city'].map(str) + ', ' +\
    items['state'].map(str)
items = delRowWithOddValue(items, 'store', 100)

# PACKAGE
# Create joint column to describe product better than product field
# Much faster than using df.apply.
items['product'] = items['ounces'].map(str) + 'oz ' +\
    items['pack_size'].map(str) + 'pack of ' +\
    items['brand_name'].map(str)
items = delRowWithOddValue(items, 'product', 100)

# # PRODUCT NAME
# items = delRowWithOddValue(items, 'product_name', 100)
# items['product_name'] = items['product_name'].astype(str)

# UPC
items = delRowWithOddValue(items, 'upc', 10)
items['upc'] = items['upc'].astype(str)

# PACKAGE TYPE
items = delRowWithOddValue(items, 'package_type', 30)
items['package_type'] = items['package_type'].astype(str)

# ZIP
items = delRowWithOddValue(items, 'zip', 100)
items['zip'] = items['zip'].astype(str)

# LOCATION
items = changeRowWithOddValue(items, 'purchase_location', 10, np.nan)
items['purchase_location'] = items['purchase_location'].astype(str)

# OTHERS
items = items[~items.purchase_date.isnull()]
items = items[items.product_name!='Invalid UPC Product']

items = items[items.purchase_price<60]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


CPU times: user 3.3 s, sys: 368 ms, total: 3.67 s
Wall time: 3.81 s


In [237]:
# Normalize product names
# display(items.product_name.value_counts())
# names = items.product_name.value_counts().keys()
# vai = set(map(lambda x: x.lower().replace('12pk','12 pack').replace('-',' '), names))
# print(len(vai))
# vai

In [238]:
# items['brand_name'].value_counts().plot(kind='bar',figsize=(20,10))
# items.groupby('brand_name').size().value_counts().plot(kind='bar',figsize=(20,10))

In [239]:
%%time

# Categorize stuff

# https://stackoverflow.com/questions/42196589

final = items.rename(columns={
    'purchase_date': 'date',
})

final.to_csv('../../../Downloads/BlackHawk/FELIPE_AB_2018_CLEAN.csv',
             index=False)

CPU times: user 1.22 s, sys: 72.1 ms, total: 1.3 s
Wall time: 1.4 s


In [135]:
# # Trying to understand what the tracking_number is.

# # a = pricy.groupby(['tracking_number']).agg({'product_name_':'nunique'})
# a.reset_index(inplace=True)
# a[a.product_name_>1]

# items[items.tracking_number==839607498]

# a = items[~items._brand_name.isna()].groupby(['tracking_number']).agg({'_product_name':'nunique'})


# Relationship between product (col which we created) and upc is 1-*
# items.drop_duplicates(['upc','product'])[['upc','product']].sort_values('product')

In [136]:
# ----

In [248]:
%%time

train = pd.read_csv('../../../Downloads/BlackHawk/FELIPE_AB_2018_CLEAN.csv')
train['date'] = pd.to_datetime(train['date']) # .astype('datetime64[ns]')

train = train[train['date'].between('2017-01-01','2019-01-01')]

train['purchase_price'] = train['purchase_price'].astype('float64')
train['ounces'] = train['ounces'].astype(int)
train['zip'] = train['zip'].astype(str)


toEncode = [
    'city',
    'state',
    'package_type',
    'purchase_location',
    'product_name',
    'brand_name',
    'store',
    'product',
]
for col in toEncode:
    # items[col].value_counts().plot(kind='bar',figsize=(20,10))
    le = LabelEncoder()
    train['__%s' % col] = train[col]
    train[col] = le.fit_transform(train[col])
    train[col] = train[col].astype(int)
    # def translatePackage(list):
    #     return dict(zip(list, le.inverse_transform(list)))

_train = train.copy()

CPU times: user 9.4 s, sys: 200 ms, total: 9.6 s
Wall time: 11 s


In [270]:
%%time

train.sort_values(['date','product_name'], inplace=True)

train = _train.copy()

# train.drop('purchase_location', axis=1)

# We are ignoring train.location for now!
zips = pd.DataFrame({'id':train.zip.unique()})
# products = pd.DataFrame({'id':train['brand'].unique()})

# train['product'] = train['brand']

products = train[[
    'brand_name',
    'ounces',
    'pack_size',
#     'package_type', # Not unique per prod!
#     'upc', # Not unique per prod, which is weird.
    'product', # Id
#     '__product_name',
    '__brand_name',
    '__product',
]].drop_duplicates('product')
products.rename(columns={'product':'id'}, inplace=True)


shops = train[[
    'city',
    'state',
    'zip',
    'purchase_location',
    'store', # Id
    '__city',
    '__state',
    '__purchase_location',
    '__store',
]].drop_duplicates('store')
products.rename(columns={'store':'id'}, inplace=True)

sales = train.drop([
    'brand_name',
    'ounces',
    'pack_size',
    '__brand_name',
    '__product',
    #
    'city',
    'state',
    'zip',
    'purchase_location',
    '__city',
    '__state',
    '__purchase_location',
    '__store',
], axis=1)

train['count'] = 1

CPU times: user 119 ms, sys: 10.5 ms, total: 130 ms
Wall time: 133 ms


In [274]:
%%time

import json5
import pprint
import builtins
from src import processShape

# To save memory for now??
# sales = train.copy() # .iloc[-100000:]

# Expose each row's index label as an explicit `id` column on the sales table.
sales['id'] = sales.index

# tracking_number for same order

def getAssemblerShape():
    """Load the assembler shape from `featuresbev.json5` and rename its features.

    Each entry of the shape's `features` list is stripped of surrounding
    whitespace, then every 'tblock' is replaced by 'month_block' and every
    'tcount' by 'item_cnt_month'.

    Returns:
        The parsed shape object with its `features` list rewritten.
    """
    # The context manager closes the file; the previous version left the handle
    # open and bound the text to a local named `json`, shadowing the module.
    with open('featuresbev.json5') as shape_file:
        obj = json5.loads(shape_file.read())
    obj['features'] = [
        feature.strip()
        .replace('tblock', 'month_block')
        .replace('tcount', 'item_cnt_month')
        for feature in obj['features']
    ]
    return obj

# Load the shape, then hand it to processShape together with the tables it
# refers to by name.
shape = getAssemblerShape()

assembler_tables = {
    "Sales": sales,
    "Shops": shops,
    "Products": products,
}
processShape(shape, assembler_tables)

# pd.options.display.float_format = '{:,.2f}'.format

{'product': 'Products.id', 'shop': 'Shops.id', '__DATE__': '__DATE__'}


KeyError: 'id'