In [1]:
%%capture --no-display
# The cell magic above captures this cell's stdout/stderr; with --no-display,
# rich display output is not captured.
# Run an external helper script inside the notebook namespace.
# NOTE(review): presumably it adjusts the notebook layout (judging by its name);
# the script itself is not visible here - confirm.
%run /Nvme/Training/MLTrainAthens/JupyterNotebooks/changeNBLayout.py

# Switch the working directory to the project's code folder.
%cd ~/Dev/Clones/TabsIQ/code/

In [2]:
# Clear the interactive namespace (-f: do not ask for confirmation).
# NOTE(review): this also discards any variables the previous cell's %run left behind.
%reset -f
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import pandas as pd
import numpy as np
import sys, os
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Pandas display settings: wide output lines, long cell values truncated,
# 3 decimal places, and up to 120 rows / 32 columns shown.
pd.set_option('display.width', 256)
pd.set_option('display.max_colwidth', 48)
pd.set_option('display.precision', 3)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 32)

In [3]:
# Private reference datasets, read from absolute, machine-specific paths:
#   attr - comma-separated attribute table
#   det  - pipe-delimited history table
attr = pd.read_csv(
    '/home/christos/Dev/Clones/TabsIQ/remote/FilesForInfor/Attribution_08072017_clean.csv',
    sep = ',',
)
det = pd.read_csv(
    '/home/christos/Dev/Clones/TabsIQ/remote/amine/DataIQ_History_joined.dlm',
    sep = '|',
)


In [4]:
import numpy as np

def UPC2SKU(upc_):
    ''' Build a reproducible UPC -> SKU lookup table.

    The UPCs are sorted and paired, in that order, with a fixed pseudo-random
    permutation of the integers 100000 .. 100000 + len(upc_) - 1, so the same
    input always yields the same map.

    Parameters
    ----------
    upc_ : pandas.Series
        UPC codes (any object providing ``sort_values()`` and ``len()``).

    Returns
    -------
    dict
        Maps each UPC to its SKU number. If a UPC occurs more than once, the
        SKU paired with its last occurrence in sorted order is kept.
    '''
    # A private RandomState seeded with 101 yields exactly the permutation that
    # reseeding the global generator with 101 would, but leaves np.random's
    # global state untouched for the rest of the notebook.
    rng = np.random.RandomState(101)
    skus = 100000 + rng.permutation(np.arange(len(upc_)))
    return dict(zip(upc_.sort_values(), skus))

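# Testing (manual check): the two prints below should show identical (UPC, SKU) pairs, because the map is deterministic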
# ix = np.random.choice(100, 10, replace = False).astype(int)
# print [(k, v) for i, (k, v) in enumerate(UPC2SKU(attr['UPC'].astype(int)).items()) if i in ix]
# print [(k, v) for i, (k, v) in enumerate(UPC2SKU(attr['UPC'].astype(int)).items()) if i in ix]

def initials(nLetters_ = None):
    ''' Build a function that abbreviates a sentence to its word initials.

    nLetters_ caps the length of the abbreviation; None (the default) keeps
    every initial. The returned callable takes a string, splits it on
    whitespace and returns the upper-cased first character of each word,
    joined into a single word.
    '''
    def ret(words_):
        acronym = ''
        for word in words_.split():
            acronym += word[0].upper()
        return acronym[:nLetters_]
    return ret

In [5]:
# 1. Modify attr: UPC -> SKU (sequence number), SORT -> ordinal code, SEGMENT -> first 3 letters,
#    BRAND / COLLECTION / TYPE -> initials, BASE_ITEM -> the three tokens before its last one,
#    SIZE_RANGE -> first character (stored as SIZE)
# This generates the input data for the class

# Keep only the rows that have no missing value in any column
attr0 = attr[attr.notnull().all(axis = 1)].copy()

# Lookup tables. skuDict is built from every UPC in attr, not only the complete rows.
skuDict = UPC2SKU(attr['UPC'].astype(int))
inits = initials()
sortDict = dict((sortValue, code) for code, sortValue in enumerate(pd.unique(attr0.SORT)))

attr0['SKU'] = attr0['UPC'].map(skuDict)
attr0['SORT'] = attr0['SORT'].map(sortDict)

attr0['SEGMENT'] = attr0['SEGMENT'].map(lambda segment: segment.upper()[:3])
attr0['BRAND'] = attr0['BRAND'].map(inits)
attr0['COLLECTION'] = attr0['COLLECTION'].map(inits)

attr0['TYPE'] = attr0['TYPE'].map(inits)
attr0['BASE_ITEM'] = attr0['BASE_ITEM'].map(lambda item: ' '.join(item.split()[-4:-1]))
attr0['SIZE'] = attr0['SIZE_RANGE'].map(lambda sizeRange: sizeRange[0])

# Remove the columns that are no longer needed, including the original UPC
attr0 = attr0.drop(['SUBBRAND', 'CATEGORY', 'SIZE_RANGE', 'UPC'], axis = 1)


In [10]:
# 2. Build det0 from det, keeping only SKU, GEO, WEEK_ENDING, WEEK, MONTH, PCT, PCTF, PCTD, UNITS, REVENUE

from pandas.tseries.offsets import Day

# Drop rows that miss any of the source columns read below: a missing UPC makes
# astype(int) fail, a missing GEO makes inits fail, and the other gaps would
# end up as NaN in the saved table.
# Note: det.notnull().any(axis = 1) would only remove rows that are entirely empty.
det0 = det.dropna(subset = ['UPC', 'WeekEnding', 'GEO', 'ACV',
                            'PACV_FeatWODisp', 'PACV_FeatAndDisp', 'PACV_DispWOFeat',
                            'Units', 'Dollars']).copy()

det0['SKU'] = det0['UPC'].astype(int).map(skuDict)

# Shift every week-ending date back by a fixed 11272 days
det0['WEEK_ENDING'] = pd.to_datetime(det0['WeekEnding'], format = '%Y-%m-%d') - Day(11272)

# Series.dt.week is deprecated (removed in pandas 2.0); use the ISO calendar
# where it exists and fall back to .dt.week on older pandas. Both give the ISO week number.
if hasattr(det0['WEEK_ENDING'].dt, 'isocalendar'):
    det0['WEEK'] = det0['WEEK_ENDING'].dt.isocalendar().week.astype('int64')
else:
    det0['WEEK'] = det0['WEEK_ENDING'].dt.week
det0['MONTH'] = det0['WEEK_ENDING'].dt.month

det0['GEO'] = det0['GEO'].map(inits)

# PCTF / PCTD: the "feature only" resp. "display only" column plus the shared "feature and display" column
det0['PCTF'] = det0['PACV_FeatWODisp'] + det0['PACV_FeatAndDisp']
det0['PCTD'] = det0['PACV_DispWOFeat'] + det0['PACV_FeatAndDisp']

det0 = det0.rename(columns = {'Units': 'UNITS', 'Dollars': 'REVENUE', 'ACV': 'PCT'})
det0 = det0[['SKU', 'GEO', 'WEEK_ENDING', 'WEEK', 'MONTH', 'PCT', 'PCTF', 'PCTD', 'UNITS', 'REVENUE']]

display(det0.sample(10))


Unnamed: 0,SKU,GEO,WEEK_ENDING,WEEK,MONTH,PCT,PCTF,PCTD,UNITS,REVENUE
25021,100705,PTCTA,1983-05-12,19,5,57,0,0,312,1556.39
92217,103670,TWTUC,1983-05-12,19,5,93,0,3,11005,31665.34
12017,103214,TWTUC,1983-06-30,26,6,91,0,0,9017,84305.08
55895,100821,TWTUC,1984-08-09,32,8,88,0,0,3797,26365.02
62430,100770,TWTUC,1983-01-20,3,1,79,0,2,9085,26107.65
93347,103232,TWTUC,1984-07-05,27,7,89,0,2,10022,14994.07
86791,102840,TWTUC,1984-11-15,46,11,74,0,1,4995,14332.07
33955,103776,PTCTA,1984-11-15,46,11,65,34,0,440,2194.96
92721,103693,TTCTA,1985-01-24,4,1,68,0,0,1042,7183.44
98696,100570,TWTUC,1986-01-09,2,1,93,0,0,9597,28089.4


In [11]:
# Join: attach the attr0 columns to every det0 row whose SKU also appears in attr0
# (inner join on SKU); the result is written to disk in the next cell
tab = det0.merge(attr0, how = 'inner', on = 'SKU')
display(tab.sample(10))

Unnamed: 0,SKU,GEO,WEEK_ENDING,WEEK,MONTH,PCT,PCTF,PCTD,UNITS,REVENUE,SEGMENT,TYPE,BRAND,COLLECTION,SORT,BASE_ITEM,SIZE
42551,101920,TWTUC,1984-12-20,51,12,22,0,0,2053,8433.61,SHA,S2,H&S,H&SDS,361,2-IN-1 1CT 8.4OZ,B
21825,103679,TTCTA,1984-05-17,20,5,69,0,0,1526,11282.56,SHA,S2,H&S,H&SDS,361,2-IN-1 1CT 23.7OZ,D
28623,100320,TTCTA,1982-10-28,43,10,69,0,0,2033,14182.21,SHA,SR,H&S,H&SMR,360,REG 1CT 23.7OZ,D
11034,104494,TWTUC,1985-01-31,5,1,1,0,0,65,278.95,SHA,SR,H&S,H&SOL,367,REG 1CT 23.7OZ,D
59380,102503,TWTUC,1983-02-10,6,2,94,0,1,17039,33659.96,SHA,SR,S,SNBR,874,REG 1CT 12.6OZ,C
104894,101854,TWTUC,1985-11-14,46,11,10,0,1,334,426.7,CON,CR,S,SES,840,REG 1CT 30.0OZ,D
48185,101568,TTCTA,1984-08-30,35,8,95,0,0,2845,20971.82,SHA,SR,H&S,H&SGA,385,REG 1CT 23.7OZ,D
67728,104490,PTCTA,1985-05-30,22,5,58,0,0,553,1888.68,SHA,SR,S,SPK,929,REG 1CT 12.6OZ,C
7464,103818,TWTUC,1983-05-19,20,5,55,0,1,4577,22499.33,SHA,S2,H&S,H&SS&S,364,2-IN-1 1CT 13.5OZ,C
16913,104642,TWTUC,1982-10-14,41,10,99,0,0,12451,84318.03,SHA,SR,H&S,H&SC,375,REG 1CT 13.5OZ,C


In [None]:
# Save the joined table as a pipe-delimited file, without the row index
tab.to_csv(
    '/Nvme/Training/MLTrainAthens/JupyterNotebooks/retailer.csv',
    sep = '|',
    index = False,
)