In [64]:
from orion_recommend.datasets import DatasetMap, synthetic, utils
from orion.sources import S3Source
from orion.sources.io import read_csv, write_csv
from orion.sources import RedshiftSource
import pandas as pd
import numpy as np
from scipy import sparse

In [65]:
#source = RedshiftSource(query='SELECT * FROM publish.inventory_lookup WHERE productid IS NULL')
#df_stock = source.read_csv()

with S3Source(key="masters/uploads/customers/1560425511130_Peak_customers.csv", bucket="kilimanjaro-prod-datalake") as s3:
    df_cust = read_csv(s3)

df_trans = read_csv(S3Source(key="masters/uploads/transactions/1560426066385_Peak_transactions.csv", bucket="kilimanjaro-prod-datalake"))
df_prod = read_csv(S3Source(key="masters/uploads/product/1560425499995_Peak_product.csv", bucket="kilimanjaro-prod-datalake"))


KeyboardInterrupt: 

In [None]:
# 30pc of transactions have keys that are not in df_cust
pd.Series(df_trans.customerkey.unique()).isin(df_cust.customerkey).sum()/len(df_trans.customerkey.unique())
# 45pc of item numbers not in df_prod
pd.Series(df_trans.itemnumber.unique()).isin(df_prod.itemnumber).sum()/len(df_trans.itemnumber.unique())

In [4]:
df_cust.drop(df_cust[df_cust.gender=='unknown'].index, axis=0, inplace=True)

In [5]:
# Fix item number
df_prod.itemnumber = pd.to_numeric(df_prod.itemnumber, errors='coerce') 

In [6]:
# Remove keys not in customer and product tables
df_trans = df_trans.iloc[df_trans.customerkey[df_trans.customerkey.isin(df_cust.customerkey)].index,:]
df_trans.reset_index(inplace=True, drop=True)
df_trans = df_trans.iloc[df_trans.itemnumber[df_trans.itemnumber.isin(df_prod.itemnumber)].index,:]

# Reset index before changing keys to integers
df_trans.reset_index(inplace=True, drop=True)
df_prod.reset_index(inplace=True, drop=True)


In [7]:
# Drop nas
df_prod.dropna(inplace=True)
df_prod.itemnumber.unique().shape

(71993,)

In [8]:
print(df_cust.customerkey.unique().shape)
df_cust.dropna(inplace=True)
print(df_cust.customerkey.unique().shape)

print(df_trans.customerkey.unique().shape)
df_trans.dropna(inplace=True)
print(df_trans.customerkey.unique().shape)

(2425998,)
(1692766,)
(1605552,)
(1605201,)


In [9]:
# Drop redundant cols. Fabric all unknown, item colour less info than colourvalue
df_prod.drop(["itemstylecode", "itemquarter", "itemcolour", "fabric"], axis=1, inplace=True)

In [10]:
df_prod.reset_index(inplace=True, drop=True)

In [11]:
df_prod.iloc[:,1:].duplicated().sum()

45306

In [12]:
dups = df_prod.loc[df_prod.iloc[:,1:].duplicated(),:]

In [13]:
df_prod.drop(dups.index, inplace=True)

In [14]:
# Drop columns
df_trans.drop(["salestransactionkey", "salesordernumber","discountpercent", "grosssales",
              'orderdate', 'ordertime', 'shippingdate','grossprofit'], axis=1, inplace=True)

In [15]:
df = df_trans.merge(df_cust, on='customerkey', how = "outer")
df = df.merge(df_prod, on = "itemnumber")

In [16]:
# Rename and drop columns, nas
df.rename(columns={"customerkey":"user_id","itemnumber":"item_id","shipcountry":"country", "brandcode":"brand_id",
                  "colourvalue":"colour", }, inplace=True)
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

In [17]:
df_trans.rename(columns={"customerkey":"user_id","itemnumber":"item_id"}, inplace=True)
df_cust.rename(columns={"customerkey":"user_id"}, inplace=True)
df_prod.rename(columns={"itemnumber":"item_id"}, inplace=True)

In [18]:
df.shape

(2347705, 16)

In [19]:
df.drop_duplicates(inplace=True)

In [20]:
df.shape,df.user_id.unique().shape, df.item_id.unique().shape

((952042, 16), (558421,), (12324,))

### Take subset of data


In [21]:
df_sub = df.sample(frac = 0.10)

In [22]:
cats = df_sub.select_dtypes('object').columns
keys = df_sub[cats].apply(lambda x: x.factorize()[1])
df_sub[cats] = df_sub[cats].apply(lambda x: x.factorize()[0])

In [23]:
df = df_sub

In [24]:
df.drop(['unitssold', 'pricetype'], axis=1, inplace=True)

In [25]:
len(df.item_id.unique())

10456

In [26]:
user_num = df.user_id.unique().shape[0]
item_num = df.item_id.unique().shape[0]

In [27]:
user_num, item_num

(86297, 10456)

In [28]:
#Renumber item ids
ints = [i for i in range(item_num)]
item_keys = [i for i in df.item_id.unique()]
item_key_dict = {i:j for i,j in zip( item_keys, ints)}

# Change keys to integers in dataframes
df.item_id = [item_key_dict[df.item_id.iloc[i]] for i in range(len(df))]

In [29]:
user_cols = ['user_id', 'country','postcode','loyaltyaccount','gender']
users = df[user_cols]
items = df[['item_id',"brand_id","category","colour", "divisioncode","itemcategorycode","itemfamilycode","itemseason","productgroup"]]

In [30]:
interactions = df[['user_id', 'item_id']]

In [31]:
users.drop_duplicates(inplace=True)
items.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items.drop_duplicates(inplace=True)


In [32]:
users.to_csv("users")
items.to_csv("items")
interactions.to_csv("interactions")

In [33]:
# Train-test split
train = df.sample(frac = 0.8)
ind = df.index.isin(train.index)
test = df[~ind]

In [34]:
train.head()

Unnamed: 0,user_id,country,postcode,item_id,loyaltyaccount,gender,brand_id,category,colour,divisioncode,itemcategorycode,itemfamilycode,itemseason,productgroup
1005548,50334,0,763,1688,1,0,19,0,12,0,0,0,0,41
386967,77401,0,883,2094,1,1,10,0,0,0,0,0,1,27
688481,44447,0,2492,3819,0,1,12,0,7,1,3,0,0,12
1950177,67292,0,757,4122,0,0,215,0,0,1,3,0,1,6
1544380,28537,0,1281,7682,1,1,35,4,12,2,11,0,0,22


In [35]:
np.save("new_fa_train_data",train.to_numpy())
np.save("new_fa_test_data",test.to_numpy())

## Matrices

In [37]:
USER_NUM = len(df.user_id.unique())
country_num = len(df.country.unique())
postcode_num = len(df.postcode.unique())	
item_id_num = len(df.item_id.unique())
loyaltyaccount_num = len(df.loyaltyaccount.unique())
gender_num = len(df.gender.unique())
brand_id_num = len(df.brand_id.unique())
category_num = len(df.category.unique())
colour_num = len(df.colour.unique())
divisioncode_num = len(df.divisioncode.unique())
itemcategorycode_num = len(df.itemcategorycode.unique())
itemfamilycode_num = len(df.itemfamilycode.unique())
itemseason_num = len(df.itemseason.unique())
productgroup_num = len(df.productgroup.unique())

In [39]:
print(USER_NUM,
country_num,
postcode_num,
item_id_num,
loyaltyaccount_num,
gender_num,
brand_id_num,
category_num,
colour_num,
divisioncode_num,
itemcategorycode_num,
itemfamilycode_num,
itemseason_num,
productgroup_num)

86297 56 5550 10456 2 2 310 8 53 3 13 4 3 78


### Item Attribute matrix

In [40]:
features = [   brand_id_num, category_num, colour_num, divisioncode_num, 
            itemcategorycode_num, itemfamilycode_num, itemseason_num, productgroup_num]
names = [  'brand_id',
         'category', 'colour', 'divisioncode', 'itemcategorycode',
         'itemfamilycode', 'itemseason', 'productgroup']
dic = dict(zip(names,features))

In [41]:
matrices = []
for i in dic.keys():
    mat = np.zeros((item_num, dic[i]), dtype = np.int)    
    pair = df.loc[:,["item_id", i]]
    M = np.unique(pair, axis=0)
    for m in M:
        mat[m[0], m[1]] = 1
    print(mat[1])
    matrices.append(mat)

[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0]
[1 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [42]:
[(matrices[i]).shape for i in range(len(matrices))]

[(10456, 310),
 (10456, 8),
 (10456, 53),
 (10456, 3),
 (10456, 13),
 (10456, 4),
 (10456, 3),
 (10456, 78)]

In [43]:
ia_matrix = np.concatenate((matrices), axis=1)

In [44]:
np.save('new_fa_ia_matrix.npy', ia_matrix)

In [45]:
matrices[0].shape

(10456, 310)

### User- attribute matrix

In [46]:
matrices = []
for i in dic.keys():
    mat = np.zeros((USER_NUM, dic[i]), dtype = np.int)
    pair = df.loc[:,["user_id", i]]
    M = np.unique(pair, axis=0)
    for m in M:
        mat[m[0], m[1]] = 1
    print(mat[1])
    matrices.append(mat)

[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0]
[1 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [47]:
fa_matrix = np.concatenate((matrices), axis=1)

In [48]:
fa_matrix.shape[1]

472

In [49]:
[(matrices[i]).shape for i in range(len(matrices))]

[(86297, 310),
 (86297, 8),
 (86297, 53),
 (86297, 3),
 (86297, 13),
 (86297, 4),
 (86297, 3),
 (86297, 78)]

In [50]:
print(np.unique(ia_matrix, axis=0).shape, np.unique(fa_matrix, axis=0).shape)

(10456, 472) (16503, 472)


In [51]:
fa_matrix.shape

(86297, 472)

In [52]:
np.save('new_fa_ua_matrix', fa_matrix)

### User - attribute Matrices

In [53]:
features = [   country_num, postcode_num, gender_num, loyaltyaccount_num]
names = [ "country", "postcode", "gender", "loyaltyaccount" ]
dic = dict(zip(names,features))

In [54]:
matrices = []
for i in dic.keys():
    mat = np.zeros((item_num, dic[i]), dtype = np.int)
    pair = df.loc[:,["item_id", i]]
    M = np.unique(pair, axis=0)
    for m in M:
        mat[m[0], m[1]] = 1
    print(mat[1])
    matrices.append(mat)

[1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 0 ... 0 0 0]
[1 1]
[1 1]


In [59]:
[(matrices[i]).shape for i in range(len(matrices))]

[(10456, 56), (10456, 5550), (10456, 2), (10456, 2)]

In [56]:
user_atts_matrix = np.concatenate((matrices), axis=1)

In [58]:
user_atts_matrix.shape

(10456, 5610)

In [57]:
np.save("item_attributes", user_atts_matrix)

In [66]:
# Ignore postcode

In [73]:
features = [   country_num, gender_num, loyaltyaccount_num]
names = [ "country", "gender", "loyaltyaccount" ]
dic = dict(zip(names,features))

In [74]:
matrices = []
for i in dic.keys():
    mat = np.zeros((USER_NUM, dic[i]), dtype = np.int)
    pair = df.loc[:,["user_id", i]]
    M = np.unique(pair, axis=0)
    for m in M:
        mat[m[0], m[1]] = 1
    print(mat[1])
    matrices.append(mat)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1]
[1 0]


In [75]:
[(matrices[i]).shape for i in range(len(matrices))]

[(86297, 56), (86297, 2), (86297, 2)]

In [76]:
user_atts_matrix = np.concatenate((matrices), axis=1)

In [77]:
user_atts_matrix.shape

(86297, 60)

In [78]:
np.save("user_attributes_npc", user_atts_matrix)

### User - Item Matrix

In [60]:
mat = np.zeros((user_num, item_num), dtype = np.int32)
pair = df.loc[:,["user_id", "item_id"]]
M = np.unique(pair, axis=0)
for m in M:
    mat[m[0], m[1]] = 1
print(mat[1])

[0 1 0 ... 0 0 0]


In [63]:
np.save("fa_ui_matrix", mat)