In [2]:
! pip3 install pyspark seaborn pandas
! pip3 install -U scikit-learn scipy matplotlib
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import seaborn as sns

import sys
from pyspark.sql import SparkSession
import argparse
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [66]:
customers_df = pd.read_csv('./data/customers.csv')
articles_df = pd.read_csv('./data/articles.csv')
transactions_df = pd.read_csv('./data/transactions_train.csv')

### Articles Preprocessing

#### Deduplication

##### Several data points are represented as both nominal and discrete values, 
* Retain only nominal data for processing such as product_type_name, graphical_appearance_name
* Drop discrete data value such as product_type_no, graphical_appearance_no

In [67]:
redundant_columns = [
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'detail_desc'
]

articles_df.drop(columns = redundant_columns, axis=1, inplace = True)

##### Article categories can be represetned solely using either index_name or index_group_name
* index_name provides further subcategories
* drop index_group_names

In [68]:
articles_df.groupby(['index_group_name', 'index_name']).count()['article_id']

index_group_name  index_name                    
Baby/Children     Baby Sizes 50-98                   8875
                  Children Accessories, Swimwear     4615
                  Children Sizes 134-170             9214
                  Children Sizes 92-140             12007
Divided           Divided                           15149
Ladieswear        Ladies Accessories                 6961
                  Ladieswear                        26001
                  Lingeries/Tights                   6775
Menswear          Menswear                          12553
Sport             Sport                              3392
Name: article_id, dtype: int64

In [69]:
articles_df.drop("index_group_name", axis=1, inplace = True)

#### Missing Values

##### Items where the percieved colour, graphical appearance and colour groups are unknown cannot be properly identified as trending
* Drop articles where key characteristic data is `Unknown`,
    * perceived_colour_value_name is `Unknown`
    * graphical_appearance_name is `Unknown`
    * colour_group_name is `Unknown`
    * perceived_colour_master_name is `Unknown`

In [70]:
df = articles_df.loc[(articles_df['perceived_colour_value_name'] == 'Unknown')]
df.groupby(['perceived_colour_value_name', 'perceived_colour_master_name', 'colour_group_name','graphical_appearance_name']).count()['article_id']

perceived_colour_value_name  perceived_colour_master_name  colour_group_name  graphical_appearance_name
Unknown                      Unknown                       Unknown            Unknown                      28
Name: article_id, dtype: int64

In [71]:
articles_df.drop(articles_df.loc[articles_df['perceived_colour_value_name'] == 'Unknown'].index, inplace=True, axis=0)

In [None]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Basic,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Solid,White,Light,White,Jersey Basic,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Jersey Basic,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark,Black,Clean Lingerie,Lingeries/Tights,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light,White,Clean Lingerie,Lingeries/Tights,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


#### Consolidation

##### Colour features are split between “color value name” and “color master name” (38 categories in all)
* Consolidate colors as colour_group_name
    * drop perceived_colour_value_name
    *  drop perceived_colour_value_name

In [72]:
articles_df.groupby(['perceived_colour_value_name']).count()['article_id']

perceived_colour_value_name
Bright           6471
Dark            42706
Dusty Light     22152
Light           15739
Medium           5711
Medium Dusty    12630
Undefined         105
Name: article_id, dtype: int64

In [73]:
articles_df.groupby(['perceived_colour_master_name'	]).count()['article_id']

perceived_colour_master_name
Beige               5657
Black              22585
Blue               18469
Bluish Green           3
Brown               2269
Green               3526
Grey                8924
Khaki green         3181
Lilac Purple        1100
Metal               2180
Mole                1223
Orange              2734
Pink                9403
Red                 5878
Turquoise           1829
Unknown              657
White              12665
Yellow              3121
Yellowish Green        5
undefined            105
Name: article_id, dtype: int64

In [75]:
articles_df.groupby(['colour_group_name']).count()['article_id']

colour_group_name
Beige               2712
Black              22670
Blue                3308
Bronze/Copper         94
Dark Beige          1084
Dark Blue          12171
Dark Green          2106
Dark Grey           2731
Dark Orange          886
Dark Pink            818
Dark Purple          315
Dark Red            2340
Dark Turquoise       473
Dark Yellow          574
Gold                1377
Green                815
Greenish Khaki      2767
Grey                4487
Greyish Beige        226
Light Beige         3356
Light Blue          3012
Light Green          681
Light Grey          2105
Light Orange        1520
Light Pink          5811
Light Purple         553
Light Red            285
Light Turquoise     1027
Light Yellow         984
Off White           2726
Orange               779
Other                105
Other Blue            51
Other Green          129
Other Orange         153
Other Pink           750
Other Purple          46
Other Red            114
Other Turquoise       14
Other Y

In [43]:
redundant_colur_columns = [
    'perceived_colour_value_name',
    'perceived_colour_master_name'
]

articles_df.drop(columns = redundant_colur_columns, axis=1, inplace = True)

In [1]:
sns.set_style('whitegrid')
sns.set(rc={"figure.figsize":(100, 100)})
sns.set(font_scale=5)
sns.histplot(articles_df, x="perceived_colour_master_name", y="colour_group_name").set(title='index_group_name vs index_name')

NameError: name 'sns' is not defined

#### Correct Inconsistencies

##### Update all categorical data for consisstency
* lower casing
* removing special characters

In [80]:
for x in articles_df.select_dtypes(include=['object']).columns:
    display(articles_df.groupby(x).count())

Unnamed: 0_level_0,article_id,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
prod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
& denim boyfriend lw denim,2,2,2,2,2,2,2,2,2,2,2
& denim jen bermuda shorts,1,1,1,1,1,1,1,1,1,1,1
&den vintage strt ankle trash,1,1,1,1,1,1,1,1,1,1,1
&denim bootcut rw soho,1,1,1,1,1,1,1,1,1,1,1
&denim bootcut rw speed,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
zuly lace blouse,2,2,2,2,2,2,2,2,2,2,2
zupi hw biker+,1,1,1,1,1,1,1,1,1,1,1
zupi hw biker.,1,1,1,1,1,1,1,1,1,1,1
zurich rollerneck,2,2,2,2,2,2,2,2,2,2,2


Unnamed: 0_level_0,article_id,prod_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
product_type_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories set,7,7,7,7,7,7,7,7,7,7,7
alice band,6,6,6,6,6,6,6,6,6,6,6
baby bib,3,3,3,3,3,3,3,3,3,3,3
backpack,6,6,6,6,6,6,6,6,6,6,6
bag,1280,1280,1280,1280,1280,1280,1280,1280,1280,1280,1280
...,...,...,...,...,...,...,...,...,...,...,...
wedge,113,113,113,113,113,113,113,113,113,113,113
weekend/gym bag,9,9,9,9,9,9,9,9,9,9,9
wireless earphone case,2,2,2,2,2,2,2,2,2,2,2
wood balls,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0_level_0,article_id,prod_name,product_type_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
product_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,11158,11158,11158,11158,11158,11158,11158,11158,11158,11158,11158
bags,25,25,25,25,25,25,25,25,25,25,25
cosmetic,49,49,49,49,49,49,49,49,49,49,49
fun,2,2,2,2,2,2,2,2,2,2,2
furniture,13,13,13,13,13,13,13,13,13,13,13
garment and shoe care,9,9,9,9,9,9,9,9,9,9,9
garment full body,13287,13287,13287,13287,13287,13287,13287,13287,13287,13287,13287
garment lower body,19796,19796,19796,19796,19796,19796,19796,19796,19796,19796,19796
garment upper body,42734,42734,42734,42734,42734,42734,42734,42734,42734,42734,42734
interior textile,3,3,3,3,3,3,3,3,3,3,3


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
graphical_appearance_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
all over pattern,17165,17165,17165,17165,17165,17165,17165,17165,17165,17165,17165
application/3d,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341
argyle,15,15,15,15,15,15,15,15,15,15,15
chambray,322,322,322,322,322,322,322,322,322,322,322
check,2178,2178,2178,2178,2178,2178,2178,2178,2178,2178,2178
colour blocking,1830,1830,1830,1830,1830,1830,1830,1830,1830,1830,1830
contrast,376,376,376,376,376,376,376,376,376,376,376
denim,4842,4842,4842,4842,4842,4842,4842,4842,4842,4842,4842
dot,681,681,681,681,681,681,681,681,681,681,681
embroidery,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
colour_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
beige,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712
black,22670,22670,22670,22670,22670,22670,22670,22670,22670,22670,22670
blue,3308,3308,3308,3308,3308,3308,3308,3308,3308,3308,3308
bronze/copper,94,94,94,94,94,94,94,94,94,94,94
dark beige,1084,1084,1084,1084,1084,1084,1084,1084,1084,1084,1084
dark blue,12171,12171,12171,12171,12171,12171,12171,12171,12171,12171,12171
dark green,2106,2106,2106,2106,2106,2106,2106,2106,2106,2106,2106
dark grey,2731,2731,2731,2731,2731,2731,2731,2731,2731,2731,2731
dark orange,886,886,886,886,886,886,886,886,886,886,886
dark pink,818,818,818,818,818,818,818,818,818,818,818


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
perceived_colour_value_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bright,6471,6471,6471,6471,6471,6471,6471,6471,6471,6471,6471
dark,42706,42706,42706,42706,42706,42706,42706,42706,42706,42706,42706
dusty light,22152,22152,22152,22152,22152,22152,22152,22152,22152,22152,22152
light,15739,15739,15739,15739,15739,15739,15739,15739,15739,15739,15739
medium,5711,5711,5711,5711,5711,5711,5711,5711,5711,5711,5711
medium dusty,12630,12630,12630,12630,12630,12630,12630,12630,12630,12630,12630
undefined,105,105,105,105,105,105,105,105,105,105,105


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,department_name,index_name,section_name,garment_group_name
perceived_colour_master_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
beige,5657,5657,5657,5657,5657,5657,5657,5657,5657,5657,5657
black,22585,22585,22585,22585,22585,22585,22585,22585,22585,22585,22585
blue,18469,18469,18469,18469,18469,18469,18469,18469,18469,18469,18469
bluish green,3,3,3,3,3,3,3,3,3,3,3
brown,2269,2269,2269,2269,2269,2269,2269,2269,2269,2269,2269
green,3526,3526,3526,3526,3526,3526,3526,3526,3526,3526,3526
grey,8924,8924,8924,8924,8924,8924,8924,8924,8924,8924,8924
khaki green,3181,3181,3181,3181,3181,3181,3181,3181,3181,3181,3181
lilac purple,1100,1100,1100,1100,1100,1100,1100,1100,1100,1100,1100
metal,2180,2180,2180,2180,2180,2180,2180,2180,2180,2180,2180


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_name,section_name,garment_group_name
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,694,694,694,694,694,694,694,694,694,694,694
accessories boys,10,10,10,10,10,10,10,10,10,10,10
accessories other,1,1,1,1,1,1,1,1,1,1,1
ak bottoms,90,90,90,90,90,90,90,90,90,90,90
ak dresses & outdoor,87,87,87,87,87,87,87,87,87,87,87
...,...,...,...,...,...,...,...,...,...,...,...
young girl s&t,173,173,173,173,173,173,173,173,173,173,173
young girl shoes,287,287,287,287,287,287,287,287,287,287,287
young girl swimwear,175,175,175,175,175,175,175,175,175,175,175
young girl trouser,408,408,408,408,408,408,408,408,408,408,408


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,section_name,garment_group_name
index_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baby sizes 50-98,8875,8875,8875,8875,8875,8875,8875,8875,8875,8875,8875
"children accessories, swimwear",4615,4615,4615,4615,4615,4615,4615,4615,4615,4615,4615
children sizes 134-170,9214,9214,9214,9214,9214,9214,9214,9214,9214,9214,9214
children sizes 92-140,12007,12007,12007,12007,12007,12007,12007,12007,12007,12007,12007
divided,15149,15149,15149,15149,15149,15149,15149,15149,15149,15149,15149
ladies accessories,6961,6961,6961,6961,6961,6961,6961,6961,6961,6961,6961
ladieswear,25979,25979,25979,25979,25979,25979,25979,25979,25979,25979,25979
lingeries/tights,6775,6775,6775,6775,6775,6775,6775,6775,6775,6775,6775
menswear,12547,12547,12547,12547,12547,12547,12547,12547,12547,12547,12547
sport,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,garment_group_name
section_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baby boy,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717
baby essentials & complements,4932,4932,4932,4932,4932,4932,4932,4932,4932,4932,4932
baby girl,1760,1760,1760,1760,1760,1760,1760,1760,1760,1760,1760
boys underwear & basics,2034,2034,2034,2034,2034,2034,2034,2034,2034,2034,2034
collaborations,559,559,559,559,559,559,559,559,559,559,559
contemporary casual,1559,1559,1559,1559,1559,1559,1559,1559,1559,1559,1559
contemporary smart,1778,1778,1778,1778,1778,1778,1778,1778,1778,1778,1778
contemporary street,1486,1486,1486,1486,1486,1486,1486,1486,1486,1486,1486
denim men,521,521,521,521,521,521,521,521,521,521,521
divided accessories,1732,1732,1732,1732,1732,1732,1732,1732,1732,1732,1732


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name
garment_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,11519,11519,11519,11519,11519,11519,11519,11519,11519,11519,11519
blouses,5837,5837,5837,5837,5837,5837,5837,5837,5837,5837,5837
dressed,908,908,908,908,908,908,908,908,908,908,908
dresses ladies,4874,4874,4874,4874,4874,4874,4874,4874,4874,4874,4874
dresses/skirts girls,1541,1541,1541,1541,1541,1541,1541,1541,1541,1541,1541
jersey basic,8126,8126,8126,8126,8126,8126,8126,8126,8126,8126,8126
jersey fancy,21443,21443,21443,21443,21443,21443,21443,21443,21443,21443,21443
knitwear,7489,7489,7489,7489,7489,7489,7489,7489,7489,7489,7489
outdoor,4501,4501,4501,4501,4501,4501,4501,4501,4501,4501,4501
shirts,2114,2114,2114,2114,2114,2114,2114,2114,2114,2114,2114


In [79]:
articles_df = articles_df.applymap( 
    lambda s: s.lower() if type(s) == str else s)

In [90]:
spec_chars = ["/", "-", ",", "&", ".", "+"]

for char in spec_chars:
    articles_df = articles_df.applymap( 
            lambda s: s.replace(char, ' ')
            if type(s) == str else s)


In [91]:
articles_df = articles_df.applymap( 
    lambda s: '_'.join(s.split())
    if type(s) == str else s)

In [92]:
for x in articles_df.select_dtypes(include=['object']).columns:
    display(articles_df.groupby(x).count())

Unnamed: 0_level_0,article_id,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
prod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100_den_3p_kneehighs,1,1,1,1,1,1,1,1,1,1,1
10_pk_fancy,5,5,5,5,5,5,5,5,5,5,5
10_pk_invisible_sock,1,1,1,1,1,1,1,1,1,1,1
10_pk_rib,1,1,1,1,1,1,1,1,1,1,1
10p_basic_socks,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
zulu_sweater,39,39,39,39,39,39,39,39,39,39,39
zuly_lace_blouse,2,2,2,2,2,2,2,2,2,2,2
zupi_hw_biker,2,2,2,2,2,2,2,2,2,2,2
zurich_rollerneck,2,2,2,2,2,2,2,2,2,2,2


Unnamed: 0_level_0,article_id,prod_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
product_type_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories_set,7,7,7,7,7,7,7,7,7,7,7
alice_band,6,6,6,6,6,6,6,6,6,6,6
baby_bib,3,3,3,3,3,3,3,3,3,3,3
backpack,6,6,6,6,6,6,6,6,6,6,6
bag,1280,1280,1280,1280,1280,1280,1280,1280,1280,1280,1280
...,...,...,...,...,...,...,...,...,...,...,...
wedge,113,113,113,113,113,113,113,113,113,113,113
weekend_gym_bag,9,9,9,9,9,9,9,9,9,9,9
wireless_earphone_case,2,2,2,2,2,2,2,2,2,2,2
wood_balls,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0_level_0,article_id,prod_name,product_type_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
product_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,11158,11158,11158,11158,11158,11158,11158,11158,11158,11158,11158
bags,25,25,25,25,25,25,25,25,25,25,25
cosmetic,49,49,49,49,49,49,49,49,49,49,49
fun,2,2,2,2,2,2,2,2,2,2,2
furniture,13,13,13,13,13,13,13,13,13,13,13
garment_and_shoe_care,9,9,9,9,9,9,9,9,9,9,9
garment_full_body,13287,13287,13287,13287,13287,13287,13287,13287,13287,13287,13287
garment_lower_body,19796,19796,19796,19796,19796,19796,19796,19796,19796,19796,19796
garment_upper_body,42734,42734,42734,42734,42734,42734,42734,42734,42734,42734,42734
interior_textile,3,3,3,3,3,3,3,3,3,3,3


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
graphical_appearance_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
all_over_pattern,17165,17165,17165,17165,17165,17165,17165,17165,17165,17165,17165
application_3d,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341
argyle,15,15,15,15,15,15,15,15,15,15,15
chambray,322,322,322,322,322,322,322,322,322,322,322
check,2178,2178,2178,2178,2178,2178,2178,2178,2178,2178,2178
colour_blocking,1830,1830,1830,1830,1830,1830,1830,1830,1830,1830,1830
contrast,376,376,376,376,376,376,376,376,376,376,376
denim,4842,4842,4842,4842,4842,4842,4842,4842,4842,4842,4842
dot,681,681,681,681,681,681,681,681,681,681,681
embroidery,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165,1165


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
colour_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
beige,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712
black,22670,22670,22670,22670,22670,22670,22670,22670,22670,22670,22670
blue,3308,3308,3308,3308,3308,3308,3308,3308,3308,3308,3308
bronze_copper,94,94,94,94,94,94,94,94,94,94,94
dark_beige,1084,1084,1084,1084,1084,1084,1084,1084,1084,1084,1084
dark_blue,12171,12171,12171,12171,12171,12171,12171,12171,12171,12171,12171
dark_green,2106,2106,2106,2106,2106,2106,2106,2106,2106,2106,2106
dark_grey,2731,2731,2731,2731,2731,2731,2731,2731,2731,2731,2731
dark_orange,886,886,886,886,886,886,886,886,886,886,886
dark_pink,818,818,818,818,818,818,818,818,818,818,818


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
perceived_colour_value_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bright,6471,6471,6471,6471,6471,6471,6471,6471,6471,6471,6471
dark,42706,42706,42706,42706,42706,42706,42706,42706,42706,42706,42706
dusty_light,22152,22152,22152,22152,22152,22152,22152,22152,22152,22152,22152
light,15739,15739,15739,15739,15739,15739,15739,15739,15739,15739,15739
medium,5711,5711,5711,5711,5711,5711,5711,5711,5711,5711,5711
medium_dusty,12630,12630,12630,12630,12630,12630,12630,12630,12630,12630,12630
undefined,105,105,105,105,105,105,105,105,105,105,105


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,department_name,index_name,section_name,garment_group_name
perceived_colour_master_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
beige,5657,5657,5657,5657,5657,5657,5657,5657,5657,5657,5657
black,22585,22585,22585,22585,22585,22585,22585,22585,22585,22585,22585
blue,18469,18469,18469,18469,18469,18469,18469,18469,18469,18469,18469
bluish_green,3,3,3,3,3,3,3,3,3,3,3
brown,2269,2269,2269,2269,2269,2269,2269,2269,2269,2269,2269
green,3526,3526,3526,3526,3526,3526,3526,3526,3526,3526,3526
grey,8924,8924,8924,8924,8924,8924,8924,8924,8924,8924,8924
khaki_green,3181,3181,3181,3181,3181,3181,3181,3181,3181,3181,3181
lilac_purple,1100,1100,1100,1100,1100,1100,1100,1100,1100,1100,1100
metal,2180,2180,2180,2180,2180,2180,2180,2180,2180,2180,2180


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_name,section_name,garment_group_name
department_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,694,694,694,694,694,694,694,694,694,694,694
accessories_boys,10,10,10,10,10,10,10,10,10,10,10
accessories_other,1,1,1,1,1,1,1,1,1,1,1
ak_bottoms,90,90,90,90,90,90,90,90,90,90,90
ak_dresses_outdoor,87,87,87,87,87,87,87,87,87,87,87
...,...,...,...,...,...,...,...,...,...,...,...
young_girl_s_t,173,173,173,173,173,173,173,173,173,173,173
young_girl_shoes,287,287,287,287,287,287,287,287,287,287,287
young_girl_swimwear,175,175,175,175,175,175,175,175,175,175,175
young_girl_trouser,408,408,408,408,408,408,408,408,408,408,408


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,section_name,garment_group_name
index_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baby_sizes_50_98,8875,8875,8875,8875,8875,8875,8875,8875,8875,8875,8875
children_accessories_swimwear,4615,4615,4615,4615,4615,4615,4615,4615,4615,4615,4615
children_sizes_134_170,9214,9214,9214,9214,9214,9214,9214,9214,9214,9214,9214
children_sizes_92_140,12007,12007,12007,12007,12007,12007,12007,12007,12007,12007,12007
divided,15149,15149,15149,15149,15149,15149,15149,15149,15149,15149,15149
ladies_accessories,6961,6961,6961,6961,6961,6961,6961,6961,6961,6961,6961
ladieswear,25979,25979,25979,25979,25979,25979,25979,25979,25979,25979,25979
lingeries_tights,6775,6775,6775,6775,6775,6775,6775,6775,6775,6775,6775
menswear,12547,12547,12547,12547,12547,12547,12547,12547,12547,12547,12547
sport,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,garment_group_name
section_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
baby_boy,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717
baby_essentials_complements,4932,4932,4932,4932,4932,4932,4932,4932,4932,4932,4932
baby_girl,1760,1760,1760,1760,1760,1760,1760,1760,1760,1760,1760
boys_underwear_basics,2034,2034,2034,2034,2034,2034,2034,2034,2034,2034,2034
collaborations,559,559,559,559,559,559,559,559,559,559,559
contemporary_casual,1559,1559,1559,1559,1559,1559,1559,1559,1559,1559,1559
contemporary_smart,1778,1778,1778,1778,1778,1778,1778,1778,1778,1778,1778
contemporary_street,1486,1486,1486,1486,1486,1486,1486,1486,1486,1486,1486
denim_men,521,521,521,521,521,521,521,521,521,521,521
divided_accessories,1732,1732,1732,1732,1732,1732,1732,1732,1732,1732,1732


Unnamed: 0_level_0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name
garment_group_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accessories,11519,11519,11519,11519,11519,11519,11519,11519,11519,11519,11519
blouses,5837,5837,5837,5837,5837,5837,5837,5837,5837,5837,5837
dressed,908,908,908,908,908,908,908,908,908,908,908
dresses_ladies,4874,4874,4874,4874,4874,4874,4874,4874,4874,4874,4874
dresses_skirts_girls,1541,1541,1541,1541,1541,1541,1541,1541,1541,1541,1541
jersey_basic,8126,8126,8126,8126,8126,8126,8126,8126,8126,8126,8126
jersey_fancy,21443,21443,21443,21443,21443,21443,21443,21443,21443,21443,21443
knitwear,7489,7489,7489,7489,7489,7489,7489,7489,7489,7489,7489
outdoor,4501,4501,4501,4501,4501,4501,4501,4501,4501,4501,4501
shirts,2114,2114,2114,2114,2114,2114,2114,2114,2114,2114,2114


In [33]:
articles_df.head(100)

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name
0,108775015,strap_top,vest_top,garment_upper_body,solid,black,dark,black,jersey_basic,ladieswear,womens_everyday_basics,jersey_basic
1,108775044,strap_top,vest_top,garment_upper_body,solid,white,light,white,jersey_basic,ladieswear,womens_everyday_basics,jersey_basic
2,108775051,strap_top_(1),vest_top,garment_upper_body,stripe,off_white,dusty_light,white,jersey_basic,ladieswear,womens_everyday_basics,jersey_basic
3,110065001,op_t_shirt_(idro),bra,underwear,solid,black,dark,black,clean_lingerie,lingeries_tights,womens_lingerie,under__nightwear
4,110065002,op_t_shirt_(idro),bra,underwear,solid,white,light,white,clean_lingerie,lingeries_tights,womens_lingerie,under__nightwear
...,...,...,...,...,...,...,...,...,...,...,...,...
95,174057037,fleece_pyjama,pyjama_jumpsuit_playsuit,nightwear,stripe,dark_blue,dark,blue,baby_nightwear,baby_sizes_50_98,baby_essentials_&_complements,under__nightwear
96,174057038,fleece_pyjama,pyjama_jumpsuit_playsuit,nightwear,stripe,light_pink,dusty_light,pink,baby_nightwear,baby_sizes_50_98,baby_essentials_&_complements,under__nightwear
97,174057039,fleece_pyjama,pyjama_jumpsuit_playsuit,nightwear,dot,dark_blue,dark,blue,baby_nightwear,baby_sizes_50_98,baby_essentials_&_complements,under__nightwear
98,174057040,fleece_pyjama,pyjama_jumpsuit_playsuit,nightwear,dot,light_pink,dusty_light,pink,baby_nightwear,baby_sizes_50_98,baby_essentials_&_complements,under__nightwear


In [30]:
articles_df.to_csv('./data/transformed_articles_df.csv', index=False)

### Customers Preprocessing

In [None]:
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [31]:
customers_df.isnull().sum()

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64

In [32]:
customers_df.describe()

Unnamed: 0,FN,Active,age
count,476930.0,464404.0,1356119.0
mean,1.0,1.0,36.38696
std,0.0,0.0,14.31363
min,1.0,1.0,16.0
25%,1.0,1.0,24.0
50%,1.0,1.0,32.0
75%,1.0,1.0,49.0
max,1.0,1.0,99.0


In [33]:
for key in ["club_member_status", "fashion_news_frequency"]:
    for val in customers_df[key].unique():
        print(f"{key}={val}")
        display(customers_df[customers_df[key] == val])

club_member_status=ACTIVE


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371974,ffffaff3905b803d1c7e153a1378a5151e1f34f236ba54...,1.0,1.0,ACTIVE,Regularly,21.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...


club_member_status=nan


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code


club_member_status=PRE-CREATE


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
25,000114c6996ef5703a8d455faf2103f8488d3928348e07...,1.0,1.0,PRE-CREATE,Regularly,40.0,364bc321c321e34c5a94d539f8614609aa3d59fc5bbceb...
34,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,,,PRE-CREATE,NONE,55.0,1fd5eb222d4cc8407054f34fdfc7a353b9b71a7735c4d5...
48,0002697f519fce0a41f12929696aad2cda5b3a05248961...,,,PRE-CREATE,NONE,33.0,475170c26e9e4a76c97dc0f3dea0e8c456a8835726c495...
49,00026b2e6fa592951b3f1f5b24fea406a356d23e84fc80...,,,PRE-CREATE,NONE,60.0,db68cb2c2df9bf5161f3220f4d784e1a56e6da3f67e901...
95,00041bd4c669adc077b6488a2d23ee229a9ad962ab3ee7...,,,PRE-CREATE,NONE,58.0,ea731df2fc7625e27cb20ea1a17bb3255ddb125ef35459...
...,...,...,...,...,...,...,...
1371933,fffd5330e36a95750bd023e6f146ebbc4d2838a6e75cb2...,,,PRE-CREATE,NONE,23.0,0f416f49d3fe0e8da43dc61418fadab9406734832617bb...
1371944,fffdfad0d0527fa55b97f0d2f2ae4b2e659de8345bfd73...,,,PRE-CREATE,NONE,22.0,33abca44736680635058d1041795f5eeb1a88237059465...
1371954,fffebb3ec427e5f7a4220d95878d42d4227cb3bc4cfa7e...,,,PRE-CREATE,NONE,53.0,a9622ca8d61a565ccf41ee677a1eabd2cb65c937d38e21...
1371957,fffec7f7e9bc9270d49b39c870feba01d9367c53b31f54...,,,PRE-CREATE,NONE,47.0,bdcce24e89a9f810e2b10efa54e36013c3795e168ddd18...


club_member_status=LEFT CLUB


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
5166,00fa6e1d4a247e2c81996af566b8aafd5cf766121d6906...,,,LEFT CLUB,NONE,39.0,03054b9069cb7830e9084980e96afc5ee220c395659f5d...
5496,0108c5cb6d8a9103de36474ffc70c508fa9c361fc90b05...,,,LEFT CLUB,NONE,53.0,b1d7e3725c3e52e48b03dd9fb1dca2f48272b43ea12c53...
11359,021c897da6d36da705952b4ecc46e641b811e094d67f68...,,,LEFT CLUB,NONE,19.0,f3ed5b9f7351a622b27771a8a774b18d3fdf72e8a2824d...
11437,02206adfebc3ceec651aee86a3cbb7db83bdbd44aff406...,,,LEFT CLUB,NONE,71.0,4e6b8bb7b24bef6613f27a7dff9d1f7e2529e8fb5afe71...
14862,02c3a111a4fce8b061a6baad19f1ca5322c3bea8386253...,,,LEFT CLUB,NONE,29.0,d9cd3c751184e4e1bdd644ff460a7914f0ea412a929125...
...,...,...,...,...,...,...,...
1363277,fe5d81720a2ad64193c11617c7cfd069fc61d22f369837...,,,LEFT CLUB,NONE,61.0,446e24226e1385f8196498423acf5492cc970c35930124...
1366567,fef8818faad84d92289fec9432ca848e56fb87e76073e9...,,,LEFT CLUB,NONE,25.0,79a0e01f5c80c1a5d4b79599a1cda82d911b7454176128...
1367105,ff128a0ed5bde04a8105c5d24fd2d141bca7cd1c3490c1...,,,LEFT CLUB,NONE,44.0,719b891c5f4670362f8926e240c87dd3da7b253b2d8193...
1370919,ffcc4dd5f7d2dc78a86729c8d6133debd17671cbbc52a8...,,,LEFT CLUB,NONE,45.0,1dde45116c3d44d78ec1a94662a328c997d5b5df137dee...


fashion_news_frequency=NONE


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
6,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,,,ACTIVE,NONE,20.0,fe7b8e2b3fafb89ca90db17ffeeae0fd29b795d803f749...
...,...,...,...,...,...,...,...
1371968,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,,,ACTIVE,NONE,40.0,3b4a300713f8b142836a67caa5b6d5b3f10a7650c06820...
1371972,ffff8f9ecdce722b5bab97fff68a6d1866492209bfe524...,,,ACTIVE,NONE,52.0,0de9d1ec7dc785301ca5fbe8949cfc2cfbd77e7a807270...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...


fashion_news_frequency=Regularly


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
7,00007d2de826758b65a93dd24ce629ed66842531df6699...,1.0,1.0,ACTIVE,Regularly,32.0,8d6f45050876d059c830a0fe63f1a4c022de279bb68ce3...
13,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,1.0,1.0,ACTIVE,Regularly,56.0,b31984b20a8c478de38eaf113c581ff64e63c4242e607b...
14,0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3...,1.0,1.0,ACTIVE,Regularly,29.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
15,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,1.0,1.0,ACTIVE,Regularly,54.0,ca8ca81e8b5794992144273b0eada83a7e09ec728c1093...
...,...,...,...,...,...,...,...
1371971,ffff7d65748db4d52e48b74c8f83ccb0029fc3bbafa511...,1.0,1.0,ACTIVE,Regularly,20.0,fffd995029b0b165c5627978bb20268fa4bb2257560931...
1371973,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...,1.0,1.0,ACTIVE,Regularly,22.0,e89f7536a3d1ff494b6324604db93646fca956d85f8e83...
1371974,ffffaff3905b803d1c7e153a1378a5151e1f34f236ba54...,1.0,1.0,ACTIVE,Regularly,21.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...


fashion_news_frequency=nan


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code


fashion_news_frequency=Monthly


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
5040,00f38d434368a5872c481d64d1f3b2289f3ed4d2cb5d99...,1.0,1.0,ACTIVE,Monthly,48.0,fed79dd607669cb0b9205ef909f87ec433f2c49a44f41b...
6412,01318a0ceef3dd69af82d134a96b6b00ee73693bf72cab...,1.0,1.0,ACTIVE,Monthly,34.0,e7d38730139c0bcbe836cb017cc0423b533b6db34c6807...
6523,0136c28ff3d96e5a5ef25cf5281114d80ab51fc89987eb...,1.0,1.0,ACTIVE,Monthly,34.0,991c917260c114d30d7c8ed76ac4306ef0c9f8d5205911...
6560,01387afda07032df0f33f86bdbd1b09df918241ef7db9b...,1.0,1.0,ACTIVE,Monthly,62.0,114f0d0069a2a2cccfe7b72e917d46412b33103c494742...
9771,01cecf8865bc39b0a7c8af386c2aa0cd7c3fc18d61f344...,1.0,1.0,ACTIVE,Monthly,27.0,f680358c8948524daef52657d1d6f791eb0e4b0e057290...
...,...,...,...,...,...,...,...
1366503,fef5935e8f60e59874ddc8692172b6ac0d61c6dc4241a6...,1.0,1.0,ACTIVE,Monthly,46.0,42f5025eb7aa591c5ae52dbbe2e3cbf6bab47e029a3848...
1366979,ff0c9eab0a7e15a3aa5d541dce7a5ab2bc947f69c8ff00...,1.0,1.0,ACTIVE,Monthly,30.0,5a1f05bfffbc43cf1f875a353cd7e2d8c8794fe2752d08...
1367297,ff1b951242dfefe9ff85631499c425d7ba7eb4e164a1ac...,1.0,1.0,ACTIVE,Monthly,30.0,6ab8da56290ff3d778e9d9e98dd0a0feedeb688664aa9c...
1370406,ffb3d2f2b9b3c371e91acdf8438a5306dc04989e7b6192...,1.0,1.0,ACTIVE,Monthly,26.0,0c264688d9d09cc4f8a797d068fd39bb066b8154f146d4...


fashion_news_frequency=None


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
898114,a79d9cbfaceb0d25a91caccfad167d4d6391fd5bb4292b...,1.0,,ACTIVE,,38.0,58b7c5c27f8b5e8118a9786379601a344ab2e141550d93...
1356686,fd2371195b3d046afbb75fb0b9b6a3386275aaae19cd5a...,1.0,1.0,ACTIVE,,64.0,170a318776b1acf12fb92fe25eb39ff88fd363b530573e...


In [34]:
customers_map_df = customers_df.copy()
customers_map_df["club_member_status"] = customers_map_df["club_member_status"].map({
    "ACTIVE": "active",
    "LEFT CLUB": "left_club",
    "PRE-CREATE": "precreate"
})
customers_map_df["fashion_news_frequency"] = customers_map_df["fashion_news_frequency"].map({
    "Regularly": "regularly",
    "Monthly": "monthly"
})
customer_transformer = ColumnTransformer(
    transformers = [
        (
            'encode_categorical',
            OneHotEncoder(),
            ["club_member_status", "fashion_news_frequency"]
        ),
        (
            'impute_age',
            SimpleImputer(),
            ["age"]
        ),
        (
            'passthrough_metadata',
            'passthrough',
            ["customer_id", "postal_code"]
        ),
        (
            'drop_misc',
            'drop',
            ["FN", "Active"]
        )
    ],
    sparse_threshold=0,
    verbose_feature_names_out=False
).fit(customers_map_df)

transformed_customers_df = pd.DataFrame(customer_transformer.transform(customers_map_df), columns=customer_transformer.get_feature_names_out())
display(transformed_customers_df.isnull().sum())
display(transformed_customers_df.describe())
transformed_customers_df
transformed_customers_df.to_csv('./data/transformed_customers_df.csv', index=False)

club_member_status_active           0
club_member_status_left_club        0
club_member_status_precreate        0
club_member_status_nan              0
fashion_news_frequency_monthly      0
fashion_news_frequency_regularly    0
fashion_news_frequency_nan          0
age                                 0
customer_id                         0
postal_code                         0
dtype: int64

Unnamed: 0,club_member_status_active,club_member_status_left_club,club_member_status_precreate,club_member_status_nan,fashion_news_frequency_monthly,fashion_news_frequency_regularly,fashion_news_frequency_nan,age,customer_id,postal_code
count,1371980.0,1371980.0,1371980.0,1371980.0,1371980.0,1371980.0,1371980.0,1371980.0,1371980,1371980
unique,2.0,2.0,2.0,2.0,2.0,2.0,2.0,85.0,1371980,352899
top,1.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2c29ae653a9282cce4151bd87643c907644e09541abc28...
freq,1272491.0,1371513.0,1279020.0,1365918.0,1371138.0,894564.0,893722.0,67530.0,1,120303


### Spark Join

In [None]:
# TESTING PURPOSES ONLY
# Perform joins on samples and verify output
spark = SparkSession.builder.getOrCreate()

sample_cust_spark_df = spark.read.format("csv").option("header", "true").load("./spark_test_data/cust_sample.csv")
sample_art_spark_df = spark.read.format("csv").option("header", "true").load("./spark_test_data/articles_sample.csv")
sample_txn_spark_df = spark.read.format("csv").option("header", "true").load("./spark_test_data/txn_sample.csv")

joined_customer_transactions_df = sample_txn_spark_df.join(sample_cust_spark_df, on="customer_id")
joined_sample_df = joined_customer_transactions_df.join(sample_art_spark_df, on=['article_id'], how='inner')

joined_sample_df.write.csv("./spark_test_data/output", mode='overwrite', header=True)

In [36]:
# Perform joins on full df
spark = SparkSession.builder.getOrCreate()

spark_customer_df = spark.read.format("csv").option("header", "true").load("./data/transformed_customers_df.csv")
spark_articles_df = spark.read.format("csv").option("header", "true").load("./data/transformed_articles_df.csv")
spark_transactions_df = spark.read.format("csv").option("header", "true").load("./data/transformed_transactions_df.csv")

joined_cust_trans_df = spark_transactions_df.join(spark_customer_df, on="customer_id")
joined_df = joined_cust_trans_df.join(spark_articles_df, on=['article_id'], how='inner')

In [37]:
joined_df.coalesce(1).write.csv("./data/joined_dataframe", mode='overwrite', header=True)

23/04/26 16:02:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 7:=>  (8 + 8) / 23][Stage 8:>    (0 + 0) / 8][Stage 9:>    (0 + 0) / 8]3]

In [26]:
print(joined_df.head(1))



[Row(article_id='0147339034', customer_id='e8769150304d10862f5d4de22e461a4c6e435bd4dcc58fa146949d5e52638c88', t_dat='2018-12-14', price='0.03049152542372881', sales_channel_id='2', FN='1.0', Active='1.0', club_member_status='ACTIVE', fashion_news_frequency='Regularly', age='66', postal_code='9b24635b57ae2535445a00d7534066aa385a17317616d15d61da8527341dddf5', product_code='0147339', prod_name='6P SS BODY', product_type_no='256', product_type_name='Bodysuit', product_group_name='Garment Upper body', graphical_appearance_no='1010001', graphical_appearance_name='All over pattern', colour_group_code='10', colour_group_name='White', perceived_colour_value_id='3', perceived_colour_value_name='Light', perceived_colour_master_id='9', perceived_colour_master_name='White', department_no='6515', department_name='Baby basics', index_code='G', index_name='Baby Sizes 50-98', index_group_no='4', index_group_name='Baby/Children', section_no='44', section_name='Baby Essentials & Complements', garment_gro

                                                                                

In [27]:
print(joined_df.columns)

['article_id', 'customer_id', 't_dat', 'price', 'sales_channel_id', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'product_code', 'prod_name', 'product_type_no', 'product_type_name', 'product_group_name', 'graphical_appearance_no', 'graphical_appearance_name', 'colour_group_code', 'colour_group_name', 'perceived_colour_value_id', 'perceived_colour_value_name', 'perceived_colour_master_id', 'perceived_colour_master_name', 'department_no', 'department_name', 'index_code', 'index_name', 'index_group_no', 'index_group_name', 'section_no', 'section_name', 'garment_group_no', 'garment_group_name', 'detail_desc']
