In [1]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

from matplotlib.ticker import PercentFormatter
# Plot defaults: figure size, white axes/figure backgrounds, black axes edges.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "figure.facecolor": "w",
    }
)
pd.plotting.register_matplotlib_converters()
# Show floats with three decimal places in pandas output.
pd.set_option("display.float_format", lambda value: '%.3f' % value)

In [2]:
# Load the pocket measurements using read_json's default settings.
# NOTE(review): original remark says this deliberately avoids indexing on a
# user id because duplicate entries must be kept — confirm what that refers to.
df = pd.read_json("the-pudding-pockets/measurementRectangles.json")
print(df.columns)

Index(['brand', 'style', 'menWomen', 'name', 'fabric', 'price',
       'maxHeightFront', 'minHeightFront', 'rivetHeightFront', 'maxWidthFront',
       'minWidthFront', 'maxHeightBack', 'minHeightBack', 'maxWidthBack',
       'minWidthBack', 'cutout', 'waistSize', 'updatedStyle', 'group',
       'priceGroup', 'pocketArea', 'rectanglePhone', 'rectanglePen',
       'rectangleWallet', 'rectangleHand', 'rectangeWallet'],
      dtype='object')


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,price,maxHeightFront,minHeightFront,rivetHeightFront,maxWidthFront,minWidthFront,maxHeightBack,minHeightBack,maxWidthBack,minWidthBack,waistSize,pocketArea
count,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0
mean,80.75,18.729,15.654,6.654,15.777,12.81,15.565,13.026,13.521,11.949,0.0,8424.815
std,44.552,4.887,3.504,0.96,1.47,1.049,0.922,0.949,0.865,0.89,0.0,2699.333
min,9.99,11.5,9.5,4.5,12.0,11.0,13.0,10.5,11.5,9.5,0.0,4454.429
25%,49.95,14.0,13.0,6.0,14.5,12.0,15.0,12.425,13.0,11.5,0.0,5905.184
50%,73.975,20.25,15.0,6.5,16.0,12.6,15.5,13.0,13.5,12.0,0.0,8618.737
75%,95.713,22.5,17.0,7.0,17.0,13.5,16.05,13.7,14.0,12.5,0.0,10724.592
max,249.0,28.0,25.0,9.2,19.0,16.0,17.5,15.0,15.5,14.0,0.0,13102.032


## Assumptions -> can become tests

- women's jeans are more expensive
- women's pockets are smaller
- more expensive women's jeans have smaller pockets -> you should carry a bag if your jeans are expensive
- create a data file with some missing values -> so we can do imputation, for example on prices: fill with the average, with 0, or with the average per gender

- columns I would add: is_size_smaller_than_average (per brand / per dataset)



- method: get_pocket_size_score -> return a score based on whether women's pocket sizes are higher than average, maybe per brand in relation to the rest
- homework: write the test

In [4]:
# Cast the categorical columns (brand, style, menWomen) to pandas' category dtype.
for col in ("brand", "style", "menWomen"):
    df[col] = df[col].astype("category")

### Assumption 1

In [5]:
# Distribution of prices; its quartiles motivate the bin edges used below.
df["price"].describe()

count    80.000
mean     80.750
std      44.552
min       9.990
25%      49.950
50%      73.975
75%      95.713
max     249.000
Name: price, dtype: float64

In [6]:
# Hand-picked edges, roughly the min / quartiles / max reported by describe().
bins = [9, 50, 74, 96, 250]
df["price_binned"] = pd.cut(df["price"], bins=bins)

In [7]:
# Assumption 1: women's jeans are more expensive.
# Finding: women's jeans are not really more expensive.
pd.crosstab(index=df["menWomen"], columns=df["price_binned"])

price_binned,"(9, 50]","(50, 74]","(74, 96]","(96, 250]"
menWomen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
men,13,8,9,10
women,12,7,11,10


### Assumption 2

In [8]:
# Distribution of pocket areas; its quartiles motivate the bin edges used below.
df["pocketArea"].describe()

count      80.000
mean     8424.815
std      2699.333
min      4454.429
25%      5905.184
50%      8618.737
75%     10724.592
max     13102.032
Name: pocketArea, dtype: float64

In [9]:
# Hand-picked edges, roughly the min / quartiles / max reported by describe().
bins = [4454, 5906, 8619, 10725, 13103]
df["pocket_binned"] = pd.cut(df["pocketArea"], bins=bins)

In [10]:
# Share of men vs. women within each pocket-area bin (each column sums to 1).
pd.crosstab(
    index=df["menWomen"],
    columns=df["pocket_binned"],
    normalize="columns",
)

pocket_binned,"(4454, 5906]","(5906, 8619]","(8619, 10725]","(10725, 13103]"
menWomen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
men,0.0,0.1,0.9,1.0
women,1.0,0.9,0.1,0.0


### Assumption 3

In [11]:
# Share of each price bin within each pocket-area bin (each column sums to 1).
pd.crosstab(
    index=df["price_binned"],
    columns=df["pocket_binned"],
    normalize="columns",
)

pocket_binned,"(4454, 5906]","(5906, 8619]","(8619, 10725]","(10725, 13103]"
price_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(9, 50]",0.2,0.5,0.3,0.25
"(50, 74]",0.25,0.1,0.1,0.3
"(74, 96]",0.2,0.25,0.4,0.15
"(96, 250]",0.35,0.15,0.2,0.3


## Preparing data for imputation

Set price to NaN for 10% of the rows.

# save dataframe to file

write different imputation methods:
- mean
- avg

In [12]:
# Work on a copy so the original frame keeps its complete price column.
df_miss = df.copy()

import random

# Fixed seed so a fresh Restart & Run All blanks out the same rows every time.
rng = random.Random(42)
ix = list(range(df_miss.shape[0]))
# Resolve the column position by name instead of hard-coding 5, which would
# silently hit a different column if the column order ever changed.
price_pos = df_miss.columns.get_loc("price")
# Set price to NaN for a random 10% of the rows (by position).
for row in rng.sample(ix, int(round(.1 * len(ix)))):
    df_miss.iat[row, price_pos] = np.nan

In [13]:
# Number of rows whose price is now missing.
df_miss["price"].isnull().sum()

8

In [14]:
# Price statistics over the remaining non-missing rows.
df_miss["price"].describe()

count    72.000
mean     78.913
std      43.670
min       9.990
25%      49.900
50%      73.975
75%      95.713
max     249.000
Name: price, dtype: float64

## Imputing with mean value

In [15]:
def impute(series: pd.Series) -> pd.Series:
    """Return ``series`` with its missing values replaced by the series mean.

    The fill value is ``series.mean()``; the result of ``fillna`` is returned,
    so the input series itself is left unchanged.
    """
    return series.fillna(series.mean())

In [16]:
# Overwrite the price column with its mean-imputed version.
df_miss["price"] = impute(df_miss["price"])

In [17]:
# Sanity check: no missing prices should remain after imputation.
df_miss["price"].isnull().sum()

0

## Creating a column with data transformation of another column

columns I would add: is_size_smaller_than_average (per brand / per dataset)

In [18]:
def is_greater_than_average(series: pd.Series) -> pd.Series :
    """Flag each value of ``series`` that is strictly greater than the series mean.

    Parameters
    ----------
    series : pd.Series
        Numeric values to compare against their own mean.

    Returns
    -------
    pd.Series
        Unnamed integer series holding 1 where the value is above the mean and
        0 otherwise. It carries the same index as ``series``, so assigning it
        back to the originating DataFrame aligns row by row even when that
        frame does not use a default 0..n-1 index. Missing values are flagged
        0, because a NaN is not greater than the mean.
    """
    avg = series.mean()
    # Vectorised comparison; NaN > avg evaluates to False, hence 0.
    flags = (series > avg).astype(int)
    # Rebuild on the input's index (without a name) instead of a fresh RangeIndex.
    return pd.Series(flags.to_numpy(), index=series.index)

In [19]:
# Flag column derived from pocketArea by comparing each row with the dataset-wide mean.
df["size_greater_than_average"] = is_greater_than_average(df["pocketArea"])

In [20]:
# Sum of the flags, i.e. how many rows were flagged 1.
is_greater_than_average(df["pocketArea"]).sum()

41

## Pocket size score

method: get_pocket_size_score -> return a score based on whether women's pocket sizes are higher than average, maybe per brand in relation to the rest

In [21]:
# Goal: count the rows for women with "size_greater_than_average" > 0.
# Input:  brand, menWomen, size_greater_than_average -> 80 rows
# Output: brand, score -> fewer rows, about 40

group_keys = ["brand", "menWomen"]
aggr = df.groupby(by=group_keys, as_index=False)["size_greater_than_average"].sum()

In [22]:
def get_sum_score_by_brand_and_gender(
    frame: pd.DataFrame,
    brand_col="brand",
    gender_col="menWomen",
    score_by="size_greater_than_average",
) -> pd.DataFrame:
    """Sum ``score_by`` for every (brand, gender) group in ``frame``.

    Groups by ``brand_col`` and ``gender_col`` with ``as_index=False``, so the
    grouping keys come back as regular columns next to the summed score.
    """
    grouping_columns = [brand_col, gender_col]
    grouped = frame.groupby(by=grouping_columns, as_index=False)
    return grouped[score_by].sum()

In [24]:
# Same aggregation as the inline groupby, now through the reusable helper.
aggr = get_sum_score_by_brand_and_gender(
    df,
    brand_col="brand",
    gender_col="menWomen",
    score_by="size_greater_than_average",
)
aggr

Unnamed: 0,brand,menWomen,size_greater_than_average
0,7 for All Mankind,men,2
1,7 for All Mankind,women,0
2,Abercrombie,men,2
3,Abercrombie,women,2
4,American Eagle,men,2
5,American Eagle,women,0
6,Arizona,men,2
7,Arizona,women,0
8,Buckle Black,men,2
9,Buckle Black,women,0


In [25]:
# Keep only the women's rows of the per-brand aggregate.
aggr[aggr["menWomen"] == "women"]

Unnamed: 0,brand,menWomen,size_greater_than_average
1,7 for All Mankind,women,0
3,Abercrombie,women,2
5,American Eagle,women,0
7,Arizona,women,0
9,Buckle Black,women,0
11,Calvin Klein,women,0
13,Express,women,0
15,Gap,women,0
17,Guess,women,0
19,H&M,women,0


In [26]:
# TODO: remove all rows where a given column equals 0
# TODO: write a test for it

## a less all or nothing score

We sort the rows by pocket area and assign each row its position in that ordering (values run from 1 to the dataframe size).
- brand women score = average position (sum / 2)
- brand men score = average position (sum / 2)
- score = women / men

In [27]:
# Keep only the columns needed for the rank-based score.
df_small = df.loc[:, ["brand", "menWomen", "pocketArea"]]

In [28]:
# Order rows from smallest to largest pocket area, then renumber the index 0..n-1.
df_small = df_small.sort_values(by="pocketArea")
df_small = df_small.reset_index(drop=True)

In [29]:
# 1-based rank taken from the row index; relies on the index having been reset
# after sorting by pocketArea, so rank 1 is the smallest pocket area.
df_small['rank'] = df_small.index + 1

In [30]:
# Display the ranked frame (smallest pocket area first).
df_small

Unnamed: 0,brand,menWomen,pocketArea,rank
0,Gap,women,4454.429,1
1,Guess,women,4490.844,2
2,Guess,women,4572.572,3
3,Gap,women,4612.939,4
4,Calvin Klein,women,4702.028,5
...,...,...,...,...
75,Arizona,men,12667.656,76
76,Arizona,men,12751.060,77
77,Ralph Lauren,men,12792.598,78
78,Wrangler,men,13079.606,79


In [31]:
# Sum the ranks within each (brand, menWomen) group.
rank_groups = df_small.groupby(by=["brand", "menWomen"], as_index=False)
aggr = rank_groups["rank"].sum()

In [32]:
# Display the summed ranks per (brand, menWomen) group.
aggr

Unnamed: 0,brand,menWomen,rank
0,7 for All Mankind,men,131
1,7 for All Mankind,women,37
2,Abercrombie,men,111
3,Abercrombie,women,90
4,American Eagle,men,98
5,American Eagle,women,64
6,Arizona,men,153
7,Arizona,women,62
8,Buckle Black,men,115
9,Buckle Black,women,57
