In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

In [4]:
# Read the three competition tables in one pass: article metadata, customer
# metadata and the purchase log. The order of the paths matches the order of
# the names on the left-hand side.
articles, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
        "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
        "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    )
)

## 1. Articles

In [5]:
# Preview the first five rows of the article metadata.
articles.head(n=5)

Ladieswear가 가장 중요한 부분을 차지, Sportwear가 가장 적은 부분.

In [6]:
# Horizontal histogram: how many articles fall into each index_name category.
fig, ax = plt.subplots(figsize=(15, 7))
# Draw on the axes created above instead of relying on the implicit "current axes".
sns.histplot(data=articles, y='index_name', color='orange', ax=ax)
ax.set(xlabel='count by index name', ylabel='index name')
plt.show()

In [7]:
# Article counts per garment group, with each bar split (stacked) by
# index_group_name.
fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(
    data=articles,
    y='garment_group_name',
    hue='index_group_name',
    multiple='stack',
    color='orange',
    ax=ax,
)
ax.set(xlabel='count by garment group', ylabel='garment group')
plt.show()

In [8]:
# Number of articles in every (index group, index) pair. Only article_id is
# counted, so the other columns are never aggregated.
articles.groupby(['index_group_name', 'index_name'])['article_id'].count()

In [9]:
# Lift the row cap so the full (product group, product type) listing is shown.
# This changes a global pandas display option that stays in effect for later cells.
pd.set_option('display.max_rows', None)
articles.groupby(['product_group_name', 'product_type_name'])['article_id'].count()

In [10]:
# Print the number of distinct values of every article column, skipping any
# column whose name contains the substring 'no', 'code' or 'id'.
for col in articles.columns:
    if any(marker in col for marker in ('no', 'code', 'id')):
        continue
    un_n = articles[col].nunique()
    print(f'n of unique {col}: {un_n}')

## 2. Customers

In [11]:
# Put a row cap (50) back on displayed frames, then preview the customer table.
pd.set_option('display.max_rows', 50)
customers.head(n=5)

In [21]:
# Check whether customer_id identifies a row uniquely: every row beyond the
# number of distinct ids is reported as a duplicate.
# Note: nunique() ignores missing values by default, so rows with a missing
# customer_id would also be counted in n_duplicate.
n_unique = customers['customer_id'].nunique()
n_duplicate = len(customers) - n_unique
print(f'There are {n_duplicate} duplicates in customers.')
print(f'n unique customers: {n_unique}')

In [18]:
# Non-null counts of every column per postal code, most populated code first
# (ordered by the customer_id count).
postal_counts = customers.groupby('postal_code', as_index=False).count()
data_postal = postal_counts.sort_values(by='customer_id', ascending=False)
data_postal.head(n=5)

In [26]:
# data_postal is sorted by customer count (descending), so its first row
# carries the most frequent postal code. Preview the customers registered there.
most_postal_code = data_postal['postal_code'].iloc[0]
customers.loc[customers['postal_code'] == most_postal_code].head(n=5)

In [28]:
# Histogram of customer age in 50 bins. set_style changes the seaborn theme
# globally, so it also affects the figures drawn by later cells.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=customers, x='age', bins=50, color='orange', ax=ax)
ax.set(xlabel='Distribution of the customers age')
plt.show()

In [37]:
# Number of customers per club_member_status value.
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=customers, x='club_member_status', color='orange', ax=ax)
ax.set(xlabel='Distribution of club member status')
plt.show()

In [39]:
# List the raw values found in fashion_news_frequency before any clean-up.
customers.loc[:, 'fashion_news_frequency'].unique()

In [41]:
# Collapse every value other than 'Regularly' / 'Monthly' (including missing
# values) into the single string label 'None'. This edits `customers` in place.
is_known_frequency = customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])
customers.loc[~is_known_frequency, 'fashion_news_frequency'] = 'None'
customers['fashion_news_frequency'].unique()

In [42]:
# Customers per fashion_news_frequency value; the result is a one-column frame
# (customer_id) indexed by the frequency label.
pie_data = customers.groupby('fashion_news_frequency')[['customer_id']].count()

In [46]:
# Pie chart of the customer share per fashion-news subscription frequency.
fig, ax = plt.subplots(figsize=(10, 5))
colors = sns.color_palette('pastel')
ax.pie(pie_data['customer_id'], labels=pie_data.index, colors=colors)
ax.set_facecolor('lightgrey')
ax.set_xlabel('Distribution of fashion news frequency')
plt.show()

## 3. Transactions

In [47]:
# Preview the first five rows of the purchase log.
transactions.head(n=5)

In [48]:
# Show floats with four decimals from here on (global display option), then
# summarise the price column on its own.
pd.options.display.float_format = '{:.4f}'.format
transactions['price'].describe()

In [49]:
# Same preview as before, now rendered with the four-decimal float format.
transactions.head(n=5)

In [50]:
# Box plot of all transaction prices, used to eyeball the outliers.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=transactions, x='price', color='orange', ax=ax)
ax.set(xlabel='Price outliers')
plt.show()

Top 10 customers by number of transactions

In [51]:
# Per-customer non-null counts of every transaction column; any of the columns
# can then serve as that customer's transaction count.
transactions_by_id = transactions.groupby(by='customer_id').count()

In [52]:
# Ten customers with the most transactions (the count of the price column is
# used as the transaction count).
transactions_by_id.sort_values('price', ascending=False).loc[:, 'price'].iloc[:10]

In [53]:
# Attach the descriptive article columns to every transaction. The left join
# keeps all transactions, including any whose article_id has no match in `articles`.
article_columns = ['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'index_name']
transaction_columns = ['customer_id', 'article_id', 'price', 't_dat']
articles_for_merge = transactions[transaction_columns].merge(
    articles[article_columns],
    how='left',
    on='article_id',
)

아래 boxplot으로 Garment Lower/Upper/Full body 그룹에서 가격 분산이 상당히 큼을 알 수 있다. 이건 일부 unique collections와 상대적으로 일상적인 것들의 차이 때문인 것으로 보인다. 일부 고가 악세사리들은 악세사리 그룹에 속해 있다.

In [54]:
# Price distribution per product group, one horizontal box per group.
fig, ax = plt.subplots(figsize=(25, 18))
sns.boxplot(data=articles_for_merge, x='price', y='product_group_name', ax=ax)
ax.set_xlabel('Price outliers', fontsize=22)
# The y axis shows product_group_name, so label it as such (it used to read 'Index names').
ax.set_ylabel('Product group', fontsize=22)
ax.tick_params(axis='both', labelsize=22)
plt.show()

accessories 제품 그룹별 boxplot을 보면 그룹 내 고가의 이유를 찾을 수 있다.  
가장 큰 이상치는 bags에서 찾을 수 있다. 추가적으로, scarf와 다른 accessory들이 다른 의류에 비해 꽤 비싼 가격을 형성하고 있다.

In [55]:
# Price distribution per product type, restricted to the 'Accessories' product group.
fig, ax = plt.subplots(figsize=(25, 18))
# Use a descriptive name rather than `_`, which IPython reserves for the last cell output.
accessories = articles_for_merge[articles_for_merge['product_group_name'] == 'Accessories']
sns.boxplot(data=accessories, x='price', y='product_type_name', ax=ax)
ax.set_xlabel('Price outliers', fontsize=22)
# The y axis shows product_type_name, so label it as such (it used to read 'Index names').
ax.set_ylabel('Product type', fontsize=22)
ax.tick_params(axis='both', labelsize=22)
# The filtered copy is only needed for drawing; drop the reference afterwards.
del accessories
plt.show()

가장 비싼 평균 가격은 Ladieswear, 가장 낮은 평균 가격은 children.

In [59]:
# Mean transaction price per index_name, drawn as horizontal bars.
articles_index = articles_for_merge.groupby('index_name')[['price']].mean()
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=articles_index['price'],
    y=articles_index.index,
    color='orange',
    alpha=0.8,
    ax=ax,
)
ax.set(xlabel='Price by index', ylabel='Index')
plt.show()

In [61]:
# Mean transaction price per product group, drawn as horizontal bars.
# Note: this rebinds `articles_index`, which now holds product-group means.
articles_index = articles_for_merge.groupby('product_group_name')[['price']].mean()
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=articles_index['price'],
    y=articles_index.index,
    color='orange',
    alpha=0.8,
    ax=ax,
)
ax.set(xlabel='Price by product group', ylabel='Product group')
plt.show()

시간에 따른 제품 그룹별 평균 가격 변화 top5

In [63]:
# Convert the transaction date column to datetime so it can be grouped by
# calendar period later on.
purchase_dates = pd.to_datetime(articles_for_merge['t_dat'])
articles_for_merge['t_dat'] = purchase_dates

In [65]:
# Monthly mean price, with a +/- 2 standard deviation band, for five product
# groups, laid out on a 3x2 grid. The sixth, unused panel is hidden.
product_list = ['Shoes', 'Garment Full body', 'Bags', 'Garment Lower body', 'Underwear/nightwear']
colors = ['cadetblue', 'orange', 'mediumspringgreen', 'tomato', 'lightseagreen']
fig, ax = plt.subplots(3, 2, figsize=(20, 15))
panels = ax.ravel()  # row-major order: (0, 0), (0, 1), (1, 0), ...

for panel, product, color in zip(panels, product_list, colors):
    product_prices = articles_for_merge.loc[
        articles_for_merge['product_group_name'] == product, ['t_dat', 'price']
    ]
    # Group once and reuse the grouping for both statistics.
    # NOTE: newer pandas releases deprecate the 'M' alias in favour of 'ME';
    # 'M' is kept so the cell keeps working on older pandas versions.
    monthly_price = product_prices.groupby(pd.Grouper(key='t_dat', freq='M'))['price']
    # Months with no sales (mean) or a single sale (std) yield NaN; plot them as 0.
    series_mean = monthly_price.mean().fillna(0)
    series_std = monthly_price.std().fillna(0)

    lower_band = (series_mean - 2 * series_std).to_numpy()
    upper_band = (series_mean + 2 * series_std).to_numpy()
    panel.plot(series_mean, linewidth=4, color=color)
    panel.fill_between(series_mean.index, lower_band, upper_band, color=color, alpha=0.1)
    panel.set_title(f'Mean {product} price in time')
    panel.set_xlabel('month')
    # This used to be a second set_xlabel call, which silently replaced 'month'.
    panel.set_ylabel(product)

# Hide grid cells that have no product assigned, without relying on an
# IndexError to detect the end of the list.
for panel in panels[len(product_list):]:
    panel.set_visible(False)
plt.show()

## 4. Images with description and price

In [67]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [69]:
# Five most expensive purchases of the last day in the data and five cheapest
# purchases of the first day, keeping only article_id and price.
last_day = transactions.t_dat.max()
first_day = transactions.t_dat.min()

max_price_ids = (
    transactions.loc[transactions.t_dat == last_day]
    .sort_values('price', ascending=False)
    [['article_id', 'price']]
    .iloc[:5]
)
min_price_ids = (
    transactions.loc[transactions.t_dat == first_day]
    .sort_values('price', ascending=True)
    [['article_id', 'price']]
    .iloc[:5]
)

In [71]:
# One panel per row of max_price_ids: product image, price as the title and the
# line-wrapped article description underneath.
fig, ax = plt.subplots(1, 5, figsize=(20, 10))
price_rows = max_price_ids[['article_id', 'price']].itertuples(index=False, name=None)
for panel, (article_id, price) in zip(ax, price_rows):
    # itertuples keeps article_id an integer. iterrows would upcast the whole
    # row to float, and the image path would then be sliced from a float's repr.
    article_id = int(article_id)
    desc = articles.loc[articles['article_id'] == article_id, 'detail_desc'].iloc[0]
    # A missing description is NaN (a float) and has no .split(); show it as empty.
    desc_words = [] if pd.isna(desc) else str(desc).split(' ')
    # Add a line break after the words at positions 5, 10, 15, ... so the
    # caption stays roughly as wide as the image.
    desc = ' '.join(
        word + '\n' if pos > 0 and pos % 5 == 0 else word
        for pos, word in enumerate(desc_words)
    )
    # Image files are named '0<article_id>.jpg' and stored in a folder named
    # after the first three characters of that file name.
    image_name = f'0{article_id}'
    img = mpimg.imread(f'../input/h-and-m-personalized-fashion-recommendations/images/{image_name[:3]}/{image_name}.jpg')
    panel.imshow(img)
    panel.set_title(f'price: {price:.2f}')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
    panel.set_xlabel(desc, fontsize=10)
plt.show()

In [77]:
# One panel per row of min_price_ids: product image, price as the title and the
# line-wrapped article description underneath.
fig, ax = plt.subplots(1, 5, figsize=(20, 10))
price_rows = min_price_ids[['article_id', 'price']].itertuples(index=False, name=None)
for panel, (article_id, price) in zip(ax, price_rows):
    # itertuples keeps article_id an integer. iterrows would upcast the whole
    # row to float, and the image path would then be sliced from a float's repr.
    article_id = int(article_id)
    desc = articles.loc[articles['article_id'] == article_id, 'detail_desc'].iloc[0]
    # A missing description is NaN (a float) and has no .split(); show it as empty.
    desc_words = [] if pd.isna(desc) else str(desc).split(' ')
    # Add a line break after the words at positions 5, 10, 15, ... so the
    # caption stays roughly as wide as the image.
    desc = ' '.join(
        word + '\n' if pos > 0 and pos % 5 == 0 else word
        for pos, word in enumerate(desc_words)
    )
    # Image files are named '0<article_id>.jpg' and stored in a folder named
    # after the first three characters of that file name.
    image_name = f'0{article_id}'
    img = mpimg.imread(f'../input/h-and-m-personalized-fashion-recommendations/images/{image_name[:3]}/{image_name}.jpg')
    panel.imshow(img)
    panel.set_title(f'price: {price:.2f}')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
    panel.set_xlabel(desc, fontsize=10)
plt.show()