# データに関する確認

In [1]:
import pathlib
import pandas as pd

## 各ファイルの概要
- articles.csv
- customers.csv
- transactions_train.csv
- sample_submission.csv

### 商品属性データ(articles.csv)
- 全体  
article_id：SKUと同じ認識  

- カテゴリ系  
product_codeとprod_name：商品名(例えば、tシャツ)  
product_type_noとproduct_type_name：小カテゴリ  
product_group_name：中カテゴリ  
department_noとdepartment_name:大カテゴリ  
garment_group_noとgarment_group_name：大カテゴリとほぼ同じ、もう一個上の感じ

- 色系  
graphical_appearance_noとgraphical_appearance_name：写真にある商品色の純度(純色か縞か)  
colour_group_codeとcolour_group_name:商品の実際の色  
perceived_colour_master_idとperceived_colour_master_name:商品の感知された色  
perceived_colour_value_idとperceived_colour_value_name:商品の感知された色の深さ(darkかlight)  

- 性別系  
index_group_noとindex_group_name：想定の使用者性別  
index_codeとindex_name：性別の中の分類(アウトかインナーか)  
section_noとsection_name:性別＋性別の中の分類  

- 解説系  
detail_desc:商品の詳細の説明文

In [3]:
article = pd.read_csv("../../data/articles.csv")
article.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [4]:
article[['perceived_colour_value_id','perceived_colour_value_name','perceived_colour_master_id','perceived_colour_master_name']].head()

Unnamed: 0,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name
0,4,Dark,5,Black
1,3,Light,9,White
2,1,Dusty Light,9,White
3,4,Dark,5,Black
4,3,Light,9,White


In [7]:
article.shape

(105542, 25)

In [5]:
article.dtypes

article_id                       int64
product_code                     int64
prod_name                       object
product_type_no                  int64
product_type_name               object
product_group_name              object
graphical_appearance_no          int64
graphical_appearance_name       object
colour_group_code                int64
colour_group_name               object
perceived_colour_value_id        int64
perceived_colour_value_name     object
perceived_colour_master_id       int64
perceived_colour_master_name    object
department_no                    int64
department_name                 object
index_code                      object
index_name                      object
index_group_no                   int64
index_group_name                object
section_no                       int64
section_name                    object
garment_group_no                 int64
garment_group_name              object
detail_desc                     object
dtype: object

In [14]:
article.agg({'article_id':'nunique','product_code':'nunique','product_group_name':'nunique','department_no':'nunique','garment_group_no':'nunique','graphical_appearance_no':'nunique',
          'colour_group_code':'nunique','perceived_colour_master_id':'nunique','perceived_colour_value_id':'nunique','index_group_no':'nunique','index_code':'nunique'})

article_id                    105542
product_code                   47224
product_group_name                19
department_no                    299
garment_group_no                  21
graphical_appearance_no           30
colour_group_code                 50
perceived_colour_master_id        20
perceived_colour_value_id          8
index_group_no                     5
index_code                        10
dtype: int64

### 消費者属性データ(customers.csv)
- customer_id:消費者ID
- FN: Fashion News newsletterを受信するか
- Active:アカウントのコミュニケーション可能性
- club_member_status: 活躍か
- fashion_news_frequency: fashion_newsを受信する頻度
- age:年齢
- postal_code: 郵便番号

In [5]:
customers = pd.read_csv("../../data/customers.csv")
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [8]:
customers.shape

(1371980, 7)

In [10]:
customers.dtypes

customer_id                object
FN                        float64
Active                    float64
club_member_status         object
fashion_news_frequency     object
age                       float64
postal_code                object
dtype: object

### 購買履歴データ
- 誰がいつ、どのチャネルで何の商品をいくらの値段で買ったか
- sales_channel_id: 2はオンライン、1はリアルストア

In [6]:
transactions = pd.read_csv("../../data/transactions_train.csv")
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [11]:
transactions.shape

(31788324, 5)

In [12]:
transactions.dtypes

t_dat                object
customer_id          object
article_id            int64
price               float64
sales_channel_id      int64
dtype: object

In [13]:
transactions['t_dat'].describe

<bound method NDFrame.describe of 0           2018-09-20
1           2018-09-20
2           2018-09-20
3           2018-09-20
4           2018-09-20
               ...    
31788319    2020-09-22
31788320    2020-09-22
31788321    2020-09-22
31788322    2020-09-22
31788323    2020-09-22
Name: t_dat, Length: 31788324, dtype: object>

### 提出用のデータ
- prediction欄の枠に12個のarticelIDが入るリストをstrに変換する形

In [7]:
sample_submission = pd.read_csv("../../data/sample_submission.csv")

In [15]:
sample_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [8]:
type(sample_submission['prediction'][0])

str

In [17]:
sample_submission.shape

(1371980, 2)