# 構造化データのニューラルネットワークによる分類
各社に大量に眠っている構造化データを活用し、新しいサービスに展開する

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

PATH='data/csv/'

# ❶データ分析 (Create datasets and Analyse data)


###### 生データーをセットする 

In [3]:
table_names = ['train', 'test']

In [4]:
# Load each raw CSV named in table_names from PATH, keeping the same order.
tables = [
    pd.read_csv(f'{PATH}{table_name}.csv', low_memory=False)
    for table_name in table_names
]

In [5]:
from IPython.display import HTML

#### データーセット確認

Feature Space:
* item_id: Unique ID
* category_class: Item category
* sold_price: Price when the item is sold
* price: Price when the item is listed
* area_name: Where the item is listed
* condition: Item condition(Fair/Good/Like New)
* size: Shipping size
* listing_at: listing datetime
* item_tag_hash: Hashed item tag


In [6]:
for t in tables: display(t.head())

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
0,7966,0.0,1164,1162,fff,Fair,7,2017-02-01 16:11:18.978516,3ca192bd7558780793444f73366c58d60c9d7775
1,1850,0.0,1005,1004,fff,Fair,3,2017-02-02 14:42:51.693295,fbaacb960902382e4f6c96f2d8f225c24eecadb4
2,7582,2.0,944,944,aaa,Fair,5,2017-02-03 01:55:53.406374,785a7925363bf133a7c5413c563f331c5e02cc69
3,6560,0.0,1068,1067,fff,Fair,7,2017-01-30 23:34:02.268603,f8997252c6e5ae3d950b736e1a81160a2e937a7f
4,7368,3.0,1407,1407,ddd,Good,15,2017-02-03 13:38:37.845010,ca59bcd3da4daac52f3fcfdc0ab963f65cf421bc


Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
0,6000,,1006,1006,aaa,Like New,4,2017-02-01 13:13:59.048372,dd01903921ea24941c26a48f2cec24e0bb0e8cc7
1,5532,,1149,1147,fff,Fair,3,2017-02-01 08:19:21.532519,784e9240155834852dff458a730cceb50229df32
2,6797,,1044,1042,ddd,Like New,21,2017-02-02 13:27:40.620084,7c9fe6831f52e30e0ede4f8c54fd9bba673e8d8b
3,3325,,1547,1545,kkk,Like New,20,2017-02-02 20:59:30.470107,54c1792c99a96a96a2881600f0cce1d81061e8b8
4,5447,,1015,1014,ddd,Like New,5,2017-01-30 17:43:31.962058,081be7c370bf9e7b4c6e696276c1b2d57623b26b


#### データー値に対し、属性と分布など確認と分析

In [7]:
for t in tables: display(DataFrameSummary(t).summary())

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
count,700,700,700,700,,,700,,
mean,5529.53,1.49857,1092.12,1118.31,,,9.40714,,
std,2590.82,1.35902,201.949,298.301,,,7.8042,,
min,1041,0,646,645,,,0,,
25%,3309.25,0,948.75,948,,,4,,
50%,5556.5,1,1059,1058,,,7,,
75%,7617.5,2,1188.25,1193.75,,,12,,
max,9990,4,1874,3181,,,59,,
counts,700,700,700,700,700,700,700,700,700
uniques,700,5,453,446,10,3,41,700,188


Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
count,300,0,300,300,,,300,,
mean,5455.23,,1099.69,1104.9,,,9.34333,,
std,2638.17,,214.623,235.455,,,7.8786,,
min,1006,,685,684,,,0,,
25%,3090.75,,962.75,961.75,,,4,,
50%,5389,,1051,1050.5,,,7,,
75%,7683.75,,1198,1196.25,,,13,,
max,9981,,1700,2570,,,40,,
counts,300,0,300,300,300,300,300,300,300
uniques,300,0,245,243,10,3,33,300,146


# ❷特徴抽出/特徴エンジニアリング (Data Preprocessing / Feature Engineering)

#### ニューラルネットワークで処理できるように、Pandas を利用してデータを前処理する
The hash column (`item_tag_hash`) should be dropped; it is not useful for item classification.

The time feature (`listing_at`) should also be tested to see whether it improves prediction performance.

In [8]:
train,test = tables

In [9]:
len(train),len(test)

(700, 300)

In [10]:
# Shift `size` by a tiny constant in both frames.
# NOTE(review): the reason for the 0.0001 offset is not stated in the notebook;
# presumably it avoids exact zeros in the column - confirm.
for frame in (train, test):
    frame["size"] = frame["size"].add(.0001)

# The labels are read from the CSV as floats (0.0, 2.0, ...); store them as ints.
train["category_class"] = train["category_class"].astype(int)

In [11]:
train.head().T.head(40)

Unnamed: 0,0,1,2,3,4
item_id,7966,1850,7582,6560,7368
category_class,0,0,2,0,3
sold_price,1164,1005,944,1068,1407
price,1162,1004,944,1067,1407
area_name,fff,fff,aaa,fff,ddd
condition,Fair,Fair,Fair,Fair,Good
size,7.0001,3.0001,5.0001,7.0001,15.0001
listing_at,2017-02-01 16:11:18.978516,2017-02-02 14:42:51.693295,2017-02-03 01:55:53.406374,2017-01-30 23:34:02.268603,2017-02-03 13:38:37.845010
item_tag_hash,3ca192bd7558780793444f73366c58d60c9d7775,fbaacb960902382e4f6c96f2d8f225c24eecadb4,785a7925363bf133a7c5413c563f331c5e02cc69,f8997252c6e5ae3d950b736e1a81160a2e937a7f,ca59bcd3da4daac52f3fcfdc0ab963f65cf421bc


In [12]:
display(DataFrameSummary(train).summary())

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
count,700,700,700,700,,,700,,
mean,5529.53,1.49857,1092.12,1118.31,,,9.40724,,
std,2590.82,1.35902,201.949,298.301,,,7.8042,,
min,1041,0,646,645,,,0.0001,,
25%,3309.25,0,948.75,948,,,4.0001,,
50%,5556.5,1,1059,1058,,,7.0001,,
75%,7617.5,2,1188.25,1193.75,,,12.0001,,
max,9990,4,1874,3181,,,59.0001,,
counts,700,700,700,700,700,700,700,700,700
uniques,700,5,453,446,10,3,41,700,188


In [13]:
test.head().T.head(40)

Unnamed: 0,0,1,2,3,4
item_id,6000,5532,6797,3325,5447
category_class,,,,,
sold_price,1006,1149,1044,1547,1015
price,1006,1147,1042,1545,1014
area_name,aaa,fff,ddd,kkk,ddd
condition,Like New,Fair,Like New,Like New,Like New
size,4.0001,3.0001,21.0001,20.0001,5.0001
listing_at,2017-02-01 13:13:59.048372,2017-02-01 08:19:21.532519,2017-02-02 13:27:40.620084,2017-02-02 20:59:30.470107,2017-01-30 17:43:31.962058
item_tag_hash,dd01903921ea24941c26a48f2cec24e0bb0e8cc7,784e9240155834852dff458a730cceb50229df32,7c9fe6831f52e30e0ede4f8c54fd9bba673e8d8b,54c1792c99a96a96a2881600f0cce1d81061e8b8,081be7c370bf9e7b4c6e696276c1b2d57623b26b


#### 今度はすべての特徴をエンジニアリングし、ニューラルネットワークと互換性のある入力に変換する必要があります。
##### 例えば、カテゴリ変数を連番の整数に変換し、連続値の特徴データを標準化（平均0・標準偏差1）するなど

##### カテゴリ変数と連続変数を識別し、 item_idは各行の識別子として機能します。

In [14]:
# Column roles used by the preprocessing and model cells.
cat_vars = ['area_name', 'condition']          # categorical inputs
drop_vars = ['listing_at','item_tag_hash']     # raw columns not selected for the model
contin_vars = ['sold_price', 'price', 'size']  # continuous inputs
index = 'item_id'                              # per-row identifier
n = len(train)

# The previous `df.set_index(index)` loop was removed: set_index returns a new
# frame, and the result was discarded, so it had no effect. `item_id` has to stay
# a regular column because the following cells select it by name.
# NOTE(review): as a regular column, item_id is also standardized by proc_df and
# fed to the network as a continuous input; consider excluding it from the features.


In [15]:
# Name of the target (dependent) column.
dep = 'category_class'

# Keep the model inputs, the target and the row identifier; .copy() detaches the
# result from the raw table.
kept_columns = cat_vars + contin_vars + [dep, 'item_id']
train = train[kept_columns].copy()

In [16]:
# The test rows have no labels, so the target column is filled with a placeholder 0
# to give the test frame the same columns as the training frame.
test[dep] = 0
test_columns = cat_vars + contin_vars + [dep, 'item_id']
test = test[test_columns].copy()

In [17]:
# Convert every categorical column of the training frame to an ordered pandas category.
for col in cat_vars:
    as_category = train[col].astype('category')
    train[col] = as_category.cat.as_ordered()

In [18]:
apply_cats(test, train)

In [19]:
# Store all continuous columns as float32 in both the training and the test frame.
for frame in (train, test):
    for col in contin_vars:
        frame[col] = frame[col].astype('float32')

In [20]:
train.head()

Unnamed: 0,area_name,condition,sold_price,price,size,category_class,item_id
0,fff,Fair,1164.0,1162.0,7.0001,0,7966
1,fff,Fair,1005.0,1004.0,3.0001,0,1850
2,aaa,Fair,944.0,944.0,5.0001,2,7582
3,fff,Fair,1068.0,1067.0,7.0001,0,6560
4,ddd,Good,1407.0,1407.0,15.0001,3,7368


In [21]:
test.head()

Unnamed: 0,area_name,condition,sold_price,price,size,category_class,item_id
0,aaa,Like New,1006.0,1006.0,4.0001,0,6000
1,fff,Fair,1149.0,1147.0,3.0001,0,5532
2,ddd,Like New,1044.0,1042.0,21.000099,0,6797
3,kkk,Like New,1547.0,1545.0,20.000099,0,3325
4,ddd,Like New,1015.0,1014.0,5.0001,0,5447


In [22]:
len(train)

700

In [23]:
# Preprocess the training frame: the target column `dep` is split off into `y`,
# and `df` holds the remaining columns with categoricals as integer codes and
# continuous columns scaled (do_scale=True). `nas` and `mapper` are kept so the
# same preprocessing can be applied to the test frame later.
df, y, nas, mapper = proc_df(train, dep, do_scale=True)

In [24]:
y

array([0, 0, 2, 0, 3, 2, 1, 2, 0, 4, 4, 0, 1, 2, 0, 0, 1, 0, 2, 2, ..., 3, 4, 4, 3, 4, 2, 1, 4, 2, 1, 0, 2,
       3, 4, 0, 0, 0, 1, 1, 3])

In [25]:
df.head(3)

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
0,6,1,0.356165,0.146558,-0.308663,0.941096
1,6,1,-0.431726,-0.383487,-0.821574,-1.421234
2,1,1,-0.733998,-0.58477,-0.565118,0.792774


In [26]:
# Preprocess the test frame, passing in the mapper and NA dictionary returned for
# the training frame - presumably so the test set is scaled with the training-set
# statistics rather than its own. The placeholder target returned here is ignored.
df_test, _, nas, mapper = proc_df(test, dep, do_scale=True, mapper=mapper, na_dict=nas)

In [27]:
df_test.head(3)

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
0,1,3,-0.42677,-0.376777,-0.693346,0.18172
1,6,1,0.281835,0.096237,-0.821574,0.000953
2,4,3,-0.238469,-0.256008,1.486526,0.489565


In [28]:
mapper

DataFrameMapper(default=False, df_out=False,
        features=[(['sold_price'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['price'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['size'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['item_id'], StandardScaler(copy=True, with_mean=True, with_std=True))],
        input_df=False, sparse=False)

In [29]:
display(DataFrameSummary(df_test).summary())

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
count,300,300,300,300,300,300
mean,5.44,2.11333,0.0374904,-0.0449963,-0.00818214,-0.0287005
std,2.87645,0.758776,1.06352,0.789884,1.01026,1.019
min,1,1,-2.01742,-1.457,-1.20626,-1.74723
25%,3,2,-0.641086,-0.525224,-0.693346,-0.941989
50%,5,2,-0.203782,-0.227493,-0.308663,-0.0542808
75%,8,3,0.524644,0.261457,0.460704,0.832076
max,10,3,3.0122,4.86999,3.92285,1.7194
counts,300,300,300,300,300,300
uniques,10,3,245,243,33,300


In [30]:
display(DataFrameSummary(df).summary())

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
count,700,700,700,700,700,700
mean,5.52571,1.98857,8.97259e-08,-3.11847e-08,1.422e-08,4.21885e-17
std,2.87422,0.782415,1.00072,1.00072,1.00072,1.00072
min,1,1,-2.21067,-1.58783,-1.20626,-1.73371
25%,3,1,-0.71046,-0.571351,-0.693346,-0.857593
50%,6,2,-0.16414,-0.202332,-0.308663,0.0104167
75%,8,3,0.47633,0.25307,0.332476,0.806486
max,10,3,3.87442,6.91972,6.35918,1.72287
counts,700,700,700,700,700,700
uniques,10,3,453,446,41,700


# ❸モデル学習 (Training model)

#### Pytorchを利用して、ニューラルネットワークの分類モデルを生成する。

In [31]:
n = len(train); n

700

#### 訓練データの10％を検証セットとする

In [32]:
# Hold out a contiguous block of training rows as the validation set.
train_ratio = 0.9
# Derive the split sizes from the actual number of rows instead of a hard-coded 700.
train_size = int(len(df) * train_ratio)
val_size = len(df) - train_size
# First row of the validation window (rows val_start .. val_start + val_size - 1).
val_start = 70
val_idx = list(range(val_start, val_start + val_size))
# Alternatives that were tried:
#   val_idx = list(range(train_size, len(df)))      # last rows
#   val_idx = list(range(0, len(df)-train_size))    # first rows
#   val_idx = get_cv_idxs(n, val_pct=0.1)           # random sample

In [33]:
df_test.head(2)

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
0,1,3,-0.42677,-0.376777,-0.693346,0.18172
1,6,1,0.281835,0.096237,-0.821574,0.000953


In [34]:
df.head(2)

Unnamed: 0,area_name,condition,sold_price,price,size,item_id
0,6,1,0.356165,0.146558,-0.308663,0.941096
1,6,1,-0.431726,-0.383487,-0.821574,-1.421234


In [35]:
# Wrap the preprocessed frames in a fastai model-data object: rows in val_idx form
# the validation set, cat_vars are treated as categorical fields, and df_test is
# attached as the test set. is_reg=False / is_multi=False select single-label
# classification.
targets = y.astype('int')
md = ColumnarModelData.from_data_frame(
    PATH, val_idx, df, targets,
    cat_flds=cat_vars,
    bs=64,
    is_reg=False,
    is_multi=False,
    test_df=df_test,
)

In [36]:
# (column name, number of categories + 1) for every categorical column.
# The encoded values in df start at 1, so the extra slot leaves index 0 free -
# presumably for missing/unknown categories.
cat_sz = [
    (col, len(train[col].cat.categories) + 1)
    for col in cat_vars
]

In [37]:
cat_sz

[('area_name', 11), ('condition', 4)]

#### カテゴリ特徴データを embeddings に変換し、ニューラルネットワークに学習させます。

In [38]:
# (cardinality, embedding width) per categorical column: the width is half the
# cardinality, rounded up, and capped at 50.
emb_szs = [
    (cardinality, min(50, (cardinality + 1) // 2))
    for _, cardinality in cat_sz
]

In [39]:
emb_szs

[(11, 6), (4, 2)]

In [40]:
torch.cuda.is_available()

True

In [41]:
len(df.columns)-len(cat_vars)

4

In [42]:
# Hyper-parameters of the mixed-input network, passed positionally to get_learner.
n_cont = len(df.columns) - len(cat_vars)  # number of non-categorical input columns
emb_drop = 0.06                           # dropout applied to the embeddings
n_classes = 5                             # width of the output layer
layer_sizes = [100, 50]                   # hidden linear layer widths
layer_drops = [0.03, 0.06]                # dropout after each hidden layer
# NOTE(review): the trailing None / True are presumably y_range and use_bn in the
# fastai signature - confirm against the installed version.
m = md.get_learner(emb_szs, n_cont, emb_drop, n_classes, layer_sizes, layer_drops, None, True)

#### 生成したニューラルネットワークのモデル情報

In [43]:
m

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(11, 6)
    (1): Embedding(4, 2)
  )
  (lins): ModuleList(
    (0): Linear(in_features=12, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=50, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True)
  )
  (outp): Linear(in_features=50, out_features=5, bias=True)
  (emb_drop): Dropout(p=0.06)
  (drops): ModuleList(
    (0): Dropout(p=0.03)
    (1): Dropout(p=0.06)
  )
  (bn): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True)
)

#### モデルを訓練する

In [44]:
#m.lr_find()
#m.sched.plot(100)
lr = .001

In [45]:
m.fit(lr, 3)

epoch      trn_loss   val_loss                           
    0      1.584219   1.500864  
    1      1.52992    1.406571                   
    2      1.478474   1.315451                   



[array([1.31545])]

In [46]:
m.fit(lr, 5, cycle_len=1)

epoch      trn_loss   val_loss                   
    0      1.34461    1.273409  
    1      1.320954   1.222514                   
    2      1.29465    1.175169                   
    3      1.271458   1.13659                    
    4      1.249953   1.10072                    



[array([1.10072])]

In [47]:
m.fit(lr, 3, cycle_len=4, cycle_mult=2 )

epoch      trn_loss   val_loss                   
    0      1.173612   1.060457  
    1      1.148898   1.02734                    
    2      1.135133   1.015316                   
    3      1.125446   1.012905                   
    4      1.116269   0.978946                   
    5      1.105387   0.959782                   
    6      1.091594   0.943226                   
    7      1.077832   0.932805                   
    8      1.066356   0.928923                   
    9      1.055314   0.924726                   
    10     1.047221   0.924321                   
    11     1.040968   0.924193                   
    12     1.035887   0.914931                   
    13     1.030873   0.906764                   
    14     1.023861   0.900982                   
    15     1.016582   0.892159                   
    16     1.009121   0.889899                   
    17     1.00014    0.889984                   
    18     0.996287   0.884955                    
    19     0.987

[array([0.87797])]

In [48]:
lr = .001
m.fit(lr, 3, cycle_len=4, cycle_mult=2 )

epoch      trn_loss   val_loss                    
    0      0.955321   0.874549  
    1      0.954342   0.87321                     
    2      0.947974   0.871858                    
    3      0.943007   0.871327                    
    4      0.940072   0.873203                    
    5      0.93679    0.870581                    
    6      0.934371   0.866401                    
    7      0.932308   0.864541                    
    8      0.929752   0.864459                    
    9      0.926266   0.86364                     
    10     0.924161   0.863211                    
    11     0.923041   0.863191                    
    12     0.921245   0.862878                    
    13     0.920835   0.853788                    
    14     0.915697   0.860107                    
    15     0.908872   0.857884                    
    16     0.909275   0.852607                    
    17     0.910392   0.851241                    
    18     0.909803   0.858395                   

[array([0.85299])]

#### ニューラルネットワークを保存

In [57]:
m.save('classification_nn_model')

In [58]:
m.load('classification_nn_model')

# ❹モデル評価 (Validation)

In [49]:
(x,y1)=m.predict_with_targs()

In [50]:
x.shape,y1.shape

((70, 5), (70,))

In [51]:
x[1],y1[1]

(array([-1.18856, -3.34624, -0.6783 , -4.55503, -1.95099], dtype=float32), 0)

#### 実値と予測のモデル性能検証

In [52]:
(np.argmax(x,axis=1),y1)

(array([2, 2, 0, 0, 2, 0, 2, 2, 1, 1, 2, 0, 3, 1, 4, 1, 0, 3, 2, 0, ..., 0, 1, 0, 0, 2, 2, 1, 0, 0, 0, 3, 0,
        3, 0, 2, 2, 0, 0, 0, 0]),
 array([2, 0, 2, 0, 2, 4, 2, 4, 1, 1, 2, 0, 3, 1, 0, 1, 0, 3, 0, 0, ..., 2, 1, 0, 0, 0, 2, 3, 2, 0, 2, 3, 0,
        3, 4, 2, 2, 0, 0, 2, 0]))

In [53]:
y1.shape

(70,)

In [54]:
# Pair each validation row's true label with the label predicted by the model.
# This assumes the rows of `x` follow the order of val_idx.
val = train.iloc[val_idx]
pred_labels = np.argmax(x, axis=1)  # highest-scoring class per validation row
valpred = pd.DataFrame({'item_id': val.item_id, 'category_class': val.category_class, 'category_class_pred': pred_labels})[['item_id', 'category_class','category_class_pred']]
valpred.head(10)

Unnamed: 0,item_id,category_class,category_class_pred
70,4196,2,2
71,4297,0,2
72,3246,2,0
73,1082,0,0
74,3657,2,2
75,9408,4,0
76,9601,2,2
77,8520,4,2
78,1216,1,1
79,4339,1,1


In [55]:
valpred.loc[valpred.category_class == valpred.category_class_pred].shape

(46, 3)

In [56]:
valpred.head(4)

Unnamed: 0,item_id,category_class,category_class_pred
70,4196,2,2
71,4297,0,2
72,3246,2,0
73,1082,0,0


### Accuracy (正解率)

##### Accuracy = (TP + TN) / (TP + TN + FP + FN)

In [59]:
from sklearn.metrics import accuracy_score

In [60]:
accuracy_score(y1,np.argmax(x,axis=1))

0.6571428571428571

### 正解率は65.7%

In [61]:
46/70

0.6571428571428571

### Confusion matrix (多クラスの混同行列)

In [62]:
from sklearn.metrics import confusion_matrix

In [63]:
confusion_matrix(y1,np.argmax(x,axis=1))

array([[18,  0,  6,  0,  1],
       [ 0, 10,  1,  1,  0],
       [ 5,  0, 11,  0,  0],
       [ 1,  3,  0,  6,  0],
       [ 5,  0,  1,  0,  1]])

confusion_matrix(y1,np.argmax(x,axis=1), labels=[0,1,2,3,4])  # labels must be the integer class ids found in y1

# 評価基準の Mean F1 score (F 値)

#### F1 = 2 ✖︎ (precision ✖︎ recall) / (precision + recall)

In [64]:
from sklearn.metrics import f1_score

In [65]:
score = f1_score(y1,np.argmax(x,axis=1),average='macro')

In [66]:
score

0.6046685340802987

## 今回のF1 Score は 60.5%です。

多クラス分類では、precision_score / recall_score は既定値の average='binary' のままでは利用できません（average='macro' などの指定が必要です）。参考としてメモしておきます。

精度 (Precision)

Precision = TP / (TP + FP)

from sklearn.metrics import precision_score

precision_score(y1,np.argmax(x,axis=1))

検出率 (Recall)

Recall = TPR = Sensitivity = TP / (TP + FN)

from sklearn.metrics import recall_score

recall_score(y1,np.argmax(x,axis=1))

# ❺ラベル推定 (Submission of test result)

In [67]:
pred_test=m.predict(is_test=True)

In [68]:
# Predicted class per test row, reusing the scores already stored in pred_test
# instead of running inference on the test set a second time.
np.argmax(pred_test, axis=1)

array([2, 0, 1, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 2, 1, 1, 0, 2, ..., 2, 1, 1, 2, 2, 2, 0, 0, 0, 3, 0, 2,
       3, 0, 3, 0, 3, 1, 2, 1])

In [69]:
# Build the submission frame (item_id, predicted category_class) from the test
# scores already stored in pred_test rather than re-running inference.
sub = pd.DataFrame({'item_id': test.item_id, 'category_class': np.argmax(pred_test, axis=1)})[['item_id', 'category_class']]
sub.head(10)

Unnamed: 0,item_id,category_class
0,6000,2
1,5532,0
2,6797,1
3,3325,1
4,5447,2
5,7191,0
6,9326,0
7,7136,0
8,7391,0
9,8131,1


In [70]:
import os

# Write the submission under PATH/tmp. PATH already ends with '/', so joining the
# parts avoids the doubled slash that f'{PATH}/tmp/...' produced.
csv_fn = os.path.join(PATH, 'tmp', 'test_submission.csv')
# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(csv_fn), exist_ok=True)
sub.to_csv(csv_fn, index=False)
FileLink(csv_fn)

In [71]:
sub.head(5)

Unnamed: 0,item_id,category_class
0,6000,2
1,5532,0
2,6797,1
3,3325,1
4,5447,2


In [72]:
len(sub)

300