update wongnai score

cstorm125 · Feb 27, 2019 · 843c258 · 843c258
1 parent e283cd4
commit 843c258
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Created as part of [pyThaiNLP](https://github.com/PyThaiNLP/) with [ULMFit](http
 
 Models and word embeddings can also be downloaded via [Dropbox](https://www.dropbox.com/sh/lgd8wf5h0eoehzr/AACD0ZnpOiMKQq1N94WmfV-Va?dl=1).
 
-We pretrained a language model with 60,003 embeddings on [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) (perplexity of 51.6376 ) and text classification (micro-averaged F-1 score of 0.60925 on 5-label classification problem. Benchmarked to 0.5109 by [fastText](fasttext.cc) and 0.4976 by LinearSVC on [Wongnai Challenge: Review Rating Prediction](https://www.kaggle.com/c/wongnai-challenge-review-rating-prediction). The language model can also be used to extract text features for other downstream tasks.
+We pretrained a language model with 60,003 embeddings on the [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) (perplexity of 51.6376) and trained a text classifier (micro-averaged F-1 score of 0.60322 on a 5-label classification problem, benchmarked against 0.5109 by [fastText](https://fasttext.cc) and 0.4976 by LinearSVC on [Wongnai Challenge: Review Rating Prediction](https://www.kaggle.com/c/wongnai-challenge-review-rating-prediction)). The language model can also be used to extract text features for other downstream tasks.
 
 ![random word vectors](https://github.com/cstorm125/thai2fit/blob/master/images/random.png?raw=true)
 
@@ -59,7 +59,7 @@ for weight dropout, you want the weights you have put both in '0.rnns.0.module.w
 
 We trained the [ULMFit model](https://arxiv.org/abs/1801.06146) implemented by `thai2fit` for text classification. We use [Wongnai Challenge: Review Rating Prediction](https://www.kaggle.com/c/wongnai-challenge-review-rating-prediction) as our benchmark as it is the only sizeable and publicly available text classification dataset at the time of writing (June 21, 2018). It has 39,999 reviews for training and validation, and 6,203 reviews for testing.
 
-We achieved validation perplexity at 35.75113 and validation micro F1 score at 0.598 for five-label classification. Micro F1 scores for public and private leaderboards are 0.61451 and 0.60925 respectively (supposedly we could train further with the 15% validation set we did not use), which are state-of-the-art as of the time of writing (June 21, 2018). FastText benchmark based on their own [pretrained embeddings](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md) has the performance of 0.50483 and 0.49366 for public and private leaderboards respectively. See `ulmfit_wongnai.ipynb` for more details.
+We achieved a validation perplexity of 35.75113 and a validation micro F1 score of 0.598 for five-label classification. Micro F1 scores on the public and private leaderboards are 0.59313 and 0.60322 respectively, which are state-of-the-art as of the time of writing (February 27, 2019). The fastText benchmark, based on its own [pretrained embeddings](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md), scores 0.50483 and 0.49366 on the public and private leaderboards respectively. See `ulmfit_wongnai.ipynb` for more details.
 
 # Text Feature Extraction
 

diff --git a/wongnai_cls/classification.ipynb b/wongnai_cls/classification.ipynb
@@ -15,7 +15,7 @@
     "\n",
     "| model     | micro_f1_public | micro_f1_private | \n",
     "|-----------|-----------------|------------------|\n",
-    "| **ULMFit** | **0.59590**          | **0.59731**           |\n",
+    "| **ULMFit** | **0.59313**          | **0.60322**           |\n",
     "| fastText | 0.5145          | 0.5109           |\n",
     "| LinearSVC | 0.5022          | 0.4976           |\n",
     "| Kaggle Score | 0.59139          | 0.58139          |\n",
@@ -449,7 +449,7 @@
    "source": [
     "| model     | micro_f1_public | micro_f1_private | \n",
     "|-----------|-----------------|------------------|\n",
-    "| ULMFit | 0.59590          | 0.59731           |"
+    "| **ULMFit** | **0.59313**          | **0.60322**           |"
    ]
   },
   {
@@ -984,7 +984,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1005,7 +1005,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -1014,7 +1014,7 @@
        "(22562, 22562)"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1026,7 +1026,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -1045,20 +1045,20 @@
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>xxbos   เมื่อ วันที่ 22   พฤศจิกายน   ที่ผ่านมา วงใน ได้ จัดกิจกรรม   xxmaj relax   day   xxmaj relax   night   ขึ้น ที่   xxmaj phothalai   ซึ่ง ใน งาน วันนี้ ได้ เริ่มต้น ขึ้น จาก การ ได้   xxmaj tasting   ประเดิม ห้องอาหาร ใหม่ สด ซิ งๆ ที่ เปิด ต้อนรับ   group   ของ</td>\n",
+       "      <td>xxbos   สำหรับ วันนี้ ก็ พา ไป ที่ รร. เซน xxunk จิ ส กรุง เทพ กัน อีกครั้ง นะคะ   โดย จะ เป็นการ ไป ทั้ง ดื่มด่ำ กับ   xxmaj afternoon   xxmaj tea   และ มื้อ ค่ำ กับ เมนู เซ็ต ปู ด้วย ค่ะ   \\n   \\n   โดย เอ นท รี่ นี้ ขอ เป็น การพา ไป กิน อาฟเตอร์ นู นที กัน ก่อน</td>\n",
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>xxbos   มีปัญหา เรื่อง บริการ คน เยอะ คง ดูแล ไม่ ทั่วถึง   หรือ การ   xxmaj xxunk   เรื่อง การ บริการ   การสื่อสาร มีปัญหา พนักงาน กับ พนักงาน ไม่ คุย กัน   \\n   \\n   การสื่อสาร   ผม มาถึง ร้าน   16 : 43   น.   โดย ผม เดิน ไป ที่   xxmaj terrace</td>\n",
+       "      <td>xxbos   xxmaj the   restaurant   we   went   to   xxunk   and   services   is   xxmaj sab-sa-ded   ( แซ่บ สะเด็ด ).   xxmaj it   is   near   xxmaj xxunk   xxmaj xxunk   xxunk   and   entrance   5</td>\n",
        "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>xxbos   เนื่องจาก คน พูดถึง มาก - มาก ที่สุด   จน ทน กระแส ความแรง ของ   xxmaj shibuya   xxmaj shabu   ไม่ไหว   วันนี้ ดาว จึง ขอ ตามรอย คุณ เบิร์ด   ( xxmaj user   :   xxmaj xxunk )   ซึ่ง ดาว ได้ อ่า นรี วิว ของ คุณ เบิร์ด แล้ว คิด ว่า จะ ต้องหา โอ กา สมาทาน ให้</td>\n",
-       "      <td>4</td>\n",
+       "      <td>xxbos   ได้ มีโอกาส มา   wongnai   tasting   ที่ ร้าน   xxmaj copper   \\n   พิกัด   ร้าน   xxmaj copper   xxmaj international   xxmaj buffet   อยู่   ชั้น   2   ของ ห้าง   xxmaj the   sense   ( ห้าง นี้ อยู่ ถัดจาก ห้าง เซ็นทรัล</td>\n",
+       "      <td>5</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>xxbos   เปิด รี วิว ประจำสัปดาห์ นี้   ด้วย ชา บู ชั้น กลางๆ   \\n   \\n   สวัสดี เพื่อน ๆ ชาว วงใน กัน อีกครั้ง กับ ผม   xxmaj pednoii   ahha   กับ รี วิว ร้านอาหาร ใน กรุงเทพมหานคร   รี วิว แรก ใน วันนี้ ผม จะ มา นำเสนอ ร้าน ชา บู ที่ มี สาขา เยอะ ไม่ แพ้ ชา บู นางใน</td>\n",
-       "      <td>2</td>\n",
+       "      <td>xxbos   สวัสดี เพื่อน ๆ พี่ ๆ น้องๆ ใน เวป บอร์ด นี้ ทุกท่าน   ตาม อ่าน มา นาน ละ ครับ   วันนี้ ขออนุญาต ลง รี วิว กับ เค้า บ้าง นะ ครับ   ไม่แน่ใจ ว่า ร้าน นี้ เคย มี ใคร ไป ทาน ไร มา รึ ยัง นะ ครับ   ร้าน xxup ginza   อยู่ ระหว่าง อุดม สุข   ซอย 39 กับ 41</td>\n",
+       "      <td>4</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>"
@@ -1165,17 +1165,15 @@
    "source": [
     "```\n",
     "epoch     train_loss  valid_loss  accuracy\n",
-    "1         1.187845    1.158394    0.472803\n",
-    "Total time: 08:20\n",
-    "epoch     train_loss  valid_loss  accuracy\n",
-    "1         0.889035    0.828990    0.629707\n",
-    "Total time: 08:39\n",
+    "1         1.167613    1.109780    0.479079\n",
+    "Total time: 08:22\n",
     "epoch     train_loss  valid_loss  accuracy\n",
-    "1         0.760357    0.751162    0.656904\n",
-    "Total time: 11:40\n",
-    "epoch     train_loss  valid_loss  accuracy\n",
-    "1         0.628719    0.721673    0.669456\n",
-    "Total time: 18:06\n",
+    "1         0.982858    0.979201    0.560669\n",
+    "2         0.870348    0.834990    0.598326\n",
+    "3         0.752523    0.802491    0.629707\n",
+    "4         0.653818    0.715869    0.671548\n",
+    "5         0.559333    0.702696    0.682008\n",
+    "Total time: 46:22\n",
     "```"
    ]
   },
@@ -1188,7 +1186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1204,7 +1202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1213,24 +1211,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Counter({4: 3546, 3: 1751, 5: 673, 2: 206, 1: 27})"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "preds = np.argmax(probs.numpy(),1) + 1\n",
     "Counter(preds)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "submit_df = pd.DataFrame({'reviewID': test_df.reviewID,'rating':preds})\n",
     "submit_df.head()\n",
     "submit_df.to_csv('submit_ulmfit.csv',index=False)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/wongnai_cls/train_wongnai_cls.py b/wongnai_cls/train_wongnai_cls.py
@@ -37,7 +37,7 @@
 #create learner
 config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,
              output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
-trn_args = dict(bptt=70, drop_mult=0.5, alpha=2, beta=1,max_len=700)
+trn_args = dict(bptt=70, drop_mult=0.5, alpha=2, beta=1,max_len=1400)
 learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)
 learn.opt_func = partial(optim.Adam, betas=(0.7, 0.99))
 learn.callback_fns += [partial(CSVLogger, filename="logs_cls")]
@@ -49,13 +49,13 @@
 learn.freeze_to(-1)
 learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))
 
-#gradual unfreezing
+# gradual unfreezing (only the freeze_to(-2) stage is active; later stages are commented out)
 learn.freeze_to(-2)
-learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))
-learn.freeze_to(-3)
-learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))
-learn.unfreeze()
-learn.fit_one_cycle(1, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7))
+learn.fit_one_cycle(5, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))
+#learn.freeze_to(-3)
+#learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))
+# learn.unfreeze()
+# learn.fit_one_cycle(1, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7))
 
 learn.save('wongnai_cls')
 print('done')