got 72946 on sentiment testb with 31 rnn models + 2 bert models

chenghuige · Nov 16, 2018 · 4a5ca4d · 4a5ca4d
1 parent 2a797ef
commit 4a5ca4d
Show file tree

Hide file tree

Showing 280 changed files with 10,822 additions and 298 deletions.
diff --git a/projects/ai2018/sentiment/algos/config.py b/projects/ai2018/sentiment/algos/config.py
@@ -32,6 +32,9 @@
 # -2,-1,0,1 -> 0,1,2,3
 NUM_CLASSES = 4
 
+flags.DEFINE_bool('use_soft_label', False, '')
+flags.DEFINE_bool('use_len', False, 'wether add length as a feature')
+
 flags.DEFINE_string('pretrain_encoder', 'bilm', 'bilm or bert')
 flags.DEFINE_bool('transformer_add_rnn', False, '')
 

diff --git a/projects/ai2018/sentiment/algos/model.py b/projects/ai2018/sentiment/algos/model.py
@@ -124,6 +124,9 @@ def __init__(self, embedding=None, lm_model=False, use_text_encoder=True):
       else:
         self.dense = None
 
+      if FLAGS.use_len:
+        self.len_embedding = wenzheng.Embedding(3000, 32)
+
       self.num_classes = NUM_CLASSES if FLAGS.binary_class_index is None else 1
       if FLAGS.loss_type == 'regression':
         self.num_classes = 1
@@ -203,6 +206,7 @@ def call(self, input, training=False):
     c_mask = tf.cast(x, tf.bool)
     batch_size = melt.get_shape(x, 0)
     c_len, max_c_len = melt.length2(x)
+    ori_c_len = c_len
 
     if FLAGS.rnn_no_padding:
       logging.info('------------------no padding! train or eval')
@@ -235,6 +239,10 @@ def call(self, input, training=False):
 
     x = self.pooling(x, c_len, calc_word_scores=self.debug)
 
+    if FLAGS.use_len:
+      len_emb = self.len_embedding(ori_c_len)
+      x = tf.concat([x, len_emb], -1)
+
     # not help much
     if self.dense is not None:
       x = self.dense(x)
@@ -578,6 +586,8 @@ def call(self, input, training=False):
 
     if FLAGS.encoder_output_method != 'last':
       x = self.pooling(x, c_len)
+      x2 = model.get_pooled_output()
+      x = tf.concat([x, x2], -1)
     x = self.logits(x)
     x = tf.reshape(x, [batch_size, NUM_ATTRIBUTES, NUM_CLASSES])
     return x
diff --git a/projects/ai2018/sentiment/analysis/class-info.py b/projects/ai2018/sentiment/analysis/class-info.py
@@ -38,8 +38,8 @@
   for i, label in enumerate(labels):
     counts[i][label + 2] += 1
 
-# for attr, count in zip(ATTRIBUTES, counts):
-#   print(attr, ['%.3f' % (x / len(df)) for x in count])
+for attr, count in zip(ATTRIBUTES, counts):
+  print('%-40s' % attr, ['%.5f' % (x / len(df)) for x in count])
 
 for attr, count in zip(ATTRIBUTES, counts):
-  print(attr, count)
+  print('%-40s' % attr, count)
diff --git a/projects/ai2018/sentiment/analysis/class-info.train.csv b/projects/ai2018/sentiment/analysis/class-info.train.csv
@@ -1,20 +1,40 @@
-location_traffic_convenience [0.7750666666666667, 0.012552380952380952, 0.009961904761904762, 0.2024190476190476]
-location_distance_from_business_district [0.796952380952381, 0.005580952380952381, 0.005076190476190476, 0.1923904761904762]
-location_easy_to_find [0.7676666666666667, 0.037866666666666667, 0.023542857142857143, 0.17092380952380953]
-service_wait_time [0.8834571428571428, 0.028895238095238095, 0.04173333333333333, 0.04591428571428571]
-service_waiters_attitude [0.4039047619047619, 0.0827047619047619, 0.11937142857142857, 0.3940190476190476]
-service_parking_convenience [0.9359619047619048, 0.0126, 0.013866666666666666, 0.03757142857142857]
-service_serving_speed [0.8447619047619047, 0.052257142857142856, 0.02265714285714286, 0.08032380952380952]
-price_level [0.5030476190476191, 0.11785714285714285, 0.23094285714285714, 0.14815238095238095]
-price_cost_effective [0.7642095238095238, 0.028676190476190477, 0.029257142857142857, 0.17785714285714285]
-price_discount [0.6118380952380953, 0.01634285714285714, 0.17385714285714285, 0.19796190476190476]
-environment_decoration [0.5134857142857143, 0.02037142857142857, 0.0904, 0.37574285714285716]
-environment_noise [0.6994761904761905, 0.029304761904761906, 0.046123809523809525, 0.2250952380952381]
-environment_space [0.6228380952380952, 0.054342857142857144, 0.08820952380952381, 0.23460952380952382]
-environment_cleaness [0.6342666666666666, 0.042980952380952384, 0.04479047619047619, 0.27796190476190474]
-dish_portion [0.5420666666666667, 0.0954095238095238, 0.09053333333333333, 0.27199047619047617]
-dish_taste [0.048285714285714286, 0.04155238095238095, 0.38285714285714284, 0.5273047619047619]
-dish_look [0.7235714285714285, 0.030266666666666667, 0.04452380952380952, 0.20163809523809523]
-dish_recommendation [0.8073047619047619, 0.021666666666666667, 0.018933333333333333, 0.1520952380952381]
-others_overall_experience [0.020095238095238097, 0.08937142857142857, 0.2232, 0.6673333333333333]
-others_willing_to_consume_again [0.6247619047619047, 0.03960952380952381, 0.027742857142857142, 0.30788571428571426]
+location_traffic_convenience             ['0.77507', '0.01255', '0.00996', '0.20242']
+location_distance_from_business_district ['0.79695', '0.00558', '0.00508', '0.19239']
+location_easy_to_find                    ['0.76767', '0.03787', '0.02354', '0.17092']
+service_wait_time                        ['0.88346', '0.02890', '0.04173', '0.04591']
+service_waiters_attitude                 ['0.40390', '0.08270', '0.11937', '0.39402']
+service_parking_convenience              ['0.93596', '0.01260', '0.01387', '0.03757']
+service_serving_speed                    ['0.84476', '0.05226', '0.02266', '0.08032']
+price_level                              ['0.50305', '0.11786', '0.23094', '0.14815']
+price_cost_effective                     ['0.76421', '0.02868', '0.02926', '0.17786']
+price_discount                           ['0.61184', '0.01634', '0.17386', '0.19796']
+environment_decoration                   ['0.51349', '0.02037', '0.09040', '0.37574']
+environment_noise                        ['0.69948', '0.02930', '0.04612', '0.22510']
+environment_space                        ['0.62284', '0.05434', '0.08821', '0.23461']
+environment_cleaness                     ['0.63427', '0.04298', '0.04479', '0.27796']
+dish_portion                             ['0.54207', '0.09541', '0.09053', '0.27199']
+dish_taste                               ['0.04829', '0.04155', '0.38286', '0.52730']
+dish_look                                ['0.72357', '0.03027', '0.04452', '0.20164']
+dish_recommendation                      ['0.80730', '0.02167', '0.01893', '0.15210']
+others_overall_experience                ['0.02010', '0.08937', '0.22320', '0.66733']
+others_willing_to_consume_again          ['0.62476', '0.03961', '0.02774', '0.30789']
+location_traffic_convenience             [81382  1318  1046 21254]
+location_distance_from_business_district [83680   586   533 20201]
+location_easy_to_find                    [80605  3976  2472 17947]
+service_wait_time                        [92763  3034  4382  4821]
+service_waiters_attitude                 [42410  8684 12534 41372]
+service_parking_convenience              [98276  1323  1456  3945]
+service_serving_speed                    [88700  5487  2379  8434]
+price_level                              [52820 12375 24249 15556]
+price_cost_effective                     [80242  3011  3072 18675]
+price_discount                           [64243  1716 18255 20786]
+environment_decoration                   [53916  2139  9492 39453]
+environment_noise                        [73445  3077  4843 23635]
+environment_space                        [65398  5706  9262 24634]
+environment_cleaness                     [66598  4513  4703 29186]
+dish_portion                             [56917 10018  9506 28559]
+dish_taste                               [ 5070  4363 40200 55367]
+dish_look                                [75975  3178  4675 21172]
+dish_recommendation                      [84767  2275  1988 15970]
+others_overall_experience                [ 2110  9384 23436 70070]
+others_willing_to_consume_again          [65600  4159  2913 32328]
diff --git a/projects/ai2018/sentiment/analysis/class-info.valid.csv b/projects/ai2018/sentiment/analysis/class-info.valid.csv
@@ -1,20 +1,40 @@
-location_traffic_convenience [0.7838, 0.012133333333333333, 0.009066666666666667, 0.195]
-location_distance_from_business_district [0.8021333333333334, 0.006, 0.005333333333333333, 0.18653333333333333]
-location_easy_to_find [0.7677333333333334, 0.0368, 0.021933333333333332, 0.17353333333333334]
-service_wait_time [0.8824666666666666, 0.031, 0.039933333333333335, 0.0466]
-service_waiters_attitude [0.3994666666666667, 0.0806, 0.122, 0.3979333333333333]
-service_parking_convenience [0.9364, 0.012533333333333334, 0.0136, 0.03746666666666667]
-service_serving_speed [0.8451333333333333, 0.05366666666666667, 0.024533333333333334, 0.07666666666666666]
-price_level [0.4998, 0.11733333333333333, 0.23433333333333334, 0.14853333333333332]
-price_cost_effective [0.7618666666666667, 0.029666666666666668, 0.026533333333333332, 0.18193333333333334]
-price_discount [0.6174666666666667, 0.017733333333333334, 0.1754, 0.1894]
-environment_decoration [0.5200666666666667, 0.018333333333333333, 0.08746666666666666, 0.3741333333333333]
-environment_noise [0.7014, 0.03206666666666667, 0.044333333333333336, 0.2222]
-environment_space [0.6327333333333334, 0.0516, 0.08733333333333333, 0.22833333333333333]
-environment_cleaness [0.6397333333333334, 0.041666666666666664, 0.0418, 0.2768]
-dish_portion [0.5408, 0.09613333333333333, 0.09366666666666666, 0.2694]
-dish_taste [0.0504, 0.03866666666666667, 0.388, 0.5229333333333334]
-dish_look [0.7172, 0.030333333333333334, 0.044066666666666664, 0.2084]
-dish_recommendation [0.8055333333333333, 0.022333333333333334, 0.019133333333333332, 0.153]
-others_overall_experience [0.019, 0.08553333333333334, 0.2238, 0.6716666666666666]
-others_willing_to_consume_again [0.6236, 0.03846666666666667, 0.026333333333333334, 0.3116]
+location_traffic_convenience             ['0.78380', '0.01213', '0.00907', '0.19500']
+location_distance_from_business_district ['0.80213', '0.00600', '0.00533', '0.18653']
+location_easy_to_find                    ['0.76773', '0.03680', '0.02193', '0.17353']
+service_wait_time                        ['0.88247', '0.03100', '0.03993', '0.04660']
+service_waiters_attitude                 ['0.39947', '0.08060', '0.12200', '0.39793']
+service_parking_convenience              ['0.93640', '0.01253', '0.01360', '0.03747']
+service_serving_speed                    ['0.84513', '0.05367', '0.02453', '0.07667']
+price_level                              ['0.49980', '0.11733', '0.23433', '0.14853']
+price_cost_effective                     ['0.76187', '0.02967', '0.02653', '0.18193']
+price_discount                           ['0.61747', '0.01773', '0.17540', '0.18940']
+environment_decoration                   ['0.52007', '0.01833', '0.08747', '0.37413']
+environment_noise                        ['0.70140', '0.03207', '0.04433', '0.22220']
+environment_space                        ['0.63273', '0.05160', '0.08733', '0.22833']
+environment_cleaness                     ['0.63973', '0.04167', '0.04180', '0.27680']
+dish_portion                             ['0.54080', '0.09613', '0.09367', '0.26940']
+dish_taste                               ['0.05040', '0.03867', '0.38800', '0.52293']
+dish_look                                ['0.71720', '0.03033', '0.04407', '0.20840']
+dish_recommendation                      ['0.80553', '0.02233', '0.01913', '0.15300']
+others_overall_experience                ['0.01900', '0.08553', '0.22380', '0.67167']
+others_willing_to_consume_again          ['0.62360', '0.03847', '0.02633', '0.31160']
+location_traffic_convenience             [11757   182   136  2925]
+location_distance_from_business_district [12032    90    80  2798]
+location_easy_to_find                    [11516   552   329  2603]
+service_wait_time                        [13237   465   599   699]
+service_waiters_attitude                 [5992 1209 1830 5969]
+service_parking_convenience              [14046   188   204   562]
+service_serving_speed                    [12677   805   368  1150]
+price_level                              [7497 1760 3515 2228]
+price_cost_effective                     [11428   445   398  2729]
+price_discount                           [9262  266 2631 2841]
+environment_decoration                   [7801  275 1312 5612]
+environment_noise                        [10521   481   665  3333]
+environment_space                        [9491  774 1310 3425]
+environment_cleaness                     [9596  625  627 4152]
+dish_portion                             [8112 1442 1405 4041]
+dish_taste                               [ 756  580 5820 7844]
+dish_look                                [10758   455   661  3126]
+dish_recommendation                      [12083   335   287  2295]
+others_overall_experience                [  285  1283  3357 10075]
+others_willing_to_consume_again          [9354  577  395 4674]
diff --git a/projects/ai2018/sentiment/analysis/diff.py b/projects/ai2018/sentiment/analysis/diff.py
@@ -67,9 +67,12 @@
 df = pd.read_csv(ifile)
 df = df.sort_values('id')
 
+print(ifile, len(df))
+
 df2 = pd.read_csv(ifile2)
 df2 = df2.sort_values('id')
 
+print(ifile2, len(df2))
 #df = df.iloc[[0]]
 #print(df)
 #df2 = df2.iloc[[0]]

diff --git a/projects/ai2018/sentiment/dataset.py b/projects/ai2018/sentiment/dataset.py
@@ -29,7 +29,7 @@
 logging = melt.logging
 from wenzheng.utils import vocabulary
 
-from algos.config import NUM_ATTRIBUTES
+from algos.config import NUM_ATTRIBUTES, NUM_CLASSES
 import prepare.config
 
 class Dataset(melt.tfrecords.Dataset):
@@ -64,27 +64,38 @@ def parser(self, example):
       'pos': tf.VarLenFeature(tf.int64),
       'ner': tf.VarLenFeature(tf.int64),
       'wlen': tf.VarLenFeature(tf.int64),
-      'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64),
+      #'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64),
       'source':  tf.FixedLenFeature([], tf.string),
       }
 
+    if FLAGS.use_soft_label:
+      features_dict['label'] =  tf.FixedLenFeature([NUM_ATTRIBUTES * NUM_CLASSES], tf.float32)
+    else:
+      features_dict['label'] =  tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64)
+
     #if FLAGS.use_char:
     #features_dict['chars'] = tf.VarLenFeature(tf.int64)
 
     features = tf.parse_single_example(example, features=features_dict)
 
     content = features['content']
     content = melt.sparse_tensor_to_dense(content)
+    # The block below is not actually used now: for bert we use the nbert tfrecords, which are already [first_n and last_m], so content_limit 512 is not needed here
     if FLAGS.content_limit:
+      # TODO: only bert is considered for now. Why do content[0], content[:0] and content[-1] not work? FIXME
+      start_id = vocabulary.start_id() if not FLAGS.model == 'Transformer' else 101
+      end_id = vocabulary.end_id() if not FLAGS.model == 'Transformer' else 102
+      # TODO: this currently has a problem: it can leave one additional end or start id
       if not FLAGS.cut_front:
-        content = content[:FLAGS.content_limit]
+        content = tf.concat([content[:FLAGS.content_limit - 1], tf.constant([end_id], dtype=tf.int64)], 0)
       else:
-        content = content[-FLAGS.content_limit:]
+        content = tf.concat([tf.constant([start_id], dtype=tf.int64), content[-FLAGS.content_limit + 1:]], 0)
     # if FLAGS.add_start_end:
     #   content = tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), content, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0)
     # NOTICE! This does not work in the dataset, so it is done in a later step (e.g. in call), but the same thing must be done again for pytorch.
+    ## TODO: the code below could be used to do unk augmentation here, so that tf and pytorch do not need different code later
     # if FLAGS.vocab_min_count:
-    #   content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID)
+    # #   content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID)
 
     features['content'] = content
     label = features['label']
@@ -113,9 +124,12 @@ def parser(self, example):
     features['wlen'] = wlen
 
     x = features
-    y = label + 2
-    if FLAGS.binary_class_index is not None:
-      y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index))
+    if not FLAGS.use_soft_label:
+      y = label + 2
+      if FLAGS.binary_class_index is not None:
+        y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index))
+    else:
+      y = label
 
     return x, y
 

diff --git a/projects/ai2018/sentiment/ensemble/README.md b/projects/ai2018/sentiment/ensemble/README.md
@@ -0,0 +1,3 @@
+Just run ./ensemble-cv.py, for example:
+python ./ensemble-cv.py 
+python ./ensemble-cv.py --debug=0