Skip to content

Commit

Permalink
got 72946 on sentiment testb with 31 rnn models + 2 bert models
Browse files Browse the repository at this point in the history
  • Loading branch information
chenghuige committed Nov 16, 2018
1 parent 2a797ef commit 4a5ca4d
Show file tree
Hide file tree
Showing 280 changed files with 10,822 additions and 298 deletions.
3 changes: 3 additions & 0 deletions projects/ai2018/sentiment/algos/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
# -2,-1,0,1 -> 0,1,2,3
NUM_CLASSES = 4

flags.DEFINE_bool('use_soft_label', False, '')
flags.DEFINE_bool('use_len', False, 'wether add length as a feature')

flags.DEFINE_string('pretrain_encoder', 'bilm', 'bilm or bert')
flags.DEFINE_bool('transformer_add_rnn', False, '')

Expand Down
10 changes: 10 additions & 0 deletions projects/ai2018/sentiment/algos/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ def __init__(self, embedding=None, lm_model=False, use_text_encoder=True):
else:
self.dense = None

if FLAGS.use_len:
self.len_embedding = wenzheng.Embedding(3000, 32)

self.num_classes = NUM_CLASSES if FLAGS.binary_class_index is None else 1
if FLAGS.loss_type == 'regression':
self.num_classes = 1
Expand Down Expand Up @@ -203,6 +206,7 @@ def call(self, input, training=False):
c_mask = tf.cast(x, tf.bool)
batch_size = melt.get_shape(x, 0)
c_len, max_c_len = melt.length2(x)
ori_c_len = c_len

if FLAGS.rnn_no_padding:
logging.info('------------------no padding! train or eval')
Expand Down Expand Up @@ -235,6 +239,10 @@ def call(self, input, training=False):

x = self.pooling(x, c_len, calc_word_scores=self.debug)

if FLAGS.use_len:
len_emb = self.len_embedding(ori_c_len)
x = tf.concat([x, len_emb], -1)

# not help much
if self.dense is not None:
x = self.dense(x)
Expand Down Expand Up @@ -578,6 +586,8 @@ def call(self, input, training=False):

if FLAGS.encoder_output_method != 'last':
x = self.pooling(x, c_len)
x2 = model.get_pooled_output()
x = tf.concat([x, x2], -1)
x = self.logits(x)
x = tf.reshape(x, [batch_size, NUM_ATTRIBUTES, NUM_CLASSES])
return x
6 changes: 3 additions & 3 deletions projects/ai2018/sentiment/analysis/class-info.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@
for i, label in enumerate(labels):
counts[i][label + 2] += 1

# for attr, count in zip(ATTRIBUTES, counts):
# print(attr, ['%.3f' % (x / len(df)) for x in count])
for attr, count in zip(ATTRIBUTES, counts):
print('%-40s' % attr, ['%.5f' % (x / len(df)) for x in count])

for attr, count in zip(ATTRIBUTES, counts):
print(attr, count)
print('%-40s' % attr, count)
60 changes: 40 additions & 20 deletions projects/ai2018/sentiment/analysis/class-info.train.csv
Original file line number Diff line number Diff line change
@@ -1,20 +1,40 @@
location_traffic_convenience [0.7750666666666667, 0.012552380952380952, 0.009961904761904762, 0.2024190476190476]
location_distance_from_business_district [0.796952380952381, 0.005580952380952381, 0.005076190476190476, 0.1923904761904762]
location_easy_to_find [0.7676666666666667, 0.037866666666666667, 0.023542857142857143, 0.17092380952380953]
service_wait_time [0.8834571428571428, 0.028895238095238095, 0.04173333333333333, 0.04591428571428571]
service_waiters_attitude [0.4039047619047619, 0.0827047619047619, 0.11937142857142857, 0.3940190476190476]
service_parking_convenience [0.9359619047619048, 0.0126, 0.013866666666666666, 0.03757142857142857]
service_serving_speed [0.8447619047619047, 0.052257142857142856, 0.02265714285714286, 0.08032380952380952]
price_level [0.5030476190476191, 0.11785714285714285, 0.23094285714285714, 0.14815238095238095]
price_cost_effective [0.7642095238095238, 0.028676190476190477, 0.029257142857142857, 0.17785714285714285]
price_discount [0.6118380952380953, 0.01634285714285714, 0.17385714285714285, 0.19796190476190476]
environment_decoration [0.5134857142857143, 0.02037142857142857, 0.0904, 0.37574285714285716]
environment_noise [0.6994761904761905, 0.029304761904761906, 0.046123809523809525, 0.2250952380952381]
environment_space [0.6228380952380952, 0.054342857142857144, 0.08820952380952381, 0.23460952380952382]
environment_cleaness [0.6342666666666666, 0.042980952380952384, 0.04479047619047619, 0.27796190476190474]
dish_portion [0.5420666666666667, 0.0954095238095238, 0.09053333333333333, 0.27199047619047617]
dish_taste [0.048285714285714286, 0.04155238095238095, 0.38285714285714284, 0.5273047619047619]
dish_look [0.7235714285714285, 0.030266666666666667, 0.04452380952380952, 0.20163809523809523]
dish_recommendation [0.8073047619047619, 0.021666666666666667, 0.018933333333333333, 0.1520952380952381]
others_overall_experience [0.020095238095238097, 0.08937142857142857, 0.2232, 0.6673333333333333]
others_willing_to_consume_again [0.6247619047619047, 0.03960952380952381, 0.027742857142857142, 0.30788571428571426]
location_traffic_convenience ['0.77507', '0.01255', '0.00996', '0.20242']
location_distance_from_business_district ['0.79695', '0.00558', '0.00508', '0.19239']
location_easy_to_find ['0.76767', '0.03787', '0.02354', '0.17092']
service_wait_time ['0.88346', '0.02890', '0.04173', '0.04591']
service_waiters_attitude ['0.40390', '0.08270', '0.11937', '0.39402']
service_parking_convenience ['0.93596', '0.01260', '0.01387', '0.03757']
service_serving_speed ['0.84476', '0.05226', '0.02266', '0.08032']
price_level ['0.50305', '0.11786', '0.23094', '0.14815']
price_cost_effective ['0.76421', '0.02868', '0.02926', '0.17786']
price_discount ['0.61184', '0.01634', '0.17386', '0.19796']
environment_decoration ['0.51349', '0.02037', '0.09040', '0.37574']
environment_noise ['0.69948', '0.02930', '0.04612', '0.22510']
environment_space ['0.62284', '0.05434', '0.08821', '0.23461']
environment_cleaness ['0.63427', '0.04298', '0.04479', '0.27796']
dish_portion ['0.54207', '0.09541', '0.09053', '0.27199']
dish_taste ['0.04829', '0.04155', '0.38286', '0.52730']
dish_look ['0.72357', '0.03027', '0.04452', '0.20164']
dish_recommendation ['0.80730', '0.02167', '0.01893', '0.15210']
others_overall_experience ['0.02010', '0.08937', '0.22320', '0.66733']
others_willing_to_consume_again ['0.62476', '0.03961', '0.02774', '0.30789']
location_traffic_convenience [81382 1318 1046 21254]
location_distance_from_business_district [83680 586 533 20201]
location_easy_to_find [80605 3976 2472 17947]
service_wait_time [92763 3034 4382 4821]
service_waiters_attitude [42410 8684 12534 41372]
service_parking_convenience [98276 1323 1456 3945]
service_serving_speed [88700 5487 2379 8434]
price_level [52820 12375 24249 15556]
price_cost_effective [80242 3011 3072 18675]
price_discount [64243 1716 18255 20786]
environment_decoration [53916 2139 9492 39453]
environment_noise [73445 3077 4843 23635]
environment_space [65398 5706 9262 24634]
environment_cleaness [66598 4513 4703 29186]
dish_portion [56917 10018 9506 28559]
dish_taste [ 5070 4363 40200 55367]
dish_look [75975 3178 4675 21172]
dish_recommendation [84767 2275 1988 15970]
others_overall_experience [ 2110 9384 23436 70070]
others_willing_to_consume_again [65600 4159 2913 32328]
60 changes: 40 additions & 20 deletions projects/ai2018/sentiment/analysis/class-info.valid.csv
Original file line number Diff line number Diff line change
@@ -1,20 +1,40 @@
location_traffic_convenience [0.7838, 0.012133333333333333, 0.009066666666666667, 0.195]
location_distance_from_business_district [0.8021333333333334, 0.006, 0.005333333333333333, 0.18653333333333333]
location_easy_to_find [0.7677333333333334, 0.0368, 0.021933333333333332, 0.17353333333333334]
service_wait_time [0.8824666666666666, 0.031, 0.039933333333333335, 0.0466]
service_waiters_attitude [0.3994666666666667, 0.0806, 0.122, 0.3979333333333333]
service_parking_convenience [0.9364, 0.012533333333333334, 0.0136, 0.03746666666666667]
service_serving_speed [0.8451333333333333, 0.05366666666666667, 0.024533333333333334, 0.07666666666666666]
price_level [0.4998, 0.11733333333333333, 0.23433333333333334, 0.14853333333333332]
price_cost_effective [0.7618666666666667, 0.029666666666666668, 0.026533333333333332, 0.18193333333333334]
price_discount [0.6174666666666667, 0.017733333333333334, 0.1754, 0.1894]
environment_decoration [0.5200666666666667, 0.018333333333333333, 0.08746666666666666, 0.3741333333333333]
environment_noise [0.7014, 0.03206666666666667, 0.044333333333333336, 0.2222]
environment_space [0.6327333333333334, 0.0516, 0.08733333333333333, 0.22833333333333333]
environment_cleaness [0.6397333333333334, 0.041666666666666664, 0.0418, 0.2768]
dish_portion [0.5408, 0.09613333333333333, 0.09366666666666666, 0.2694]
dish_taste [0.0504, 0.03866666666666667, 0.388, 0.5229333333333334]
dish_look [0.7172, 0.030333333333333334, 0.044066666666666664, 0.2084]
dish_recommendation [0.8055333333333333, 0.022333333333333334, 0.019133333333333332, 0.153]
others_overall_experience [0.019, 0.08553333333333334, 0.2238, 0.6716666666666666]
others_willing_to_consume_again [0.6236, 0.03846666666666667, 0.026333333333333334, 0.3116]
location_traffic_convenience ['0.78380', '0.01213', '0.00907', '0.19500']
location_distance_from_business_district ['0.80213', '0.00600', '0.00533', '0.18653']
location_easy_to_find ['0.76773', '0.03680', '0.02193', '0.17353']
service_wait_time ['0.88247', '0.03100', '0.03993', '0.04660']
service_waiters_attitude ['0.39947', '0.08060', '0.12200', '0.39793']
service_parking_convenience ['0.93640', '0.01253', '0.01360', '0.03747']
service_serving_speed ['0.84513', '0.05367', '0.02453', '0.07667']
price_level ['0.49980', '0.11733', '0.23433', '0.14853']
price_cost_effective ['0.76187', '0.02967', '0.02653', '0.18193']
price_discount ['0.61747', '0.01773', '0.17540', '0.18940']
environment_decoration ['0.52007', '0.01833', '0.08747', '0.37413']
environment_noise ['0.70140', '0.03207', '0.04433', '0.22220']
environment_space ['0.63273', '0.05160', '0.08733', '0.22833']
environment_cleaness ['0.63973', '0.04167', '0.04180', '0.27680']
dish_portion ['0.54080', '0.09613', '0.09367', '0.26940']
dish_taste ['0.05040', '0.03867', '0.38800', '0.52293']
dish_look ['0.71720', '0.03033', '0.04407', '0.20840']
dish_recommendation ['0.80553', '0.02233', '0.01913', '0.15300']
others_overall_experience ['0.01900', '0.08553', '0.22380', '0.67167']
others_willing_to_consume_again ['0.62360', '0.03847', '0.02633', '0.31160']
location_traffic_convenience [11757 182 136 2925]
location_distance_from_business_district [12032 90 80 2798]
location_easy_to_find [11516 552 329 2603]
service_wait_time [13237 465 599 699]
service_waiters_attitude [5992 1209 1830 5969]
service_parking_convenience [14046 188 204 562]
service_serving_speed [12677 805 368 1150]
price_level [7497 1760 3515 2228]
price_cost_effective [11428 445 398 2729]
price_discount [9262 266 2631 2841]
environment_decoration [7801 275 1312 5612]
environment_noise [10521 481 665 3333]
environment_space [9491 774 1310 3425]
environment_cleaness [9596 625 627 4152]
dish_portion [8112 1442 1405 4041]
dish_taste [ 756 580 5820 7844]
dish_look [10758 455 661 3126]
dish_recommendation [12083 335 287 2295]
others_overall_experience [ 285 1283 3357 10075]
others_willing_to_consume_again [9354 577 395 4674]
3 changes: 3 additions & 0 deletions projects/ai2018/sentiment/analysis/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,12 @@
df = pd.read_csv(ifile)
df = df.sort_values('id')

print(ifile, len(df))

df2 = pd.read_csv(ifile2)
df2 = df2.sort_values('id')

print(ifile2, len(df2))
#df = df.iloc[[0]]
#print(df)
#df2 = df2.iloc[[0]]
Expand Down
30 changes: 22 additions & 8 deletions projects/ai2018/sentiment/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
logging = melt.logging
from wenzheng.utils import vocabulary

from algos.config import NUM_ATTRIBUTES
from algos.config import NUM_ATTRIBUTES, NUM_CLASSES
import prepare.config

class Dataset(melt.tfrecords.Dataset):
Expand Down Expand Up @@ -64,27 +64,38 @@ def parser(self, example):
'pos': tf.VarLenFeature(tf.int64),
'ner': tf.VarLenFeature(tf.int64),
'wlen': tf.VarLenFeature(tf.int64),
'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64),
#'label': tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64),
'source': tf.FixedLenFeature([], tf.string),
}

if FLAGS.use_soft_label:
features_dict['label'] = tf.FixedLenFeature([NUM_ATTRIBUTES * NUM_CLASSES], tf.float32)
else:
features_dict['label'] = tf.FixedLenFeature([NUM_ATTRIBUTES], tf.int64)

#if FLAGS.use_char:
#features_dict['chars'] = tf.VarLenFeature(tf.int64)

features = tf.parse_single_example(example, features=features_dict)

content = features['content']
content = melt.sparse_tensor_to_dense(content)
# The truncation below is not actually used now: for bert we use the nbert tfrecords, which are already [first_n and last_m], so content_limit 512 is not needed here
if FLAGS.content_limit:
# TODO: only bert is considered for now. Why do content[0], content[:0] or content[-1] not work? FIXME
start_id = vocabulary.start_id() if not FLAGS.model == 'Transformer' else 101
end_id = vocabulary.end_id() if not FLAGS.model == 'Transformer' else 102
# TODO: this currently has a problem: one additional end or start id...
if not FLAGS.cut_front:
content = content[:FLAGS.content_limit]
content = tf.concat([content[:FLAGS.content_limit - 1], tf.constant([end_id], dtype=tf.int64)], 0)
else:
content = content[-FLAGS.content_limit:]
content = tf.concat([tf.constant([start_id], dtype=tf.int64), content[-FLAGS.content_limit + 1:]], 0)
# if FLAGS.add_start_end:
# content = tf.concat([tf.constant([vocabulary.start_id()], dtype=tf.int64), content, tf.constant([vocabulary.end_id()], dtype=tf.int64)], 0)
# NOTICE! This does not work in the dataset, so it is put in a later step (like in call), but the same thing has to be done again for pytorch.
## TODO: can use the code below to do unk aug, so as not to have different code for tf and pytorch later
# if FLAGS.vocab_min_count:
# content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID)
# # content = melt.greater_then_set(content, FLAGS.vocab_min_count, UNK_ID)

features['content'] = content
label = features['label']
Expand Down Expand Up @@ -113,9 +124,12 @@ def parser(self, example):
features['wlen'] = wlen

x = features
y = label + 2
if FLAGS.binary_class_index is not None:
y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index))
if not FLAGS.use_soft_label:
y = label + 2
if FLAGS.binary_class_index is not None:
y = tf.to_int64(tf.equal(y, FLAGS.binary_class_index))
else:
y = label

return x, y

Expand Down
3 changes: 3 additions & 0 deletions projects/ai2018/sentiment/ensemble/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Just use ./ensemble-cv.py:
python ./ensemble-cv.py
python ./ensemble-cv.py --debug=0
Loading

0 comments on commit 4a5ca4d

Please sign in to comment.