# Quadruplets
# Training file: the concept of validation loss is removed here.
# In this script we train the fully connected model.
# Imports
import os
import time
import json
import gensim
import keras
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import pyrouge
from pyrouge import Rouge155
from hard_coded import data_json_dir, data_txt_dir, lang_model_dir, model_dir, nn_summarizers_dir, summary_system_super_dir, tdqfs_folder
from hard_coded import non_selected_keys, tdqfs_themes
from functions.training_functions import *
# paths to folders
data_json = data_json_dir
data_txt = data_txt_dir
lang_model_folder = lang_model_dir
nn_summarizers_folder = nn_summarizers_dir
summary_system_super_folder = summary_system_super_dir
themes = tdqfs_themes
#title_file = "/home/ubuntu/summarization_query_oriented/data/DUC/duc2005_topics.sgml"
#titles_folder = "/home/ubuntu/summarization_query_oriented/data/DUC/duc2005_docs/"
# training parameters
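# patience_limit: number of evaluation rounds without any ROUGE improvement before training stops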
patience_limit = 25
## load a d2v model (to be replaced by a shifted LSTM later ...)
# parameters of doc2vec
dm = 0
min_count = 5
window = 10
size = 400
sample = 1e-4
negative = 5
workers = 4
epoch = 100
# Initialize the model (IMPORTANT)
d2v_model = gensim.models.doc2vec.Doc2Vec(dm=dm, min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers, iter=epoch)
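# NOTE: `size` and `iter` are the pre-4.0 gensim argument names
# (gensim >= 4.0 renamed them to `vector_size` and `epochs`)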
# load model
model_name = "dm_" + str(dm) + "_mc_" + str(min_count) + "_w_" + str(window) + "_size_" + str(size) + "_neg_" + str(negative) + "_ep_" + str(epoch) + "_wosw"
try:
    d2v_model = d2v_model.load(lang_model_folder + model_name + ".d2v")
    print("model loaded")
except IOError:
    print "model not found, try a model in : ", os.listdir(lang_model_folder)
## get wikipedia data
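# relevant_articles presumably returns the candidate Wikipedia articles and their sampling
# weights used when building the training quadruplets (helper from functions.training_functions)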
article_names, article_weights = relevant_articles(data_json)
## design a fully connected model
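# Architecture note: the 1600-d input presumably corresponds to four concatenated 400-d
# doc2vec vectors (the "quadruplets"); two sigmoid hidden layers with dropout feed a
# single sigmoid output used as a relevance score.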
fc_model = Sequential()
fc_model.add(Dense(160, input_dim=1600))
fc_model.add(Activation('sigmoid'))
fc_model.add(Dropout(0.5))
fc_model.add(Dense(16))
fc_model.add(Activation('sigmoid'))
fc_model.add(Dropout(0.5))
fc_model.add(Dense(1))
fc_model.add(Activation('sigmoid'))
# compiling the model
fc_model.compile(loss="binary_crossentropy", optimizer='adam')
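# binary cross-entropy fits the single sigmoid output: each training example is treated
# as a binary relevance decision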
# training per batch
batch_per_epoch = 50
#batch_per_epoch = 20
batch_size = 128
patience = 0
batch_counter = 6000
rouge_su4_recall_max = 0
rouge_2_recall_max = 0
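# best ROUGE recalls seen so far, used for the patience-based model selection below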
while patience < patience_limit :
    # train on several batches
    for i in range(batch_per_epoch):
        triplets, labels = create_triplets(d2v_model, article_names, article_weights, nb_triplets=batch_size, triplets_per_file=16, neg_ratio=1, str_mode=False, with_txt_vect=True) # HERE IS THE NEW STUFF
        fc_model.train_on_batch(triplets, labels)
    batch_counter += 1
    # summarize the TD-QFS themes
    str_time = time.strftime("%Y_%m_%d")
    fc_model_name = "T5" + str_time + "_fc_model_batch_" + str(batch_counter) + "k"
    system_folder = summary_system_super_folder + fc_model_name + "/"
    os.mkdir(system_folder)
    for theme in themes :
        theme_folder = tdqfs_folder + theme + "/"
        theme_doc_folder = theme_folder + theme + "/"
        queries = get_queries(theme_folder + "queries.txt")
        text = merge_articles_tqdfs(theme_doc_folder)
        for i in range(len(queries)):
            query = queries[i]
            summary = summarize(text, query, d2v_model, fc_model, limit=250, with_txt_vect=True)
            summary = " ".join(summary.split()[:250])
            summary_name = theme + "." + str(i+1) + ".txt"
            with open(system_folder + summary_name, 'w') as f :
                f.write(summary.decode('ascii', "ignore").encode("utf8", "replace"))
            print 'writing in ' + system_folder + summary_name
    # evaluate the generated summaries with ROUGE
    r = Rouge155()
    r.system_dir = system_folder
    r.model_dir = model_dir
    r.model_filename_pattern = '#ID#.u[0-9]q[0-9].txt'
    r.system_filename_pattern = '([a-z]+.[0-9]+).txt'
    # ROUGE-1.5.5 options: -n 4 (ROUGE-1..4), -2 4 -u (ROUGE-SU4), -c 95 (confidence level),
    # -r 1000 (bootstrap resamples), -x (skip ROUGE-L)
    options = "-e " + r._data_dir + " -n 4 -2 4 -u -c 95 -r 1000 -f A -p 0.5 -t 0 -a -x"
    output = r.convert_and_evaluate(rouge_args=options)
    output_dict = r.output_to_dict(output)
    rouge_2_recall = np.round(output_dict["rouge_2_recall"], 5)
    rouge_su4_recall = np.round(output_dict["rouge_su4_recall"], 5)
    print 50*'$'
    print "rouge_2", rouge_2_recall
    print "rouge_SU4", rouge_su4_recall
    print 50*'$'
    # save rouge results
    with open(system_folder + "ROUGE_RESULTS.json", 'w') as f :
        json.dump(output_dict, f)
    # check if the model has improved
    if rouge_2_recall > rouge_2_recall_max or rouge_su4_recall > rouge_su4_recall_max :
        patience = 0
        if rouge_2_recall > rouge_2_recall_max :
            rouge_2_recall_max = rouge_2_recall
        if rouge_su4_recall > rouge_su4_recall_max :
            rouge_su4_recall_max = rouge_su4_recall
        # save this new model
        fc_model_name = "fc_model_batch_" + str(batch_counter) + "k_R2_" + str(rouge_2_recall) + "_SU4_" + str(rouge_su4_recall)
        fc_model.save(nn_summarizers_folder + fc_model_name + ".hdf5") # saves the Keras model to an HDF5 file
        print fc_model_name, "is saved"
    else :
        patience = patience + 1
        print "patience :", patience
print('early stopped')