-
Notifications
You must be signed in to change notification settings - Fork 0
/
MIMIC_SVD_Vectorizer.py
56 lines (43 loc) · 1.36 KB
/
MIMIC_SVD_Vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import preprocess
import yaml
import argparse
global args
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Command-line interface: the only option is the path to the YAML config file
# (defaults to "config.yml" in the current working directory).
parser = argparse.ArgumentParser(description='SVD Representation')
parser.add_argument('--config', default='config.yml')
args = parser.parse_args()

# Settings handed to TfidfVectorizer in the __main__ section of this script.
NGRAM_RANGE = (1, 1)  # use unigrams for cuis
MIN_DF = 0.0  # passed as min_df; as a proportion, 0.0 sets no lower bound

# Load yaml configs into configs dictionary.
# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed afterwards.
with open(args.config, 'r') as f:
    configs = yaml.safe_load(f)
if __name__ == "__main__":
    # Build the dataset provider from values read out of the YAML config.
    big_query = configs['data']['bq']
    dataset = preprocess.DatasetProvider(
        configs['data']['notes'],
        configs['data']['codes'],
        configs['args']['min_token_freq'],
        configs['args']['max_tokens_in_file'],
        configs['args']['min_examples_per_code'],
        big_query
    )
    # Only x_train is used below; y_train is not needed to fit either model.
    x_train, y_train = dataset.load_raw()

    # Fit the TF-IDF vectorizer on the training inputs and keep the resulting
    # matrix so the SVD can be fitted on it.
    tf = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,
        stop_words='english',
        min_df=MIN_DF,
        vocabulary=None,
        use_idf=True
    )
    train_tfidf_matrix = tf.fit_transform(x_train)

    # Persist the fitted vectorizer. The context manager guarantees the file
    # is flushed and closed even if pickling raises.
    with open("tfidf.pkl", 'wb') as pickle_tfidf:
        pickle.dump(tf, pickle_tfidf)

    # Fit a 1000-component truncated SVD on the TF-IDF matrix and persist it
    # the same way.
    svd = TruncatedSVD(n_components=1000)
    svd.fit(train_tfidf_matrix)

    with open("svd.pkl", 'wb') as pickle_svd:
        pickle.dump(svd, pickle_svd)