/
kronos_offline_training.py
75 lines (63 loc) · 2.95 KB
/
kronos_offline_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Script to start offline training of Kronos."""
from analytics_platform.kronos.gnosis.src.offline_training import (
generate_and_save_gnosis_package_topic_model_s3, train_and_save_gnosis_ref_arch_s3)
from analytics_platform.kronos.softnet.src.offline_training import (
generate_and_save_kronos_dependency_s3, generate_and_save_cooccurrence_matrices_s3)
from analytics_platform.kronos.pgm.src.offline_training import train_and_save_kronos_list_s3
from analytics_platform.kronos.apollo.src.offline_training import (
train_and_save_pruned_tag_list_s3,
generate_and_save_package_frequency_dict_s3)
import sys
import time
import daiquiri
import logging
# Configure daiquiri so that messages at INFO level are emitted.
daiquiri.setup(level=logging.INFO)
# Module-level logger used for the progress/timing messages of this script.
_logger = daiquiri.getLogger(__name__)
# Defaults applied to every command-line argument that is not supplied.
DEFAULT_TRAINING_DATA_URL = "s3://dev-stack-analysis-clean-data/maven/github/"
DEFAULT_FP_MIN_SUPPORT_COUNT = 45
DEFAULT_FP_INTENT_TOPIC_COUNT_THRESHOLD = 3
DEFAULT_FP_NUM_PARTITION = 12


def resolve_training_args(argv):
    """Resolve the training parameters from a ``sys.argv``-style list.

    Positional arguments (after the program name) are, in order:
    ``training_data_url``, ``fp_min_support_count``,
    ``fp_intent_topic_count_threshold`` and ``fp_num_partition``.
    Any trailing argument that is missing falls back to its default, so a
    partially specified command line no longer raises ``IndexError``.
    Arguments beyond the fourth are ignored.

    :param argv: argument list whose first item is the program name.
    :return: tuple ``(training_data_url, fp_min_support_count,
        fp_intent_topic_count_threshold, fp_num_partition)``.
    :raises ValueError: if a supplied numeric argument is not an integer.
    """
    defaults = (
        DEFAULT_TRAINING_DATA_URL,
        DEFAULT_FP_MIN_SUPPORT_COUNT,
        DEFAULT_FP_INTENT_TOPIC_COUNT_THRESHOLD,
        DEFAULT_FP_NUM_PARTITION,
    )
    converters = (str, int, int, int)
    supplied = argv[1:1 + len(defaults)]
    # zip() stops at the shorter sequence, so only supplied values are parsed.
    resolved = [convert(raw) for convert, raw in zip(converters, supplied)]
    # Fill whatever was not supplied from the defaults, position by position.
    resolved.extend(defaults[len(resolved):])
    return tuple(resolved)


def main(argv=None):
    """Run the offline training stages one after another, logging timings.

    Each stage is delegated to one of the imported ``*_s3`` training
    functions; this function only resolves the parameters, calls the stages
    in a fixed order and logs how long each one took.

    :param argv: ``sys.argv``-style argument list; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    (training_data_url,
     fp_min_support_count,
     fp_intent_topic_count_threshold,
     fp_num_partition) = resolve_training_args(argv)

    supplied_count = len(argv) - 1
    if supplied_count < 1:
        _logger.info("No env provided, using default")
    else:
        _logger.info("Env Provided")
        if supplied_count < 4:
            # Make the silent fallback visible to whoever launched the job.
            _logger.warning(
                "Only {} of 4 arguments provided, using defaults for the rest".format(
                    supplied_count))

    _logger.info("S3 URL : {}".format(training_data_url))

    t0 = time.time()
    _logger.info("Tag List Preprocess started")
    train_and_save_pruned_tag_list_s3(training_data_url=training_data_url)
    _logger.info(
        "tag List Preprocessing Ended in {} seconds".format(time.time() - t0))

    t0 = time.time()
    _logger.info("Frequency dict generation started")
    generate_and_save_package_frequency_dict_s3(training_data_url=training_data_url)
    _logger.info(
        "Frequency dict Preprocessing Ended in {} seconds".format(time.time() - t0))

    # The Gnosis timing covers both the topic model and the ref-arch step.
    t0 = time.time()
    _logger.info("Gnosis Training Started")
    generate_and_save_gnosis_package_topic_model_s3(
        training_data_url=training_data_url)
    train_and_save_gnosis_ref_arch_s3(
        training_data_url=training_data_url,
        fp_min_support_count=fp_min_support_count,
        fp_intent_topic_count_threshold=fp_intent_topic_count_threshold,
        fp_num_partition=fp_num_partition)
    _logger.info("Gnosis Training Ended in {} seconds".format(time.time() - t0))

    t0 = time.time()
    _logger.info("Softnet Training Started")
    generate_and_save_kronos_dependency_s3(training_data_url=training_data_url)
    _logger.info(
        "Dependency graph Training Ended in {} seconds".format(time.time() - t0))

    t0 = time.time()
    generate_and_save_cooccurrence_matrices_s3(
        training_data_url=training_data_url)
    _logger.info(
        "Co-occurence matrix Training Ended in {} seconds".format(time.time() - t0))

    t0 = time.time()
    _logger.info("Kronos Training Started")
    train_and_save_kronos_list_s3(training_data_url=training_data_url)
    _logger.info("Kronos Training Ended in {} seconds".format(time.time() - t0))


if __name__ == '__main__':
    main()