Skip to content
Permalink
Browse files

Improve ReadMe

  • Loading branch information...
Phil
Phil committed Mar 3, 2019
1 parent 7dc3431 commit 55c5cbc8f38e1c3c74af27af22fb51bc17848ece
Showing with 65 additions and 47,546 deletions.
  1. +40 −56 ReadMe.md
  2. +0 −47,473 analytics/data/ProfileStraggler/Stacktrace.json
  3. +3 −3 analytics/plot_application.py
  4. +5 −5 analytics/plot_fatso.py
  5. +3 −3 analytics/plot_straggler.py
  6. +2 −0 fold_stacks.py
  7. +2 −0 helper.py
  8. +8 −6 parsers.py
  9. +2 −0 pyspark_profilers.py

Large diffs are not rendered by default.

This file was deleted.

@@ -1,6 +1,6 @@
from plotly.offline import plot
from typing import List
from parsers import AppParser
from parsers import AppParser, SparkLogParser
from helper import get_max_y
from plotly.graph_objs import Figure, Scatter

@@ -29,10 +29,10 @@
app_parser = AppParser(application_path)
data_points: List[Scatter] = list()

executor_logs = app_parser.get_executor_logparsers()
executor_logs: List[SparkLogParser] = app_parser.get_executor_logparsers()
for parser in executor_logs:
# print(parser.get_available_metrics()) # ['epochMillis', 'ScavengeCollTime', 'MarkSweepCollTime', 'MarkSweepCollCount', 'ScavengeCollCount', 'systemCpuLoad', 'processCpuLoad', 'nonHeapMemoryTotalUsed', 'nonHeapMemoryCommitted', 'heapMemoryTotalUsed', 'heapMemoryCommitted']
relevant_metric = parser.get_metrics(['heapMemoryTotalUsed'])
relevant_metric: List[Scatter] = parser.get_metrics(['heapMemoryTotalUsed'])
data_points.extend(relevant_metric)

max_y = get_max_y(data_points) # static method, maximum y value needed for cosmetic reasons, scaling tasks
@@ -1,19 +1,19 @@
from plotly.offline import plot
from parsers import ProfileParser, SparkLogParser
from plotly.graph_objs import Figure

from plotly.graph_objs import Figure, Scatter
from typing import List

# The two files used below are created by running
# ~/spark-2.4.0-bin-hadoop2.7/bin/spark-submit --class profile.sparkjobs.JobFatso --conf spark.driver.extraJavaOptions=-javaagent:/Users/phil/jvm-profiler/target/jvm-profiler-1.0.0.jar=sampleInterval=100,metricInterval=100,reporter=com.uber.profiling.reporters.FileOutputReporter,outputDir=./ProfileFatso target/scala-2.11/philstopwatch-assembly-0.1.jar > JobFatso.log

profile_file = './data/ProfileFatso/CpuAndMemoryFatso.json.gz' # Output from JVM profiler
profile_parser = ProfileParser(profile_file, normalize=True)
data_points = profile_parser.make_graph()
data_points: List[Scatter] = profile_parser.make_graph()


logfile = './data/ProfileFatso/JobFatso.log.gz'
logfile = './data/ProfileFatso/JobFatso.log.gz' # standard Spark logs
log_parser = SparkLogParser(logfile)
stage_interval_markers = log_parser.extract_stage_markers()
stage_interval_markers: Scatter = log_parser.extract_stage_markers()
data_points.append(stage_interval_markers)

layout = log_parser.extract_job_markers(700)
@@ -1,13 +1,13 @@
from plotly.offline import plot
from plotly.graph_objs import Figure
from plotly.graph_objs import Figure, Scatter
from parsers import ProfileParser, SparkLogParser
from helper import get_max_y

from typing import List

profile_file = './data/ProfileStraggler/CpuAndMemory.json.gz' # Output from JVM profiler
profile_parser = ProfileParser(profile_file, normalize=True)
# data_points = profile_parser.ignore_metrics(['ScavengeCollCount'])
data_points = profile_parser.get_metrics(['systemCpuLoad', 'processCpuLoad'])
data_points: List[Scatter] = profile_parser.get_metrics(['systemCpuLoad', 'processCpuLoad'])

log_file = './data/ProfileStraggler/JobStraggler.log.gz' # standard Spark log
log_parser = SparkLogParser(log_file)
@@ -7,3 +7,5 @@
StackParser.convert_file(argv[1])
else:
StackParser.convert_files(argv[1:])

# Made at https://github.com/g1thubhub/phil_stopwatch by writingphil@gmail.com
@@ -211,3 +211,5 @@ def fat_function_inner(i):
def secondsSleep(i):
time.sleep(1)
return i

# Made at https://github.com/g1thubhub/phil_stopwatch by writingphil@gmail.com
@@ -146,7 +146,7 @@ def open_stream(self):
else:
return open(self.filename, 'r')

def get_available_metrics(self, id=''):
def get_available_metrics(self, id='') -> List[str]:
if len(self.data_points) == 0:
self.parse_profiles(id)
return list(self.relevant_metrics.keys())
@@ -241,7 +241,7 @@ class SparkLogParser:
r_spark_log = r'.* (?:error|info|warning)(.*)'
log_types = ['error', 'info', 'warn']

def __init__(self, log_file, profile_file='', id=''):
def __init__(self, log_file, profile_file='', id='', normalize=True):
if log_file != '':
self.logfile = log_file
self.time_pattern, self.re_time_pattern, self.re_app_start, self.re_job_start, self.re_job_end = None, None, None, None, None
@@ -253,9 +253,9 @@ def __init__(self, log_file, profile_file='', id=''):
self.job_intervals = dict()
self.identify_timeformat()
if profile_file is '':
self.profile_parser = ProfileParser(self.logfile)
self.profile_parser = ProfileParser(self.logfile, normalize)
else:
self.profile_parser = ProfileParser(profile_file)
self.profile_parser = ProfileParser(profile_file, normalize)
self.id = id

def open_stream(self):
@@ -264,7 +264,7 @@ def open_stream(self):
else:
return open(self.logfile, 'r')

def get_available_metrics(self):
def get_available_metrics(self) -> List[str]:
return self.profile_parser.get_available_metrics()

def identify_timeformat(self):
@@ -857,4 +857,6 @@ def extract_job_markers(self, max=10):

if __name__ == '__main__':
log_path = '/Users/a/logs/application_1550152404841_0001'
app_parser = AppParser(log_path)
app_parser = AppParser(log_path)

# Made at https://github.com/g1thubhub/phil_stopwatch by writingphil@gmail.com
@@ -330,3 +330,5 @@ def addInPlace(self, pair1, pair2) -> Tuple[List, DefaultDict[str, int]]:
'stack': StackProfiler, 'cpumemstackprofiler': CpuMemStackProfiler, 'both': CpuMemStackProfiler,'cpumemstack': CpuMemStackProfiler,
'stackcpumem': CpuMemStackProfiler, 'cpumemprofiler': CpuMemProfiler, 'stackprofiler': StackProfiler, 'cpumemstackprofiler': CpuMemStackProfiler
}

# Made at https://github.com/g1thubhub/phil_stopwatch by writingphil@gmail.com

0 comments on commit 55c5cbc

Please sign in to comment.
You can't perform that action at this time.