Skip to content

Commit

Permalink
Add support for parallel analysis of repositories
Browse files Browse the repository at this point in the history
When running on multiple large repositories, gitlogg used to
analyze these in a sequential manner. This is fine as long as disk
speed is the bottle neck. Calling git log is a CPU expensive
operation, however, so even single SSD machines can benefit from a
performance boost by parallelizing the work.

Per default, we start n-1 sub processes, where n is the computer's
number of CPUs. This ensures we do not overload the system in case
of a desktop computer. With the -n parameter, the user can however
specify an arbitrary number of processes to spawn.

This mandated the following changes:

 - Refactoring out the core part of the script to be able to spawn
   it via xargs.

 - Refactoring out the terminal color setup to use it from both the
   main and the worker script.

 - Adding a CLI option parser and usage info.
  • Loading branch information
Inventitech committed Nov 21, 2016
1 parent b3f7f98 commit 99d58d3
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 34 deletions.
16 changes: 16 additions & 0 deletions gitlogg/colors.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/bash

# Extracted command line colors for nicer output

# Text Reset
RCol='\033[0m'

# Regular Bold Underline High Intensity BoldHigh Intens Background High Intensity Backgrounds
Bla='\033[0;30m'; BBla='\033[1;30m'; UBla='\033[4;30m'; IBla='\033[0;90m'; BIBla='\033[1;90m'; On_Bla='\033[40m'; On_IBla='\033[0;100m';
Red='\033[0;31m'; BRed='\033[1;31m'; URed='\033[4;31m'; IRed='\033[0;91m'; BIRed='\033[1;91m'; On_Red='\033[41m'; On_IRed='\033[0;101m';
Gre='\033[0;32m'; BGre='\033[1;32m'; UGre='\033[4;32m'; IGre='\033[0;92m'; BIGre='\033[1;92m'; On_Gre='\033[42m'; On_IGre='\033[0;102m';
Yel='\033[0;33m'; BYel='\033[1;33m'; UYel='\033[4;33m'; IYel='\033[0;93m'; BIYel='\033[1;93m'; On_Yel='\033[43m'; On_IYel='\033[0;103m';
Blu='\033[0;34m'; BBlu='\033[1;34m'; UBlu='\033[4;34m'; IBlu='\033[0;94m'; BIBlu='\033[1;94m'; On_Blu='\033[44m'; On_IBlu='\033[0;104m';
Pur='\033[0;35m'; BPur='\033[1;35m'; UPur='\033[4;35m'; IPur='\033[0;95m'; BIPur='\033[1;95m'; On_Pur='\033[45m'; On_IPur='\033[0;105m';
Cya='\033[0;36m'; BCya='\033[1;36m'; UCya='\033[4;36m'; ICya='\033[0;96m'; BICya='\033[1;96m'; On_Cya='\033[46m'; On_ICya='\033[0;106m';
Whi='\033[0;37m'; BWhi='\033[1;37m'; UWhi='\033[4;37m'; IWhi='\033[0;97m'; BIWhi='\033[1;97m'; On_Whi='\033[47m'; On_IWhi='\033[0;107m';
56 changes: 24 additions & 32 deletions gitlogg/gitlogg-generate-log.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
#!/usr/bin/env bash
#!/bin/bash

# Text Reset
RCol='\033[0m'
source "colors.sh"

# Regular Bold Underline High Intensity BoldHigh Intens Background High Intensity Backgrounds
Bla='\033[0;30m'; BBla='\033[1;30m'; UBla='\033[4;30m'; IBla='\033[0;90m'; BIBla='\033[1;90m'; On_Bla='\033[40m'; On_IBla='\033[0;100m';
Red='\033[0;31m'; BRed='\033[1;31m'; URed='\033[4;31m'; IRed='\033[0;91m'; BIRed='\033[1;91m'; On_Red='\033[41m'; On_IRed='\033[0;101m';
Gre='\033[0;32m'; BGre='\033[1;32m'; UGre='\033[4;32m'; IGre='\033[0;92m'; BIGre='\033[1;92m'; On_Gre='\033[42m'; On_IGre='\033[0;102m';
Yel='\033[0;33m'; BYel='\033[1;33m'; UYel='\033[4;33m'; IYel='\033[0;93m'; BIYel='\033[1;93m'; On_Yel='\033[43m'; On_IYel='\033[0;103m';
Blu='\033[0;34m'; BBlu='\033[1;34m'; UBlu='\033[4;34m'; IBlu='\033[0;94m'; BIBlu='\033[1;94m'; On_Blu='\033[44m'; On_IBlu='\033[0;104m';
Pur='\033[0;35m'; BPur='\033[1;35m'; UPur='\033[4;35m'; IPur='\033[0;95m'; BIPur='\033[1;95m'; On_Pur='\033[45m'; On_IPur='\033[0;105m';
Cya='\033[0;36m'; BCya='\033[1;36m'; UCya='\033[4;36m'; ICya='\033[0;96m'; BICya='\033[1;96m'; On_Cya='\033[46m'; On_ICya='\033[0;106m';
Whi='\033[0;37m'; BWhi='\033[1;37m'; UWhi='\033[4;37m'; IWhi='\033[0;97m'; BIWhi='\033[1;97m'; On_Whi='\033[47m'; On_IWhi='\033[0;107m';
# Display system usage and exit
usage()
{
cat <<\USAGE_EOF
usage: gitlogg-generate-log.sh [options]
The following options are available
-n X: Run X instances of gitlogg concurrently
USAGE_EOF
exit 2
}

test "$1" = "--help" && usage

# Use n-1 processors per default, so the system is not overloaded
NUM_THREADS=$(($(getconf _NPROCESSORS_ONLN)-1))
test $NUM_THREADS -lt 1 && NUM_THREADS=1

test "$1" = "-n" && NUM_THREADS=$2
echo -e "${Blu}Info: Calculating in $NUM_THREADS thread(s)${RCol}"

# define the absolute path to the directory that contains all your repositories.
yourpath=../repos/
Expand All @@ -29,6 +38,7 @@ esac
# 'thepath' sets the path to each repository under 'yourpath' (the trailing asterix [*/] represents all the repository folders).
thepath="${yourpathSanitized}*/"


# function to trim whitespace
trim() {
local var="$*"
Expand Down Expand Up @@ -56,26 +66,8 @@ SECONDS=0
# if the path exists and is not empty
if [ -d "${yourpathSanitized}" ] && [ "$(ls $yourpathSanitized)" ]; then
echo -e "${Yel}Generating ${Pur}git log ${Yel}for ${reporef} located at ${Red}'${thepath}'${Yel}. ${Blu}This might take a while!${RCol}"
for dir in $thepath
do
(cd $dir &&
echo -e "${Whi}Outputting ${Pur}${PWD##*/}${RCol}" >&2 &&
git log --all --no-merges --shortstat --reverse --pretty=format:'commits\trepository\t'"${PWD##*/}"'\tcommit_hash\t%H\tcommit_hash_abbreviated\t%h\ttree_hash\t%T\ttree_hash_abbreviated\t%t\tparent_hashes\t%P\tparent_hashes_abbreviated\t%p\tauthor_name\t%an\tauthor_name_mailmap\t%aN\tauthor_email\t%ae\tauthor_email_mailmap\t%aE\tauthor_date\t%ad\tauthor_date_RFC2822\t%aD\tauthor_date_relative\t%ar\tauthor_date_unix_timestamp\t%at\tauthor_date_iso_8601\t%ai\tauthor_date_iso_8601_strict\t%aI\tcommitter_name\t%cn\tcommitter_name_mailmap\t%cN\tcommitter_email\t%ce\tcommitter_email_mailmap\t%cE\tcommitter_date\t%cd\tcommitter_date_RFC2822\t%cD\tcommitter_date_relative\t%cr\tcommitter_date_unix_timestamp\t%ct\tcommitter_date_iso_8601\t%ci\tcommitter_date_iso_8601_strict\t%cI\tref_names\t%d\tref_names_no_wrapping\t%D\tencoding\t%e\tsubject\t%s\tsubject_sanitized\t%f\tcommit_notes\t%N\tstats\t' |
sed '/^[ \t]*$/d' | # remove all newlines/line-breaks, including those with empty spaces
tr '\n' '' | # convert newlines/line-breaks to a character, so we can manipulate it without much trouble
tr '\r' ' ' | # replace carriage returns with a space, so we avoid new lines popping from placeholders that allow user input
sed 's/t貌commits/t貌貌commits/g' | # because some commits have no stats, we have to create an extra line-break to make `paste -d ' ' - -` consistent
tr '' '\n' | # bring back all line-breaks
sed '{
N
s/[)]\n\ncommits/)\
commits/g
}' | # some rogue mystical line-breaks need to go down to their knees and beg for mercy, which they're not getting
paste -d ' ' - - | # collapse lines so that the `shortstat` is merged with the rest of the commit data, on a single line
awk '{print NR"\\t",$0}' | # print line number in front of each line, along with the `\t` delimiter
sed 's/\\t\ commits\\trepo/\\t\commits\\trepo/g' # get rid of the one space that shouldn't be there
)
done > gitlogg.tmp
dirs=$(ls -d $thepath)
echo $dirs | xargs -n 1 -P $NUM_THREADS ./output-intermediate-gitlog.sh > gitlogg.tmp
echo -e "${Gre}The file ${Blu}./gitlogg.tmp ${Gre}generated in${RCol}: ${SECONDS}s" &&
babel gitlogg-parse-json.js | node # only parse JSON if we have a source to parse it from
# if the path exists but is empty
Expand Down
4 changes: 2 additions & 2 deletions gitlogg/gitlogg.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/usr/bin/env bash
#!/bin/bash

bash gitlogg-generate-log.sh
bash gitlogg-generate-log.sh $@
26 changes: 26 additions & 0 deletions gitlogg/output-intermediate-gitlog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# Returns an intermediate representation of git log with the given repository to stdout

source "colors.sh"

test "$1" || exit 1
dir=$1

cd $dir &&
echo -e "${Whi}Outputting ${Pur}${PWD##*/}${RCol}" >&2 &&
git log --all --no-merges --shortstat --reverse --pretty=format:'commits\trepository\t'"${PWD##*/}"'\tcommit_hash\t%H\tcommit_hash_abbreviated\t%h\ttree_hash\t%T\ttree_hash_abbreviated\t%t\tparent_hashes\t%P\tparent_hashes_abbreviated\t%p\tauthor_name\t%an\tauthor_name_mailmap\t%aN\tauthor_email\t%ae\tauthor_email_mailmap\t%aE\tauthor_date\t%ad\tauthor_date_RFC2822\t%aD\tauthor_date_relative\t%ar\tauthor_date_unix_timestamp\t%at\tauthor_date_iso_8601\t%ai\tauthor_date_iso_8601_strict\t%aI\tcommitter_name\t%cn\tcommitter_name_mailmap\t%cN\tcommitter_email\t%ce\tcommitter_email_mailmap\t%cE\tcommitter_date\t%cd\tcommitter_date_RFC2822\t%cD\tcommitter_date_relative\t%cr\tcommitter_date_unix_timestamp\t%ct\tcommitter_date_iso_8601\t%ci\tcommitter_date_iso_8601_strict\t%cI\tref_names\t%d\tref_names_no_wrapping\t%D\tencoding\t%e\tsubject\t%s\tsubject_sanitized\t%f\tcommit_notes\t%N\tstats\t' |
sed '/^[ \t]*$/d' | # remove all newlines/line-breaks, including those with empty spaces
tr '\n' '' | # convert newlines/line-breaks to a character, so we can manipulate it without much trouble
tr '\r' ' ' | # replace carriage returns with a space, so we avoid new lines popping from placeholders that allow user input
sed 's/t貌commits/t貌貌commits/g' | # because some commits have no stats, we have to create an extra line-break to make `paste -d ' ' - -` consistent
tr '' '\n' | # bring back all line-breaks
sed '{
N
s/[)]\n\ncommits/)\
commits/g
}' | # some rogue mystical line-breaks need to go down to their knees and beg for mercy, which they're not getting
paste -d ' ' - - | # collapse lines so that the `shortstat` is merged with the rest of the commit data, on a single line
awk '{print NR"\\t",$0}' | # print line number in front of each line, along with the `\t` delimiter
sed 's/\\t\ commits\\trepo/\\t\commits\\trepo/g' # get rid of the one space that shouldn't be there

0 comments on commit 99d58d3

Please sign in to comment.