-
Notifications
You must be signed in to change notification settings - Fork 0
/
svn_import.sh
executable file
·271 lines (216 loc) · 8.95 KB
/
svn_import.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/bin/bash
# Collection of various functions handy for importing SVN repository to git.
#
# General usage: ./svn_import <function> <params>
#
############################################################3
# Import SVN repo using svn2git tool.
# starting with rev 805 - precursor of version 1.6.0
function import {
OPT="-v --revision 805 --notags --authors authors --exclude 'JB-deal.ii-richards' --exclude 'branches/JB-richards-anal' --exclude 'PE-IntepolationFactory' --exclude 'third_party/armadillo.' --exclude '.*[.]pos' --exclude 'third_party/boost.*'"
if [ "$1" == "rebase" ]
then
svn2git --rebase $OPT
else
svn2git https://dev.nti.tul.cz/repos/flow123d $OPT
fi
}
################################################################
# Call gc; determine size of the repository
function repo_size {
git gc
git bundle create tmp.bundle --all
du -sh tmp.bundle
}
################################################################
# prints 'HEAD' if the first parameter is in HEAD (?? not sure if we go through all branches)
function is_file_in_head {
if git ls-tree -r HEAD | grep -q "$1"
then
echo "HEAD"
else
echo "old"
fi
}
################################################################
# List all files larger then $SIZE_LIMIT together with
# filename, blob hash and compressed size
# Output the list into 'bigtosmall.tmp'
function list_large_files {
SIZE_LIMIT=200000
git rev-list --objects --all | sort -k 2 > allfileshas.txt
git verify-pack -v .git/objects/pack/pack-*.idx | grep -v chain | awk -v lim=$SIZE_LIMIT '$3 >lim' | sort -k3nr >bigobjects.txt
for SHA in `cut -f 1 -d\ < bigobjects.txt`; do
file=$(grep $SHA allfileshas.txt | awk '{print $2}')
echo $file $(is_file_in_head $file) $(grep $SHA bigobjects.txt) >> bigtosmall.tmp
done;
cat bigtosmall.tmp | column -t > bigtosmall.txt
rm bigtosmall.tmp
}
################################################################
# Removes file $1 from the history. Filter branch only from the first commit
# to speedup the process.
#
# Special treatment for the initial commit, since it has no parent.
function remove_from_history {
set -x
export FILE_TO_DELETE=$1
SHA=`git log --all --oneline -- $FILE_TO_DELETE | awk '{print $1}' | tail -n 1`
if [ -n "$SHA" ]
then
initial_commit=`git rev-list --oneline --max-parents=0 HEAD | awk '{print $1}'`
if [ "$SHA" == "$initial_commit" ]
then
range=""
else
range="HEAD ^$SHA"
fi
# go through from the first commit of the file, in all branches
git filter-branch -f --prune-empty --index-filter 'git rm -rf --cached --ignore-unmatch $FILE_TO_DELETE' --tag-name-filter cat -- --all $range
fi
}
################################################################
# Usage:
# 1) call list_large_files
# 2) possibly remove or add some lines in bigtosmall.txt
# 3) call clean_history - this removes every file/directory in first column of bigtosmall.txt; that is not in HEAD
#
# This works, but is rather slow; faster solution could pass just once through the history
# and at every commit delete files from bigtosmall.txt
function clean_history {
# take only those, that are not in HEAD
cat bigtosmall.txt | grep -v "HEAD" \
| while read file rest
do
remove_from_history $file
done
}
##################################################################
# This works very well, but still miss some files.
function clean_history_2 {
export files=`cat bigtosmall.txt | grep -v "HEAD"`
git filter-branch -f --prune-empty --index-filter 'git rm -rf --cached --ignore-unmatch $files' --tag-name-filter cat -- --all
}
#function prune_history {
# while
# list_large_files
#}
function repo_export {
TARGET=$1
if [ -n "$TARGET" ]
then
rm -rf ${TARGET}
git clone --no-hardlinks ./.git $TARGET
cp ./svn_import.sh $TARGET
fi
}
################################################################
# Pass through the history and delete big files. Fails since we have to delete the file also in
# subsequent commits.
#
#
# Pass through all commits (in all branches), at every commit (save the HEAD) delete all files larger then $1.
# This should work, but it doesn't. Namelly the largest file test_units/mesh/pokus7.msh, is not part of tree at HEAD so
# git rm is not called for it, which is the only difference to:
# git filter-branch ... --index-filter 'git rm ... -rf test_units/mesh/pokus7.msh' -- --all
# ... which works.
#
# Conclusion: use list_large_files or list_large_files_full to select files;
# then delete them through whole history (exclude files present at HEAD)
# possibly can be combined with remove_large_files - which also has some impact on the size (150 -> 120MB)
#
function remove_large_files {
export SIZE_LIMIT=$1
echo "SIZE LIMIT: $SIZE_LIMIT"
# only big files in tree
# ls-tree fields: <mode> SP <type> SP <object> SP <object size> TAB <file>
git filter-branch --prune-empty --tree-filter '
#set -x;
head_id=`git rev-parse HEAD`;
TREE_BIG_FILES="`git ls-tree -rl $commit | awk -v lim=$SIZE_LIMIT "\\$4 > lim" | grep -v "/gtest-1.6.0" | grep -v "src/" | awk "{print \\$5}"`";
git rm -rf --cached --ignore-unmatch $TREE_BIG_FILES;
if [ "$GIT_COMMIT" != "$head_id" ];
then
git rm -rf --cached --ignore-unmatch third_party/* >/dev/null;
fi
' --tag-name-filter cat -- --all
}
############################################################
# Inefficient version of git rev-list ...
function get_blob_refs {
obj_name="$1"
shift
git log "$@" --pretty=format:'%T %h %s' \
| while read tree commit subject ; do
if git ls-tree -r $tree | grep -q "$obj_name" ; then
echo $commit "$subject"
fi
done
}
################################################################
# Slow and complex function to get informations provided by list_large_files
# together with all commits wher the file was changed.
#
# More easily this additional information (if needed) is provided by commad:
# git rev-list --all --oneline -- $file_path
#
function list_large_files_full {
# Shows you the largest objects in your repo's pack file.
# (use git gc before)
#
# @see http://stubbisms.wordpress.com/2009/07/10/git-script-to-show-largest-pack-objects-and-trim-your-waist-line/
# @author Antony Stubbs
# set the internal field spereator to line break, so that we can iterate easily over the verify-pack output
# list all objects including their size, sort by size, take top 10
objects=``
echo "All sizes are in kB's. The pack column is the size of the object, compressed, inside the pack file."
output="size,pack,SHA,location"
min_size=200000
# verify-pack fields: sha object_type size compressed
#BIG_FILES="`git verify-pack -v .git/objects/pack/pack-*.idx | grep -v chain | awk -v lim=$min_size '$3 >$lim' | sort -k3nr`"
BIG_FILES="`cat xx | grep -v chain `" #| awk -v lim=$min_size '$3 >lim' | sort -k3nr`"
rm -f missing_big
rm -f size_mismatch
rm -f found_files
# go through commits
git log --pretty=format:'%T %H' \
| while read tree commit remains
do
# only big files in tree
# ls-tree fields: <mode> SP <type> SP <object> SP <object size> TAB <file>
TREE="`git ls-tree -rl $commit | awk -v lim=$min_size '$4 > lim' | sort -k4nr`"
# list files changed in current commit
git show --pretty="format:" --name-only $commit | grep -v '^ *$' > FILES
# go through intersection (changed big files)
echo "$TREE" | grep -f FILES \
| while read a b hash size file rest
do
size=$(( $size / 1024 ))
# get blob size and compressed size from report of packed repository
line=`echo "$BIG_FILES" | grep "$hash"`
fields=( $line )
if [ "${#fields[@]}" -gt "0" ]
then
# detect if the file is in HEAD
if git ls-tree -r HEAD | grep -q $hash
then
HEAD="HEAD"
else
HEAD="old"
fi
pack_size=$(( ${fields[2]} / 1024))
compressed_size=$(( ${fields[3]} / 1024 ))
echo "$size,$pack_size,$compressed_size,$file,$HEAD,$commit" >>found_files
# echo "$line" >> big_found
else
echo "$commit $hash $size $file" >>missing_big
fi
done
done
cat found_files | column -t -s ", " | sort -nr > potentialy_big
}
#########################################################################
FUNC=$1
shift
${FUNC} "$@"
#git filter-branch -f --prune-empty --index-filter 'git rm -rf --cached --ignore-unmatch doc/reference_manual/tests_graphics/mel_167s-170_velocity.pdf' --tag-name-filter cat -- 2c56273..