refactor

#!/usr/bin/env python3
import io, os, sys, time, argparse
import shlex

from difflib import SequenceMatcher as SM
from difflib import ndiff as ndiff

#------------------------------------------------------------------------------

parser = argparse.ArgumentParser()

# thresholds and layout width
parser.add_argument('--min-block-size', type=int, default=1500)
parser.add_argument('--max-block-diff', type=int, default=500)
parser.add_argument('--similarity', type=float, default=0.8)
parser.add_argument('--diff-width', type=int, default=180)

# long switches
parser.add_argument('--no-cleanup', action='store_true')
parser.add_argument('--all-indents', action='store_true')
parser.add_argument('--all-files', action='store_true')
parser.add_argument('--python', action='store_true')

# short switches
parser.add_argument('-f', action='store_true') # file filters
parser.add_argument('-p', action='store_true') # parse only
parser.add_argument('-b', action='store_true') # show blocks
parser.add_argument('-d', action='store_true') # debug
parser.add_argument('-o', '--output')

# whatever argparse does not recognise is kept as the list of inputs
args, inputs = parser.parse_known_args()

filters = 'c,cpp,h,hh,hpp,py,go,rs,java,js,swift,php'

# nothing given on the command line: read the names from stdin, one per line
if not inputs:
   inputs = sys.stdin.read().split('\n')
   filters = ['.' + ext.strip() for ext in filters.split(',')]
   if args.f:
      # -f keeps only names ending in one of the extensions above
      inputs = [name for name in inputs if name.endswith(tuple(filters))]
   #end
#end

#------------------------------------------------------------------------------
#
#  Misc
#
#------------------------------------------------------------------------------

def spinning_cursor():
    """Endless generator that yields the four spinner frames in rotation."""
    frames = ('[|]', '[/]', '[-]', '[\\]')
    pos = 0
    while True:
        yield frames[pos]
        pos = (pos + 1) % len(frames)

# progress-line state shared with prints()
spinner = spinning_cursor()
mlen = sidx = 0   # longest input name seen so far / prints() call counter

# progress output is only drawn when stdout is a terminal
tty = 1 if sys.stdout.isatty() else 0
cols = os.get_terminal_size().columns - 1 if tty else 0

def prints(s, rate=1, force=0):
    """Draw a transient status line (spinner frame + message) on stdout.

    Does nothing when stdout is not a terminal.  Every call increments the
    module-level call counter; the line is only redrawn when the counter is
    a multiple of `rate`, or on every call when `force` is truthy.  The text
    is clipped to the terminal width, padded with spaces and ended with a
    carriage return so that the next status line overwrites it.
    """
    global mlen, sidx

    if not tty:
        return

    sidx += 1
    if sidx % rate and not force:
        return

    text = '%s %s' % (next(spinner), s)
    if len(text) > cols:
        text = text[:cols-4] + '...'

    sys.stdout.write(text.ljust(cols) + '\r')
    sys.stdout.flush()

#------------------------------------------------------------------------------
#
#  Parsing
#
#------------------------------------------------------------------------------

class block:
    """A parsed code block together with its position in the block tree.

    file     -- name of the file the block was found in
    range    -- [first line, last line]; the last line stays -1 until set
    indent   -- nesting level / indent the block was opened at
    parent   -- enclosing block, or None for an outermost block
    code     -- text collected for this block (includes nested blocks' text)
    children -- blocks opened directly inside this one
    """

    def __init__(self, file, line, indent, parent):
        self.file, self.indent, self.parent = file, indent, parent
        self.range = [line, -1]
        self.size = 0
        self.code = ""
        self.children = []
        # register with the enclosing block, if there is one
        if parent: parent.children.append(self)
    #end

    def add_code(self, c):
        """Append text `c` to this block and to every enclosing block."""
        node = self
        while node:
            node.code += c
            node = node.parent
    #end
#end

#-- Python --------------------------------------------------------------------

def process_py(fn, line, ln, indent):
    """Feed one line of Python source to the indent-driven block tracker.

    fn     -- file name recorded on any block opened here
    line   -- the line's text
    ln     -- line number recorded as block start / end
    indent -- number of leading whitespace characters of `line`

    The line is appended to the block on top of the stack (and thereby to
    its ancestors).  A deeper indent opens a new block; a shallower indent
    closes blocks until the top of the stack is at or above `indent`.
    """
    global stack, blocks

    if args.d:
        print(fn, '%5d' % ln, '%2d' % indent, end='')
        print(' [%s]' % ('.' * len(stack)), line, end='')

    top = stack[-1]

    # blank lines and lines starting with '#' or a quote never change nesting
    if not line.rstrip() or line[indent] in ('#', '"', "'"):
        top.add_code(line)
        return

    if indent > top.indent:
        # deeper indent: open a block nested in the current one
        opened = block(fn, ln, indent, top)
        opened.add_code(line)
        stack.append(opened)
        blocks.append(opened)
    elif indent == top.indent:
        top.add_code(line)
    else:
        # dedent: close blocks until the top one is not deeper than the line
        while indent < stack[-1].indent:
            stack.pop().range[1] = ln
        stack[-1].add_code(line)

#end

def process_pfile(fn, f):
    """Parse an indentation-structured (Python) file into blocks.

    fn -- file name recorded on the blocks
    f  -- iterable of text lines (an open file)

    Opens a file-level block at indent 0, then hands every line, with its
    1-based line number and its count of leading whitespace characters, to
    process_py().  The module-level `ln` tracks the current line number.
    """
    global ln

    root = block(fn, 1, 0, None)
    stack.append(root)
    blocks.append(root)

    prints(fn)

    for ln, line in enumerate(f, 1):
        # leading whitespace width = total length minus left-stripped length
        indent = len(line) - len(line.lstrip())
        process_py(fn, line, ln, indent)
    #end

#end

#-- C/C++ (curly bracket languages) -------------------------------------------

# lexer states, in order:
#   code     - normal code
#   macro    - #define ...
#   string1  - "abc"
#   string2  - 'd'
#   comment1 - // comment
#   comment2 - /* comment */
code, macro, string1, string2, comment1, comment2 = range(6)

# parser state shared by the process_* functions
state  = code
indent = 0
stack  = []   # blocks currently open, innermost last
blocks = []   # every block created so far

def process_ch(fn, ch, ln):
    """Handle one character of a curly-bracket source file.

    While the lexer state is `code`, '{' opens a block nested in the current
    top of the stack and '}' closes the top block, recording `ln` as its
    last line.  The character is then appended to the innermost block that
    is open once the brace has been handled: the enclosing block for both
    '{' and '}', so a block's own braces are not part of its code.

    Returns True normally, False (after printing a '[!] file:line' marker)
    when a '}' arrives while no block is open.
    """
    global indent

    if args.d:
        shown = '  ' if ch.isspace() else ' %s' % ch
        print(fn, '%5d' % ln, state, end='')
        print(shown, end='')
        print(' [%s]' % ('*' * len(stack)))

    owner = stack[-1] if stack else None   # block that receives `ch`

    if state == code:
        if ch == '{':
            indent += 1
            opened = block(fn, ln, indent, owner)
            stack.append(opened)
            blocks.append(opened)
        elif ch == '}':
            indent -= 1
            if not stack:
                print('[!] %s:%d' % (fn, ln)) # error
                return False
            stack.pop().range[1] = ln
            owner = stack[-1] if stack else None

    if owner: owner.add_code(ch)

    return True

#end

def process_cfile(fn, f):
    """Scan a curly-bracket source file one character at a time.

    A small state machine tracks whether the current character is normal
    code, part of a '#' line, inside a "..." or '...' literal, or inside a
    // or /* */ comment.  Every character is forwarded to process_ch(),
    which only reacts to '{' and '}' while the state is `code`.

    Backslash escapes inside literals are honoured, so an escaped quote
    such as the one in "a\\"b" or '\\'' does not end the literal.

    fn -- file name, used for blocks and for '[!] file:line' error markers
    f  -- text file object to read from

    Uses and updates the module-level `state` and `ln`.  Prints an error
    marker when braces are still unbalanced once the input is exhausted.
    """
    global state, ln

    pch = 0           # previous character handed to process_ch
    escaped = False   # in a literal: the previous char was an active backslash

    while 1:

        ch = f.read(1)
        if not ch: break
        if ch == '\n': ln += 1

        if state == code:
            if ch == '#': state = macro
            if ch == '"': state = string1
            if ch == "'": state = string2
            if ch == '/':
                # emit the slash, then look at the next char for // and /*
                if not process_ch(fn, ch, ln): break

                ch = f.read(1)
                if not ch: break
                if ch == '\n': ln += 1

                if ch == '/': state = comment1
                if ch == '*': state = comment2

        elif state == string1 or state == string2:
            closing = '"' if state == string1 else "'"
            if escaped:
                # this character is protected by the preceding backslash
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == closing:
                state = code

        elif state == comment1 or state == macro:
            # a backslash right before the newline continues the line
            if ch == '\n' and pch != '\\': state = code

        elif state == comment2:
            if ch == '/' and pch == '*': state = code
        #end

        if not process_ch(fn, ch, ln): break

        pch = ch

    #end

    if indent or stack: print('[!] %s:%d' % (fn, ln)) # error

#end

#------------------------------------------------------------------------------
#
#  Main
#
#------------------------------------------------------------------------------

for fn in inputs:

   # fresh parser state for every input
   ln, state, indent, stack = 1, code, 0, []
   mlen = max(mlen, len(fn))

   # ignore names that are not existing regular files
   if not os.path.isfile(fn): continue

   try:

      prints(fn)

      with open(fn) as f:
         # --python forces the indent-based parser for every file
         if args.python or fn.endswith('.py'):
            process_pfile(fn, f)
         else:
            process_cfile(fn, f)
         #end
      #end

   except KeyboardInterrupt:
      sys.exit(1)
   except UnicodeDecodeError:
      # undecodable input: stop parsing this file and move on
      pass

#end

# keep only blocks whose text is longer than --min-block-size
blocks = [ blk for blk in blocks if args.min_block_size < len(blk.code) ]

#
# debugging: -p / -b report what was parsed and stop before any comparison
#
if args.p or args.b:
   print(len(blocks), 'blocks')
   for b in (blocks if args.b else []):
       rule = '-' * (len(b.file) + 30)
       print(rule)
       print('%s:%d - %d' % (b.file, b.range[0], b.indent))
       print(rule)
       print(b.code, end="")
       print()
   #end
   sys.exit(0)
#end

#------------------------------------------------------------------------------
#
#  Processing
#
#------------------------------------------------------------------------------


# groups[0] collects every block; groups[<file name>] the blocks of that file
groups = {0: []}

for b in blocks:
    groups[0].append(b)
    groups.setdefault(b.file, []).append(b)
#end

# within each group, longest code first
for members in groups.values():
    members.sort(key=lambda blk: len(blk.code), reverse=True)
#end

skip = []

def skip_children(b):
    """Record the `blist` index of each descendant of `b` in `skip`.

    A child that is not present in `blist` is ignored, and its own
    children are not visited.
    """
    global skip
    for child in b.children:
        try:
            pos = blist.index(child)
        except ValueError:
            continue
        skip.append(pos)
        skip_children(child)
    #end
#end

def ancestor(a, b):
    """Return 1 when `b` is found on the parent chain of `a`, otherwise 0."""
    node = a.parent
    # climb until the chain ends or `b` is reached
    while node and not (b == node):
        node = node.parent
    #end
    return 1 if node else 0
#end

fcache = {}

# return true to skip block comparison based on filenames
def skip_unrelated_files(f1, f2):
    """Decide whether blocks from files `f1` and `f2` should not be compared.

    Never skips when --all-files is set or when both names are equal.
    Otherwise two names count as unrelated when their SequenceMatcher ratio
    is below 0.9.  Verdicts are memoised in `fcache` under (f1, f2) and are
    also found when the pair is queried in the opposite order.
    """
    if args.all_files: return False

    for pair in ((f1, f2), (f2, f1)):
        if pair in fcache: return fcache[pair]
    #end

    if f1 == f2: return False

    unrelated = SM(None, f1, f2).ratio() < 0.9
    fcache[(f1, f2)] = unrelated
    return unrelated
#end

similar = []

keys = {0} # use the global block list for now

for k in keys:

   blist = groups[k]
   total = len(blist)

   # every unordered pair (i, j) with i < j is considered once
   for i, bi in enumerate(blist):

      for j in range(i + 1, total):

         prints('comparing blocks %4d - %4d %s' % (i, total, k), rate=500)

         # one of the two is a child of an already matched block
         if i in skip or j in skip: continue

         bj = blist[j]

         # file names are unrelated
         if skip_unrelated_files(bi.file, bj.file): continue

         # one block contains the other
         if ancestor(bi, bj) or ancestor(bj, bi): continue

         # code lengths differ by more than the threshold
         if abs(len(bi.code) - len(bj.code)) > args.max_block_diff: continue

         # different indent (unless --all-indents)
         if bi.indent != bj.indent and not args.all_indents: continue

         # cheap upper bound first, exact ratio only when that passes
         sm = SM(None, bi.code, bj.code)
         if sm.quick_ratio() < args.similarity or sm.ratio() < args.similarity:
            continue

         # a match: the children of both blocks need no comparison any more
         skip_children(bi); skip_children(bj)

         # store the pair with the shorter block first
         similar.append((bi, bj) if len(bj.code) > len(bi.code) else (bj, bi))

      #end

   #end

#end

# largest matches first
similar.sort(key=lambda pair: len(pair[0].code), reverse=True)

#------------------------------------------------------------------------------
#
#  HTML formatting helpers
#
#------------------------------------------------------------------------------

VSP = '&#9474;' # vertical separator (U+2502), character reference ends in ';'
GRN = '<span style="background:#ccffcc">'
RED = '<span style="background:#ffcccc">'
GRY = '<span style="color:#cccccc">'
YEL = '<span style="color:#ffd480">'
ESP = '</span>'

def esc(s):
    """Escape `s` for use as HTML text content.

    '&' is replaced first so that the entities produced for '<' and '>'
    are not escaped a second time, and so that text which already looks
    like an entity (for example '&lt;' in the compared source) is shown
    literally instead of being interpreted by the browser.
    """
    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

def diff2html(df, bi, bj, index):
    """Render one side-by-side diff as an HTML fragment.

    df    -- iterable of lines of side-by-side diff output whose gutter
             marker sits at column (diff width / 2) - 1
    bi/bj -- the two compared blocks; their file names and start lines
             are shown in the header
    index -- number used as the anchor id of this section; 1000 is treated
             as the first section and gets no 'prev'/'up' links

    Returns the HTML text: a navigation bar, a header with both locations,
    and the diff body.  Rows marked '|' are highlighted per character,
    rows marked '<' or '>' are highlighted as a whole, and rows with any
    other non-blank marker are omitted.
    """
    width = args.diff_width
    middle = int(width/2)

    DIV  = GRY + VSP + ESP   # separator of unchanged and one-sided rows
    DIV2 = YEL + VSP + ESP   # separator of rows changed on both sides

    PRE = ('<pre style='
           '"background:white;caret-color:transparent;'
           'border:none;text-align:center">')

    out = []

    # section links (attributes must be separated by a space)
    out.append(PRE + '\n')
    out.append('<span id="%s" style="">' % index)
    out.append(' <a href="#%s">next</a>' % (index + 1))
    if index == 1000:
        out.append('</span>')
        out.append(' ' * (width - 4))
    else:
        out.append(' <a href="#%s">prev</a>' % (index - 1))
        out.append(' <a href="#%s">up</a>' % (1000))
        out.append('</span>')
        out.append(' ' * (width - 12))
    #end
    out.append('</pre>')

    # section file headers; padding is computed from the unescaped text
    H1 = ' %s:%d' % (bi.file, bi.range[0])
    H2 = ' %s:%d' % (bj.file, bj.range[0])
    out.append(PRE)
    out.append('<span style="border:1px #ccc solid;'
               'border-radius:3px;padding:5px;font-weight:bold">')
    out.append(esc(H1) + ' ' * ((middle - 1) - len(H1)))
    out.append(VSP)
    out.append(esc(H2) + ' ' * (middle - len(H2)))
    out.append('</span></pre>\n')

    # diff
    out.append(PRE + '\n')

    for line in df:

        # drop the newline and pad so both halves always exist
        line = line[:-1]
        line += ' ' * (width - len(line))

        dm = line[middle-1]      # gutter marker
        L1 = line[:middle-1]     # left column
        L2 = line[middle:]       # right column

        if dm == ' ':
            out.append('%s%s%s\n' % (esc(L1), DIV, esc(L2)))

        elif dm == '|':
            # character-level diff of the two halves; whitespace is never
            # wrapped in a highlight span
            S1 = ''
            S2 = ''
            for op in ndiff(L1, L2):
                ch = esc(op[2])
                if op[0] == ' ':
                    S1 += ch
                    S2 += ch
                elif op[0] == '+':
                    S2 += ch if ch.isspace() else GRN + ch + ESP
                elif op[0] == '-':
                    S1 += ch if ch.isspace() else RED + ch + ESP
                #end
            #end
            out.append('%s%s%s\n' % (S1, DIV2, S2))

        elif dm == '<':
            out.append('%s%s%s%s%s\n' % (RED, esc(L1), ESP, DIV, esc(L2)))

        elif dm == '>':
            out.append('%s%s%s%s%s\n' % (esc(L1), DIV, GRN, esc(L2), ESP))

        #end

    #end

    out.append('</pre>\n')

    return ''.join(out)

#end

#------------------------------------------------------------------------------
#
#  Post processing
#
#------------------------------------------------------------------------------

prints('post processing')

DN = '2>/dev/null'

# scratch directory: block dumps go to files/, diff and html output to diffs/
root = '/tmp/refactor-%d' % os.getpid()
os.system('mkdir -p %s/diffs' % root)
os.system('mkdir -p %s/files' % root)

def dash(fname):
    """Flatten a path into a single file-name component ('/' becomes '-')."""
    return fname.replace('/', '-')

index = 1000

for (bi, bj) in similar:

    # dump both blocks to files named after their origin and line range
    fstr = '%s/files/%d-%s-%d-%d'
    bifn = fstr % (root, index, dash(bi.file), bi.range[0], bi.range[1])
    bjfn = fstr % (root, index, dash(bj.file), bj.range[0], bj.range[1])
    with open(bifn, 'w') as f: f.write(bi.code)
    with open(bjfn, 'w') as f: f.write(bj.code)

    # side-by-side diff; the paths contain the input file names, so they are
    # quoted to keep spaces and shell metacharacters away from the shell
    dif = '%s/diffs/%d.diff' % (root, index)
    cmd = 'diff -t -y --width=%d %s %s > %s'
    cmd = cmd % (args.diff_width,
                 shlex.quote(bifn), shlex.quote(bjfn), shlex.quote(dif))
    os.system(cmd)

    # convert the textual diff into its HTML section
    htf = dif.replace('.diff', '.html')
    with open(dif, 'r') as f: html = diff2html(f, bi, bj, index)
    with open(htf, 'w') as f: f.write(html)

    index += 1

#end

prints('', force=1)

nsimilar = len(similar)

print(nsimilar, 'similar blocks found')

if not nsimilar: sys.exit(0)

output = ('refactor-%d.html' % os.getpid())

if args.output: output = args.output

# join all sections into the report; the user-supplied name is quoted
cmd = 'cat %s/diffs/*.html > %s %s ' % (root, shlex.quote(output), DN)
os.system(cmd)
print('>', output)

if not args.no_cleanup: os.system('rm -rf %s' % root)