In [1]:
import git
import pandas as pd
import re
import time

def get_history(git_path):
    
    starttime = time.time()
    repo = git.Repo(git_path)
    
    commits = pd.DataFrame(columns=[
        'sha',
        'authored_datetime',
        'author',
        'email',
        'message',
        'deletions',
        'insertions',
        'filename',
        'rename_from',
        'rename_to',
        'change_type'
    ])
        
    for c in repo.iter_commits('master'):

        diff_files = {}
        
        # save diff data for changes files
        for d in c.diff():
            diff_files[d.a_path] = {}
            diff_files[d.a_path]['rename_to'] = d.rename_to
            diff_files[d.a_path]['rename_from'] = d.rename_from

        stats = c.stats
        files = stats.files

        for filename in files.keys():
            modifications = files[filename]

            data = [
                str(c),
                pd.to_datetime(c.authored_datetime),
                c.author.name,
                c.author.email,
                c.message,
                modifications['deletions'],
                modifications['insertions']
                ]

            #### Treat renames
            
            # add renames that were detected by Git
            if filename in diff_files:
                rename_to = diff_files[filename]['rename_to']
                new_filename = rename_to if rename_to else filename
                # filename is renamed name
                data.append(new_filename)
                data.append(diff_files[filename]['rename_from'])
                data.append(diff_files[filename]['rename_to'])
                data.append("R")
                
            # add renames that were not detected by Git, but are extractable by filename    
            elif "=>" in filename:
                # only parts are renamed
                if "{" in filename and "}" in filename:
                    replacement = re.search('{(.*) => (.*)?}', filename)
                    assert replacement, "Replacement pattern for part rename doesn't work for filename " + filename
                    assert len(replacement.groups()) == 2, "Didn't find correct groups for part replacement"

                    rename_from = re.sub("{.*}", replacement.group(1), filename).replace("//", "/")
                    rename_to = re.sub("{.*}", replacement.group(2), filename).replace("//", "/")
                    change_type = "R"
                    # filename is renamed name
                    data.append(rename_to)
                    data.append(rename_from)
                    data.append(rename_to)
                    data.append(change_type)
                # whole file was renamed
                else:
                    replacement = re.search('(.*) => (.*)', filename)
                    assert replacement, "Replacement pattern for full rename doesn't work for filename " + filename
                    assert len(replacement.groups()) == 2, "Didn't find correct groups for full replacement"

                    rename_from = replacement.group(1)
                    rename_to = replacement.group(2)
                    change_type = "R"
                    # filename is renamed name
                    data.append(rename_to)
                    data.append(rename_from)
                    data.append(rename_to)
                    data.append(change_type)

                    
            # fill all non-renames with empty values      
            else:
                data.append(filename)
                for _ in range(0, 3):    
                    data.append(None)
                
            assert len(data)==len(commits.columns), \
                "Columns missing for data (is " + str(len(data)) + \
                ", but must be " + str(len(commits.columns)) + ". data="+ str(data)
            
            commits.loc[len(commits)] = data

    return commits


commits = get_history(r'C:\dev\repos\DropOverDemo')
commits.head()

Unnamed: 0,sha,authored_datetime,author,email,message,deletions,insertions,filename,rename_from,rename_to,change_type
0,58abf7ff8c8d45f5ada667a22a972d33914a8791,2016-11-04 20:17:09,feststelltaste,feststelltaste@googlemail.com,add IntelliJ to ignores\n,1.0,2.0,masterbuild/.gitignore,,,
1,58abf7ff8c8d45f5ada667a22a972d33914a8791,2016-11-04 20:17:09,feststelltaste,feststelltaste@googlemail.com,add IntelliJ to ignores\n,1.0,2.0,largeTests/.gitignore,,,
2,58abf7ff8c8d45f5ada667a22a972d33914a8791,2016-11-04 20:17:09,feststelltaste,feststelltaste@googlemail.com,add IntelliJ to ignores\n,1.0,2.0,backend/.gitignore,,,
3,58abf7ff8c8d45f5ada667a22a972d33914a8791,2016-11-04 20:17:09,feststelltaste,feststelltaste@googlemail.com,add IntelliJ to ignores\n,1.0,2.0,mediumTests/.gitignore,,,
4,8c686954e96a77fa1d59985762e2320dba1a8101,2016-07-22 17:43:38,Hausmann,Michael@Michael-PC,letzte Version von Michael,143.0,0.0,backend/src/main/webapp/app/libs/angular-1.5.7...,,,


In [2]:
commits = commits.iloc[::-1]
commits.head()

Unnamed: 0,sha,authored_datetime,author,email,message,deletions,insertions,filename,rename_from,rename_to,change_type
25913,45b1bf8e74870c7bb5b2d68c35eb5cc2de4a801c,2012-11-17 18:22:28,chris,chris@192.168.1.19,Initial,0.0,14.0,masterbuild/pom.xml,,,R
25912,45b1bf8e74870c7bb5b2d68c35eb5cc2de4a801c,2012-11-17 18:22:28,chris,chris@192.168.1.19,Initial,0.0,17.0,masterbuild/.project,,,R
25911,5d78e56ac122de71064a18ff757ab15c9ee333c1,2012-11-17 18:22:55,chris,chris@192.168.1.19,Initial,0.0,4.0,masterbuild/.settings/org.eclipse.m2e.core.prefs,,,R
25910,11b20729d170db8fc9c211f6b157a10b52baa190,2012-11-17 18:29:34,chris,chris@192.168.1.19,Initial,0.0,21.0,backend/pom.xml,,,R
25909,11b20729d170db8fc9c211f6b157a10b52baa190,2012-11-17 18:29:34,chris,chris@192.168.1.19,Initial,0.0,5.0,backend/target/m2e-wtp/web-resources/META-INF/...,,,R


In [3]:
def track_renames(row):

    rename_from = row['rename_from']
    rename_to = row['rename_to']
    if len(commits[commits['rename_to'] == rename_from]) > 0:
        return commits[commits['rename_to'] == rename_from].iloc[0]['source_id']
    elif row['rename_to']:
        return row.name
    else: 
        None

commits['source_id'] = commits[['rename_from', 'rename_to']].apply(track_renames, axis=1)
commits.head()

KeyError: ('source_id', 'occurred at index 25705')

In [None]:
changes = commits[commits['filename'].str.endswith(".java")][['filename', 'source_id', 'author']].\
    drop_duplicates().groupby(['filename']).agg({'source_id' : 'count', 'author' : 'count'}).\
    rename(columns={'source_id' : 'renames', 'author' : 'authors'})
changes['changes'] = changes['authors'] - changes['renames']
changes.head(10)

In [None]:
assert False, "deactivated"

import git
import pandas as pd
import re
import time

def get_history(git_path):
    
    starttime = time.time()
    repo = git.Repo(git_path)
    commit_list = []

    index_counter = 0

    for c in repo.iter_commits('master', max_count=30, skip=1):

        diff_files = {}
        
        # save diff data for changes files
        for d in c.diff():
            diff_files[d.a_path] = {}
            diff_files[d.a_path]['rename_to'] = d.rename_to
            diff_files[d.a_path]['rename_from'] = d.rename_from

        stats = c.stats
        files = stats.files

        for filename in files.keys():
            modifications = files[filename]

            index_counter = index_counter + 1
            commit = pd.DataFrame(index=[index_counter])
            commit['sha'] = str(c)
            commit['timestamp'] = pd.to_datetime(c.authored_datetime)
            commit['author'] =  c.author.name
            commit['email'] = c.author.email
            commit['message'] = c.message
            commit['deletions'] = modifications['deletions']
            commit['insertions'] = modifications['insertions']
            
            commit_list.append(commit)
            
            
        
    return pd.concat(commit_list)


commits = get_history(r'C:\dev\repos\DropOverDemo')
commits.head()