# From Git Log to CSV
### This Notebook has to be placed in the Cockpit Repository and will create a CSV file containing alle path based code changes between 2 defined Tags

In [236]:
#!/usr/bin/env python3
import fileinput
import re
import subprocess

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm

## Get all Commits between 2 Tags

### prepare commits messages

In [237]:
start_tag = '2.0.0'
end_tag = '4.0.0'
file_name = 'git-log-' + start_tag + '-' + end_tag + '.csv'
!git log --no-merges {end_tag}...{start_tag} > {file_name}
all_tags = !git tag

In [238]:
# remove wrong commit words
a_file = open(file_name, "r")
lines = a_file.readlines()
a_file.close()

new_file = open(file_name, "w")
for line in lines:
    if ' commit ' or 'turning autocommit on and off' not in line:
        new_file.write(line)
new_file.close()

In [239]:
# remove line breaks
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace('\n', ''), end='')

In [240]:
# remove ,
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace(',', ''), end='')

In [241]:
# set every commit in a new line
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace('commit ', '\n'), end='')

In [242]:
# print how many commits were found
num_lines = sum(1 for line in open(file_name))
print(str(num_lines) + ' commits found between ' + start_tag + ' and ' + end_tag)

6627 commits found between 2.0.0 and 4.0.0


In [243]:
# replace Author: with ,
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace('Author: ', ','), end='')

In [244]:
# print how many valid commits
num_lines = sum(1 for line in open(file_name))
print(str(num_lines) + ' valid commits found between ' + start_tag + ' and ' + end_tag)

6627 valid commits found between 2.0.0 and 4.0.0


In [245]:
# remove wrong lines
a_file = open(file_name, "r")
lines = a_file.readlines()
a_file.close()

new_file = open(file_name, "w")
for line in lines:
    if '>' in line:
        new_file.write(line)
new_file.close()

In [246]:
# replace Date:    with ,
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace('Date:   ', ','), end='')

In [247]:
# remove lines without date
a_file = open(file_name, "r")
lines = a_file.readlines()
a_file.close()

new_file = open(file_name, "w")
for line in lines:
    if '00    ' in line:
        new_file.write(line)
new_file.close()

In [248]:
# set , behind date
with fileinput.FileInput(file_name, inplace=True, mode='r') as file:
    for line in file:
        print(line.replace('00    ', '00,'), end='')

In [249]:
# print how many valid commits
num_lines = sum(1 for line in open(file_name))
print(str(num_lines) + ' valid commits found between ' + start_tag + ' and ' + end_tag)

6568 valid commits found between 2.0.0 and 4.0.0


# Get the diff for every commit, extract its features and write every new path in a new row

### add diff to commits

In [250]:
original_data = pd.read_csv(file_name, names=["commit", "author", "date", "message"])
# add columns
original_data['diffLength'] = ''
original_data['isBugfix'] = ''
original_data['isBuggy'] = ''
original_data['path'] = ''
original_data['pathLength'] = ''
original_data['pathType'] = ''
original_data['tagIndex'] = ''
# JS Features (57,7% of CodeBase is JS)
original_data['countArguments'] = ''
original_data['countAwait'] = ''
original_data['countBreak'] = ''
original_data['countCatch'] = ''
original_data['countDebugger'] = ''
original_data['countElse'] = ''
original_data['countEnum'] = ''
original_data['countExport'] = ''
original_data['countExtends'] = ''
original_data['countFor'] = ''
original_data['countFunctions'] = ''
original_data['countIfs'] = ''
original_data['countImport'] = ''
original_data['countInterface'] = ''
original_data['countLet'] = ''
original_data['countNew'] = ''
original_data['countPrivate'] = ''
original_data['countProtected'] = ''
original_data['countPublic'] = ''
original_data['countReturn'] = ''
original_data['countStatic'] = ''
original_data['countThis'] = ''
original_data['countThrow'] = ''
original_data['countVar'] = ''
# PHP Features (29,8 of CodeBase is PHP)
original_data['countAbstract'] = ''
original_data['countGoto'] = ''
original_data['countInclude'] = ''
original_data['countPrint'] = ''
original_data['countRequire'] = ''
original_data['countYield'] = ''

original_data.describe()

Unnamed: 0,commit,author,date,message,diffLength,isBugfix,isBuggy,path,pathLength,pathType,...,countStatic,countThis,countThrow,countVar,countAbstract,countGoto,countInclude,countPrint,countRequire,countYield
count,6568,6568,6568,6568,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,...,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0
unique,6568,14,6567,6038,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
top,4f7ddab23962a62c1bf2c8883d734e3b61aa965b,Daniel Borchers <daniel.borchers@fsz.de>,Wed Apr 17 12:35:35 2019 +0200,CHANGE - DocumentManagement - pre release for ...,,,,,,,...,,,,,,,,,,
freq,1,3443,2,18,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,...,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0,6568.0


In [251]:
def getPath(string): 
    keyword = ' '
    before_keyword, keyword, after_keyword = string.partition(keyword)
    return before_keyword

def getPathType(string): 
    path, pathType = os.path.splitext(string)
    return pathType

def getTagIndex(commit):
    tag = !git describe $commit
    tag = str(tag)
    keyword = '-'
    before_keyword, keyword, after_keyword = tag.partition(keyword)
    tag = before_keyword
    tag = tag.replace("\n", "")
    tag = tag.replace(" ", "")
    tag = tag.replace("['", "")
    tag = tag.replace("']", "")
    tagIndex = all_tags.index(tag)
    tagIndex
    return tagIndex


In [252]:
# iterate through each row and write features to dataframe
restString = ''
isBugfix = 0
for x in range(len(original_data.index)):
#for x in range(20):
    try:
        commit = original_data['commit'][x]
        diff = subprocess.run(['git', 'show',  commit], stdout=subprocess.PIPE).stdout.decode('utf-8')
        y = 0
        n = diff.count('diff --git a/')
        for y in range(n):
            keyword = 'diff --git a/'
            before_keyword, keyword, after_keyword = diff.partition(keyword)
            mystring2 = after_keyword
            if re.search('bugfix', original_data['message'][x], re.IGNORECASE):
                isBugfix = 1
            else:
                isBugfix = 0
            if y == n:
                diff = after_keyword
            else:
                before_keyword, keyword, after_keyword = mystring2.partition(keyword) 
                string = before_keyword
                diff = after_keyword
            if y < 1:
                original_data['path'][x] = getPath(string)
                original_data['pathType'][x] = getPathType(original_data['path'][x])
                original_data['isBugfix'][x] = isBugfix
                original_data['diffLength'][x] = len(string)
                original_data['countFunctions'][x] = string.count('function() ')

                original_data['countArguments'][x] = string.count('Arguments')
                original_data['countAwait'][x] = string.count('Await')
                original_data['countBreak'][x] = string.count('Break')
                original_data['countCatch'][x] = string.count('Catch')
                original_data['countDebugger'][x] = string.count('Debugger')
                original_data['countElse'][x] = string.count('Else')
                original_data['countEnum'][x] = string.count('Enum')
                original_data['countExport'][x] = string.count('Export')
                original_data['countExtends'][x] = string.count('Extends')
                original_data['countFor'][x] = string.count('For')
                original_data['countIfs'][x] = string.count('If')
                original_data['countImport'][x] = string.count('Import')
                original_data['countInterface'][x] = string.count('Interface')
                original_data['countLet'][x] = string.count('Let')
                original_data['countNew'][x] = string.count('New')
                original_data['countPrivate'][x] = string.count('Private')
                original_data['countProtected'][x] = string.count('Protected')
                original_data['countPublic'][x] = string.count('Public')
                original_data['countReturn'][x] = string.count('Return')
                original_data['countStatic'][x] = string.count('Static')
                original_data['countThis'][x] = string.count('This')
                original_data['countThrow'][x] = string.count('Throw')
                original_data['countVar'][x] = string.count('Var')

                original_data['countAbstract'][x] = string.count('Abstract')
                original_data['countGoto'][x] = string.count('Goto')
                original_data['countInclude'][x] = string.count('Include')
                original_data['countPrint'][x] = string.count('Print')
                original_data['countRequire'][x] = string.count('Require')
                original_data['countYield'][x] = string.count('Yield')

                original_data['tagIndex'][x] = getTagIndex(original_data['commit'][x])
            else:
                new_ind = len(original_data.index) + 1
                path = getPath(string)
                original_data.loc[new_ind] = {'commit': original_data['commit'][x], 'author': original_data['author'][x], 'date': original_data['date'][x], 'message': original_data['message'][x], 'path': getPath(string), 'pathType': getPathType(path)}
                original_data['isBugfix'][new_ind] = isBugfix
                original_data['diffLength'][new_ind] = len(string)
                original_data['countFunctions'][new_ind] = string.count('function() ')

                original_data['countArguments'][x] = string.count('arguments')
                original_data['countAwait'][x] = string.count('await')
                original_data['countBreak'][x] = string.count('break')
                original_data['countCatch'][x] = string.count('catch')
                original_data['countDebugger'][x] = string.count('debugger')
                original_data['countElse'][x] = string.count('else')
                original_data['countEnum'][x] = string.count('enum')
                original_data['countExport'][x] = string.count('export')
                original_data['countExtends'][x] = string.count('extends')
                original_data['countFor'][x] = string.count('for')
                original_data['countIfs'][x] = string.count('if')
                original_data['countImport'][x] = string.count('import')
                original_data['countInterface'][x] = string.count('interface')
                original_data['countLet'][x] = string.count('let')
                original_data['countNew'][x] = string.count('new')
                original_data['countPrivate'][x] = string.count('private')
                original_data['countProtected'][x] = string.count('protected')
                original_data['countPublic'][x] = string.count('public')
                original_data['countReturn'][x] = string.count('return')
                original_data['countStatic'][x] = string.count('static')
                original_data['countThis'][x] = string.count('this')
                original_data['countThrow'][x] = string.count('throw')
                original_data['countVar'][x] = string.count('var')

                original_data['countAbstract'][x] = string.count('abstract')
                original_data['countGoto'][x] = string.count('goto')
                original_data['countInclude'][x] = string.count('include')
                original_data['countPrint'][x] = string.count('print')
                original_data['countRequire'][x] = string.count('require')
                original_data['countYield'][x] = string.count('yield')

                original_data['tagIndex'][new_ind] = getTagIndex(original_data['commit'][new_ind])
    except:
        pass
original_data

Unnamed: 0,commit,author,date,message,diffLength,isBugfix,isBuggy,path,pathLength,pathType,...,countStatic,countThis,countThrow,countVar,countAbstract,countGoto,countInclude,countPrint,countRequire,countYield
0,4f7ddab23962a62c1bf2c8883d734e3b61aa965b,thomas <thomas.korte@fsz.de>,Tue Nov 3 14:13:59 2020 +0100,Bugfix: fixed issue where users without full C...,545,1,,php/base/module/detail/DynamicFormActionModule...,,.php,...,0,0,0,0,0,0,0,0,0,0
1,9e12609d756fb3718e6ef23e859e4bfa89266f0c,Daniel Borchers <daniel.borchers@fsz.de>,Tue Nov 3 10:50:51 2020 +0100,ENHANCEMENT - CiBrowserOverview - schema switc...,1658,0,,resources/js/base/CiBrowser/CiBrowserOverview.js,,.js,...,0,0,0,0,0,0,0,0,0,0
2,4d4a292da7cbac135492260fa467e17a9a04de51,Daniel Borchers <daniel.borchers@fsz.de>,Tue Nov 3 10:50:19 2020 +0100,BUGFIX - DynamicForm - ActionController - remo...,922,1,,resources/js/base/BaseView/FormView/DynamicFor...,,.js,...,0,0,0,0,0,0,0,0,0,0
3,b51799cca9b75b69bd5e25ca5fe3cecd3658eb22,Daniel Borchers <daniel.borchers@fsz.de>,Tue Nov 3 09:40:36 2020 +0100,BUGFIX - messageCatalog - hardened code,561,1,,resources/js/base/helper/MessageCatalog.js,,.js,...,0,0,0,0,0,0,0,0,0,0
4,9064f0108f758a13cbf334f11d0fbc376e3e4052,Daniel Borchers <daniel.borchers@fsz.de>,Tue Nov 3 09:37:22 2020 +0100,ENHANCEMENT - ItemType - added schema switch,1493,0,,php/base/service/overview/ItemTypePermissionOv...,,.php,...,0,6,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8794,e197f6299d2c1b297f7d324142364e3c3b7e1222,Daniel Borchers <daniel.borchers@fsz.de>,Mon Mar 5 15:17:41 2018 +0100,ENHANCEMENT - TreeMenu - duplication of nodes ...,3354,0,,resources/js/base/form/Button.js,,.js,...,,,,,,,,,,
8795,e197f6299d2c1b297f7d324142364e3c3b7e1222,Daniel Borchers <daniel.borchers@fsz.de>,Mon Mar 5 15:17:41 2018 +0100,ENHANCEMENT - TreeMenu - duplication of nodes ...,1212,0,,resources/js/base/panel/admin/MetaModelEditor/...,,.js,...,,,,,,,,,,
8796,e197f6299d2c1b297f7d324142364e3c3b7e1222,Daniel Borchers <daniel.borchers@fsz.de>,Mon Mar 5 15:17:41 2018 +0100,ENHANCEMENT - TreeMenu - duplication of nodes ...,9650,0,,resources/js/base/tree/TreeNodeUI.js,,.js,...,,,,,,,,,,
8797,e197f6299d2c1b297f7d324142364e3c3b7e1222,Daniel Borchers <daniel.borchers@fsz.de>,Mon Mar 5 15:17:41 2018 +0100,ENHANCEMENT - TreeMenu - duplication of nodes ...,1751,0,,resources/js/cockpit/fbplugins/DataTablePlugin...,,.js,...,,,,,,,,,,


In [253]:
# safe data
df = original_data
df['tagIndex'] = pd.to_numeric(df['tagIndex'])
df['isBugfix'] = pd.to_numeric(df['isBugfix'])
df['isBuggy'] = 0
df['countFunctions'] = pd.to_numeric(df['countFunctions'])

df['countArguments'] = pd.to_numeric(df['countArguments'])
df['countAwait'] = pd.to_numeric(df['countAwait'])
df['countBreak'] = pd.to_numeric(df['countBreak'])
df['countCatch'] = pd.to_numeric(df['countCatch'])
df['countDebugger'] = pd.to_numeric(df['countDebugger'])
df['countElse'] = pd.to_numeric(df['countElse'])
df['countEnum'] = pd.to_numeric(df['countEnum'])
df['countExport'] = pd.to_numeric(df['countExport'])
df['countExtends'] = pd.to_numeric(df['countExtends'])
df['countFor'] = pd.to_numeric(df['countFor'])
df['countFunctions'] = pd.to_numeric(df['countFunctions'])
df['countIfs'] = pd.to_numeric(df['countIfs'])
df['countImport'] = pd.to_numeric(df['countImport'])
df['countInterface'] = pd.to_numeric(df['countInterface'])
df['countLet'] = pd.to_numeric(df['countLet'])
df['countNew'] = pd.to_numeric(df['countNew'])
df['countPrivate'] = pd.to_numeric(df['countPrivate'])
df['countProtected'] = pd.to_numeric(df['countProtected'])
df['countPublic'] = pd.to_numeric(df['countPublic'])
df['countReturn'] = pd.to_numeric(df['countReturn'])
df['countStatic'] = pd.to_numeric(df['countStatic'])
df['countThis'] = pd.to_numeric(df['countThis'])
df['countThrow'] = pd.to_numeric(df['countThrow'])
df['countVar'] = pd.to_numeric(df['countVar'])
# PHP Features (29,8 of CodeBase is PHP)
df['countAbstract'] = pd.to_numeric(df['countAbstract'])
df['countGoto'] = pd.to_numeric(df['countGoto'])
df['countInclude'] = pd.to_numeric(df['countInclude'])
df['countPrint'] = pd.to_numeric(df['countPrint'])
df['countRequire'] = pd.to_numeric(df['countRequire'])
df['countYield'] = pd.to_numeric(df['countYield'])

In [254]:
x = 0
for x in range(len(df.index)):
#for x in range(20):
    try:
        path = df['path'][x]
        tagIndex = df['tagIndex'][x]
        tagIndex = tagIndex -1
        test = df.loc[(df['path'] == path) & (df['tagIndex'] == tagIndex) & (df['isBugfix'] == 1)]
        if test.empty:
            pass
        else:
            df.at[x, 'isBuggy'] =1
            del test
    except:
        pass

In [255]:
df.describe(include='all')

Unnamed: 0,commit,author,date,message,diffLength,isBugfix,isBuggy,path,pathLength,pathType,...,countStatic,countThis,countThrow,countVar,countAbstract,countGoto,countInclude,countPrint,countRequire,countYield
count,8798,8798,8798,8798,8798.0,8725.0,8798.0,8798,6568.0,8798,...,6495.0,6495.0,6495.0,6495.0,6495.0,6495.0,6495.0,6495.0,6495.0,6495.0
unique,6568,14,6567,6038,3850.0,,,1830,1.0,46,...,,,,,,,,,,
top,3e675de750f61456d7049b74030b392c72b6810e,Daniel Borchers <daniel.borchers@fsz.de>,Thu Oct 8 16:56:35 2020 +0200,ENHANCEMENT - removed old svn header,,,,.gitattributes,,.js,...,,,,,,,,,,
freq,205,5022,205,206,73.0,,,795,6568.0,3561,...,,,,,,,,,,
mean,,,,,,0.372264,0.010002,,,,...,0.018322,3.187375,0.04157,0.618168,0.014011,0.001232,0.054503,0.029099,0.175982,0.002309
std,,,,,,0.483436,0.099516,,,,...,0.346752,49.219884,1.381093,12.373269,0.617939,0.072347,1.113148,0.682337,2.32744,0.107442
min,,,,,,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,,,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,,,,0.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,,,,1.0,0.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [256]:
df['isBuggy'].sum()

88

## write Dataframe to CSV file

In [262]:
new = df
new.pop('pathLength')
new.fillna(0, inplace=True)
new.to_csv('df.csv', index=False)