In [1]:
import pandas as pd
import json
import requests
import matplotlib.pylab as plt
import numpy as np

from itertools import groupby
from datetime import datetime as dt

In [2]:
# Star-imports numpy and matplotlib.pylab names into the interactive namespace
# (see the "Populating the interactive namespace" output below) and selects the
# inline figure backend. Any cell that calls a pylab name unqualified
# (e.g. `xlim`, `ylim`) only works after this line has run.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Classes

In [3]:
# Addresses of the machines that have hosted the services used by this notebook.
HETZNER_IP = '176.9.137.234'
UOM_PC_IP = '195.251.210.136'
UOM_DB_IP = '195.251.210.166'
RUG_IP = '129.125.5.170'
MACBOOK_IP = '145.97.138.131'
# Default SonarQube host for GitHubProject.
SONARQUBE_IP = HETZNER_IP

# Ports are kept as strings because they are concatenated straight into URLs.
SONARQUBE_PORT = '9000'
REST_API_PORT = '8080'

# NOTE: COMMIT_STATS_API_URL used to be assigned twice. The first value,
# '/GitHubCommitStatsRest/webresources/commitstatsfiles/repo/', was overwritten
# before anything read it, so that dead assignment has been removed.
COMMIT_STATS_API_CACHE_URL = '/GitHubCommitStatsCache/webresources/commitsstats/json/'
COMMIT_STATS_FILES_API_URL = '/api/commitStatsFiles'
COMMIT_STATS_API_URL = '/api/commitStats'
# Per-repository variants; the numeric repository id is appended by GitHubProject.
COMMIT_STATS_FILES_PER_REPO_API_URL = COMMIT_STATS_FILES_API_URL + '/repo/'
COMMIT_STATS_PER_REPO_API_URL = COMMIT_STATS_API_URL + '/repo/'
REST_URL = COMMIT_STATS_FILES_PER_REPO_API_URL

PROTOCOL = 'http://'

class GitHubProject:
    """A GitHub repository plus the URLs used to query data about it.

    Attributes set by the constructor:
        repo_id, owner, repo, language: stored exactly as passed in.
        sonarqube_ip, rest_api_ip: hosts of the SonarQube server and of the
            commit-statistics REST API for this project.
        sonarqube_project_key: ``owner:repo``.
        rest_key: ``owner/repo``.
        timemachine_metrics_url: SonarQube timemachine URL for the project; it
            ends with ``&metrics=`` (presumably the caller appends metric keys).
        commit_stats_url: per-repository commit statistics endpoint.
        commit_stats_files_url: per-repository per-file statistics endpoint.
    """

    def __init__(self, repo_id, owner, repo, language, sonarqube_ip=SONARQUBE_IP, rest_api_ip = UOM_DB_IP):
        self.repo_id = repo_id
        self.owner = owner
        self.repo = repo
        self.language = language
        self.sonarqube_ip = sonarqube_ip
        self.rest_api_ip = rest_api_ip

        # Identifiers: SonarQube separates owner and repo with ':', the REST key with '/'.
        self.sonarqube_project_key = ':'.join((owner, repo))
        self.rest_key = '/'.join((owner, repo))

        sonarqube_base = PROTOCOL + sonarqube_ip + ':' + SONARQUBE_PORT
        self.timemachine_metrics_url = ''.join([
            sonarqube_base,
            '/api/',
            'timemachine?',
            'resource=',
            self.sonarqube_project_key,
            '&metrics=',
        ])

        # Both REST endpoints live on the same host/port and end with the repo id.
        rest_base = PROTOCOL + rest_api_ip + ':' + REST_API_PORT
        repo_id_text = str(repo_id)
        self.commit_stats_url = rest_base + COMMIT_STATS_PER_REPO_API_URL + repo_id_text
        self.commit_stats_files_url = rest_base + COMMIT_STATS_FILES_PER_REPO_API_URL + repo_id_text

In [4]:
class CommitStatsFile:
    """Change statistics for a single file record.

    ``additions``, ``deletions`` and ``changes`` are coerced to ``float``;
    ``fileName``, ``status`` and ``commitDate`` are stored as given.
    """

    def __init__(self, fileName, additions, deletions, changes, status, commitDate):
        self.fileName = fileName
        # Numeric counters are normalised to float so they can be summed directly.
        self.additions, self.deletions, self.changes = (
            float(value) for value in (additions, deletions, changes)
        )
        self.status = status
        self.commitDate = commitDate

    def _field_values(self):
        """Return the attributes in the order both text templates expect."""
        return (
            self.fileName,
            self.additions,
            self.deletions,
            self.changes,
            self.status,
            self.commitDate,
        )

    def __repr__(self):
        template = "<CommitStatsFile fileName:%s additions:%s deletions:%s changes:%s status:%s commitDate:%s>"
        return template % self._field_values()

    def __str__(self):
        template = "From str method of CommitStatsFile: fileName:%s additions:%s deletions:%s changes:%s status:%s commitDate:%s>"
        return template % self._field_values()

In [5]:
class CommitStats:
    """Line-change totals for a single commit.

    ``additions``, ``deletions`` and ``total`` are coerced to ``float``;
    ``id`` and ``commitDate`` are stored as given.
    """

    def __init__(self, id, additions, deletions, total, commitDate):
        self.id = id
        # Numeric counters are normalised to float so they can be summed directly.
        self.additions, self.deletions, self.total = (
            float(value) for value in (additions, deletions, total)
        )
        self.commitDate = commitDate

    def _field_values(self):
        """Return the attributes in the order both text templates expect."""
        return (self.id, self.additions, self.deletions, self.total, self.commitDate)

    def __repr__(self):
        template = "<CommitStats id:%s additions:%s deletions:%s total:%s commitDate:%s>"
        return template % self._field_values()

    def __str__(self):
        template = "From str method of CommitStats: id:%s additions:%s deletions:%s total:%s commitDate:%s>"
        return template % self._field_values()

# General functions

In [6]:
def get_rest_response(rest_url, timeout=30):
    """Fetch ``rest_url`` with an HTTP GET and return the decoded JSON body.

    Args:
        rest_url: Full URL of the REST endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload (whatever ``json.loads`` produces for the body).

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (``HTTPError``).
        ValueError: if the response body is not valid JSON.
    """
    response = requests.get(rest_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.text)

def jsonDate_to_date(date):
    """Convert an epoch timestamp in milliseconds to a naive local ``datetime``."""
    epoch_seconds = date / 1e3  # milliseconds -> seconds
    return dt.fromtimestamp(epoch_seconds)

def get_date(date):
    """Return the calendar day of ``date`` as a ``datetime.date`` (time of day dropped)."""
    timestamp = pd.Timestamp(date)
    return timestamp.date()

# Repositories under analysis (GitHub projects and their SonarQube hosts)

In [7]:
def get_repos():
    """Return the GitHub projects currently included in the analysis.

    Returns:
        list of GitHubProject, in a fixed order: first the Apache projects that
        use the default SonarQube host, then the projects analysed on the
        secondary SonarQube host.

    A long block of commented-out candidate repositories (many of them
    duplicates of the active entries) used to live here; it was dead code and
    has been removed. Add a tuple to one of the tables below to include
    another repository.
    """
    # SonarQube host used for the non-Apache projects.
    secondary_sonarqube_ip = '145.97.154.22'

    # (repo_id, owner, repo, language), analysed on the default SonarQube host.
    default_host_projects = [
        (160994, 'apache', 'sling', 'Java'),
        (160999, 'apache', 'zookeeper', 'Java'),
        (205414, 'apache', 'felix', 'Java'),
        # (206317, 'apache', 'camel', 'Java'),  # disabled
        (206338, 'apache', 'directory-shared', 'Java'),
        (206350, 'apache', 'cayenne', 'Java'),
        (206368, 'apache', 'poi', 'Java'),
        (206378, 'apache', 'commons-lang', 'Java'),
        (206403, 'apache', 'jackrabbit', 'Java'),
        (206424, 'apache', 'cassandra', 'Java'),
        (206483, 'apache', 'maven', 'Java'),
        (318103, 'apache', 'pdfbox', 'Java'),
    ]

    # (repo_id, owner, repo, language), analysed on the secondary SonarQube host.
    secondary_host_projects = [
        (892275, 'square', 'retrofit', 'Java'),
        (1553754, 'eclipse', 'orion.server', 'Java'),
        (1936771, 'google', 'truth', 'Java'),
        (5152285, 'square', 'okhttp', 'Java'),
        (7508411, 'ReactiveX', 'RxJava', 'Java'),
        (10230369, 'google', 'auto', 'Java'),
    ]

    # Disabled projects that were analysed on other SonarQube hosts:
    #   sonarqube_ip='129.125.5.41':  (2740148, 'apache', 'opennlp', 'Java'),
    #                                 (32137005, 'apache', 'openmeetings', 'Java')
    #   sonarqube_ip='129.125.5.172': (55334027, 'apache', 'tomcat85', 'Java'),
    #                                 (60115659, 'eclipse', 'xtext-eclipse', 'Java')

    repos = [GitHubProject(*project) for project in default_host_projects]
    repos.extend(
        GitHubProject(*project, sonarqube_ip=secondary_sonarqube_ip)
        for project in secondary_host_projects
    )
    return repos

# Functions for getting CommitFileStats

In [8]:
def _source_file_suffix(language):
    """Return the source-file extension (with leading dot) for a language name.

    Languages without a known mapping fall back to ``'.' + language``, which is
    what was previously used for every language (fine for 'Java', but it
    produced '.python' / '.javascript' for Python and JavaScript projects).
    """
    known_suffixes = {
        'java': '.java',
        'javascript': '.js',
        'python': '.py',
        'scala': '.scala',
        'go': '.go',
    }
    return known_suffixes.get(language.lower(), '.' + language)


def get_commitfiles_stats_DataFrame(ghp):
    """Build a per-day table of file-level change statistics for ``ghp``.

    Args:
        ghp: A GitHubProject; its ``commit_stats_files_url`` is queried and its
            ``language`` selects which files count as source files.

    Returns:
        pandas.DataFrame with one row per calendar day between the earliest and
        latest commit date (daily frequency) and float columns:

        * ``files_added``   - number of source files with status 'added'
        * ``additions`` / ``deletions`` / ``changes`` - sums over source files
          with status 'modified'
        * ``modified``      - sum of ``changes`` over 'modified' source files
        * ``removed``       - sum of ``deletions`` over 'removed' source files
        * ``renamed``       - sum of ``changes`` over 'renamed' source files
        * ``no_of_commits`` - number of file records on that day, regardless
          of file type (note: this counts file records, not distinct commits)

        Days without any record are NaN in every column. An empty response
        yields an empty DataFrame with the same columns.
    """
    commit_files_stats_response = get_commit_files_stats_response(ghp)
    commitStatsFilesList = getCommitStatsFilesList(commit_files_stats_response)

    columns = ['files_added', 'additions', 'deletions', 'changes',
               'modified', 'removed', 'renamed', 'no_of_commits']
    if not commitStatsFilesList:
        return pd.DataFrame(index=pd.DatetimeIndex([]), columns=columns, dtype=float)

    suffix = _source_file_suffix(ghp.language)

    # Accumulate in a dict keyed by day so the result does not depend on the
    # records arriving sorted by date (itertools.groupby would silently reset
    # a day's totals if the same day showed up in two separate runs).
    daily_totals = {}
    for csf in commitStatsFilesList:
        # Use Timestamp keys: they match the DatetimeIndex exactly, whereas
        # datetime.date keys are not reliably cast by newer pandas versions.
        day = pd.Timestamp(get_date(csf.commitDate))
        if day not in daily_totals:
            daily_totals[day] = dict.fromkeys(columns, 0.0)
        totals = daily_totals[day]

        totals['no_of_commits'] += 1
        if not endswith(csf.fileName, suffix):
            continue

        if csf.status == 'added':
            totals['files_added'] += 1
        elif csf.status == 'modified':
            totals['modified'] += csf.changes
            totals['additions'] += csf.additions
            totals['deletions'] += csf.deletions
            totals['changes'] += csf.changes
        elif csf.status == 'removed':
            totals['removed'] += csf.deletions
        elif csf.status == 'renamed':
            totals['renamed'] += csf.changes

    days = sorted(daily_totals)
    date_range_index = get_date_range_index(days[0], days[-1])
    daily = pd.DataFrame(
        [daily_totals[day] for day in days],
        index=pd.DatetimeIndex(days),
        columns=columns,
        dtype=float,
    )
    # Reindexing inserts NaN rows for the days on which nothing was recorded.
    return daily.reindex(date_range_index)

def get_commit_files_stats_response(ghp):
    """Fetch the raw per-file commit statistics payload for ``ghp``."""
    endpoint = ghp.commit_stats_files_url
    return get_rest_response(endpoint)

def getCommitStatsFilesList(commit_files_stats):
    """Turn the decoded JSON records into ``CommitStatsFile`` objects.

    Each record is read with ``.get``, so a missing key is passed on as
    ``None``. ``commitDate`` is converted with ``jsonDate_to_date``.
    Input order is preserved.
    """
    return [
        CommitStatsFile(
            fileName=record.get('fileName'),
            additions=record.get('additions'),
            deletions=record.get('deletions'),
            changes=record.get('changes'),
            status=record.get('status'),
            commitDate=jsonDate_to_date(record.get('commitDate')),
        )
        for record in commit_files_stats
    ]

def get_min_date(commitStatsFiles):
    """Return the commit day of the FIRST record in ``commitStatsFiles``.

    NOTE(review): this is only the minimum date if the records are ordered
    oldest-first - confirm against the REST API.
    """
    first_record = commitStatsFiles[0]
    commit_datetime = jsonDate_to_date(first_record.get('commitDate'))
    return get_date(commit_datetime)

def get_max_date(commitStatsFiles):
    """Return the commit day of the LAST record in ``commitStatsFiles``.

    NOTE(review): this is only the maximum date if the records are ordered
    oldest-first - confirm against the REST API.
    """
    last_record = commitStatsFiles[-1]
    commit_datetime = jsonDate_to_date(last_record.get('commitDate'))
    return get_date(commit_datetime)

def get_date_range_index(min_project_date, max_project_date):
    """Return a daily ``DatetimeIndex`` covering both endpoints inclusively."""
    daily_index = pd.date_range(min_project_date, max_project_date, freq='D')
    return daily_index

def endswith(file_name, suffix):
    """Case-insensitive ``str.endswith``: True if ``file_name`` ends with ``suffix``."""
    lowered_name = file_name.lower()
    lowered_suffix = suffix.lower()
    return lowered_name.endswith(lowered_suffix)

# Functions for getting CommitStats

In [9]:
def get_commit_stats_DataFrame(ghp):
    """Return the number of commits per calendar day for ``ghp``.

    Despite the name, the result is a pandas **Series**, not a DataFrame.

    Args:
        ghp: A GitHubProject; its ``commit_stats_url`` is queried.

    Returns:
        pandas.Series of floats indexed by every day between the earliest and
        latest commit date (daily frequency). Days without commits are NaN.
        An empty response yields an empty float Series.
    """
    commit_stats_response = get_commit_stats_response(ghp)
    commitStatsList = getCommitList(commit_stats_response)

    if not commitStatsList:
        return pd.Series(index=pd.DatetimeIndex([]), dtype=float)

    # Count with a dict keyed by day so the result does not depend on the
    # commits arriving sorted by date (itertools.groupby would overwrite a
    # day's count if that day appeared in two separate runs). Timestamp keys
    # match the DatetimeIndex exactly; datetime.date keys are not reliably
    # cast by newer pandas versions.
    commits_per_day = {}
    for commit in commitStatsList:
        day = pd.Timestamp(get_date(commit.commitDate))
        commits_per_day[day] = commits_per_day.get(day, 0) + 1

    date_range_index = get_date_range_index(min(commits_per_day), max(commits_per_day))
    # Explicit float dtype: the default dtype of an empty/NaN Series differs
    # between pandas versions.
    series = pd.Series(commits_per_day, dtype=float)
    return series.reindex(date_range_index)

def get_commit_stats_response(ghp):
    """Fetch the raw per-commit statistics payload for ``ghp``."""
    endpoint = ghp.commit_stats_url
    return get_rest_response(endpoint)

def getCommitList(commit_stats):
    """Turn the decoded JSON records into ``CommitStats`` objects.

    Each record is read with ``.get``, so a missing key is passed on as
    ``None``. ``commitDate`` is converted with ``jsonDate_to_date``.
    Input order is preserved.
    """
    return [
        CommitStats(
            id=record.get('id'),
            additions=record.get('additions'),
            deletions=record.get('deletions'),
            total=record.get('total'),
            commitDate=jsonDate_to_date(record.get('commitDate')),
        )
        for record in commit_stats
    ]

# Plot functions

In [10]:
def plot(plt):
    """Call ``plt.plot`` with a fixed 15x10 figure size.

    The parameter shadows the module-level ``plt`` alias inside this function,
    so whatever object is passed in is the one that gets plotted.
    """
    figure_size = (15, 10)
    plt.plot(figsize=figure_size)

def do_commit_analysis(df):
    """Plot smoothed commit activity and a derived effort indicator.

    Args:
        df: DataFrame with at least the columns ``files_added`` and
            ``changes``. It is NOT modified; missing values are treated as 0
            on an internal copy.

    Produces four figures:
        1-3. ``files_added`` and ``changes`` after a 30-row rolling mean, then
             that result after a further 90-row rolling mean and 90-row
             rolling sum, each drawn with ``draw_secondary_log_axis``.
        4.   ``changes / (files_added + 1)`` computed from the 90-row mean,
             together with its own 365-row rolling mean.
    """
    # Fill on a copy: an in-place fillna would silently alter the caller's frame.
    filled = df.fillna(0)

    activity = filled[['files_added', 'changes']]
    mean_30 = activity.rolling(window=30, center=False).mean()
    mean_30_then_90 = mean_30.rolling(window=90, center=False).mean()
    sum_90_of_mean_30 = mean_30.rolling(window=90, center=False).sum()

    for smoothed in (mean_30, mean_30_then_90, sum_90_of_mean_30):
        draw_secondary_log_axis(smoothed, 'changes')

    # +1 in the denominator avoids division by zero when no files were added.
    effort = mean_30_then_90['changes'] / (mean_30_then_90['files_added'] + 1)

    effort.plot(figsize=(15, 10))
    effort.rolling(window=365, center=False).mean().plot()
    plt.show()
    
def draw_secondary_log_axis(df, secondary_column):
    """Plot ``df`` with ``secondary_column`` on a log-scaled secondary y-axis.

    Args:
        df: DataFrame to plot (one line per column).
        secondary_column: Column (or columns) to draw against the right-hand
            y-axis.

    The previous implementation called ``ax.twinx()``, which creates a third,
    empty axes and log-scaled that one, leaving the axis that actually carries
    ``secondary_column`` linear. The log scale is now applied to the real
    secondary axis.
    """
    ax = df.plot(secondary_y=secondary_column, figsize=(15, 10))

    # pandas exposes the secondary axes as `right_ax` on the returned primary
    # axes. If every column is secondary, the returned axes IS the secondary
    # one (it then carries `left_ax` instead).
    secondary_ax = getattr(ax, 'right_ax', None)
    if secondary_ax is None and hasattr(ax, 'left_ax'):
        secondary_ax = ax
    if secondary_ax is not None:
        secondary_ax.set_yscale('log')

    plt.show()

In [11]:
def plotHists(series, step):
    """Draw one histogram per consecutive window of ``step`` values.

    Args:
        series: Sequence of numeric values (e.g. commits per day). Entries
            that are NaN or negative are ignored.
        step: Window length in number of entries; the last window may be
            shorter.

    Each window gets its own figure with 100 bins, both axes limited to
    0-100, and a dashed red vertical line at the window's median.
    """
    no_of_days = len(series)
    for start in range(0, no_of_days, step):
        window = series[start:start + step]
        # `NaN >= 0` is False, so days without data are dropped here too.
        values = [each for each in window if each >= 0]

        plt.figure()
        plt.hist(values, bins=100)
        # Use the explicit `plt.` functions rather than the bare `ylim`/`xlim`
        # names, which only exist after `%pylab` has star-imported them.
        plt.ylim(0, 100)
        plt.xlim(0, 100)
        plt.axvline(pd.Series(values, dtype=float).median(), color='r', linestyle='dashed')




In [12]:
# Distribution of commits per day for apache/sling, one histogram per
# 365-day window. Needs the commit-statistics REST API to be reachable.
# Variations tried earlier: weekly windows (step=7), and looping over
# get_repos() to draw the same histograms for every project.
ghp = GitHubProject(repo_id=160994, owner='apache', repo='sling', language='Java')
commit_stats = get_commit_stats_DataFrame(ghp)
plotHists(commit_stats, step=365)

ConnectionError: HTTPConnectionPool(host='195.251.210.166', port=8080): Max retries exceeded with url: /api/commitStats/repo/160994 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x1132bb630>: Failed to establish a new connection: [Errno 61] Connection refused',))

In [None]:
# Toy sample for checking the histogram + median-line rendering used in plotHists.
x=[1,2,2,2,2,22,5,3,4]
plt.hist(x,bins=100)
# Explicit `plt.` calls: the bare `ylim`/`xlim` names only exist after `%pylab`.
plt.ylim(0,50)
plt.xlim(0,100)

plt.axvline(pd.Series(x).median(), color='r', linestyle='dashed')


In [None]:
# Median of the toy sample `x` from the previous cell (the value marked by the
# dashed line in its histogram).
x_values = pd.Series(x)
x_values.median()