In [8]:
import csv
import pandas as pd
from scipy.stats import pearsonr
import os.path
import matplotlib.pyplot as plt
# Text encoding handed to open() when the analysis CSV files are read.
encoding = "iso-8859-1"

In [9]:
def get_loc(java_classes, project):
    """Load lines-of-code figures from ``<project>/loc_analysis.csv``.

    The first row of the file is treated as a header and skipped. For every
    remaining row, column 0 is the class path (backslashes are normalised to
    forward slashes) and column 1 is the LOC value, which is stored as the
    string read from the file.

    Args:
        java_classes: dict keyed by class path; it is updated in place. Any
            existing entry for a class found in the file is replaced by
            ``{'loc': <value>}``.
        project: directory containing ``loc_analysis.csv``.

    Returns:
        A tuple ``(java_classes, ["loc"])`` - the updated dict and the list of
        column names this loader contributes.
    """
    with open(project+'/loc_analysis.csv', mode ='r', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header
        for row in reader:
            # Both the class path and the LOC column are needed; blank or
            # truncated rows are ignored instead of raising IndexError.
            if len(row) < 2:
                continue
            java_class = row[0].replace('\\', '/')
            java_classes[java_class] = {'loc': row[1]}
    return java_classes, ["loc",]

In [10]:
def get_comments(java_classes, project, allow_new=True):
    """Merge the columns of ``<project>/comments_in_code.csv`` into ``java_classes``.

    The first row of the file is the header: column 0 identifies the class and
    every further header cell names a value column. For each data row the
    class path is normalised (backslashes become forward slashes) and each
    value is stored, as the string read from the file, under its header name.

    Args:
        java_classes: dict keyed by class path; updated in place.
        project: directory containing ``comments_in_code.csv``.
        allow_new: when true, a class that is not yet in ``java_classes`` is
            added with ``{'loc': "null"}`` before its values are stored; when
            false, rows for unknown classes are ignored.

    Returns:
        A tuple ``(java_classes, columns)`` where ``columns`` is the header
        without its first cell (an empty list if the file has no header row).
    """
    with open(project+'/comments_in_code.csv', mode ='r', encoding=encoding) as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to merge and no columns to report.
            return java_classes, []
        columns = header[1:]
        for row in reader:
            if len(row) < 2:
                continue
            java_class = row[0].replace('\\', '/')
            if '/' not in java_class:
                print( "may not be a class: ", java_class)
            if java_class not in java_classes:
                if not allow_new:
                    continue
                java_classes[java_class] = {'loc': "null"}
            # zip() stops at the shorter sequence, so a row with fewer cells
            # than the header fills only the columns it has instead of
            # raising IndexError; surplus cells are ignored.
            java_classes[java_class].update(zip(columns, row[1:]))
    return java_classes, columns

In [42]:
def get_pmd_violations(java_classes, project, allow_new=False):
    """Count PMD violations per class from ``<project>/pmd_analysis.csv``.

    The first row of the file is skipped as a header. In every other row,
    column 0 is the file location and column 1 the violation type; rows with
    fewer than three cells are ignored. The location is normalised by turning
    backslashes into forward slashes, removing the ``"C:/dev/" + project``
    prefix and cutting everything from the first ``:`` onwards (the trailing
    line information). Each violation increments the counter stored under
    ``'pmd_' + <type>`` in the class's dict.

    Args:
        java_classes: dict keyed by class path; updated in place.
        project: directory containing ``pmd_analysis.csv``.
        allow_new: when true, unknown classes are added with an empty dict;
            when false, their violations are skipped and the class is
            reported on stdout.

    Returns:
        A tuple ``(java_classes, pmd_types)`` where ``pmd_types`` lists the
        ``'pmd_...'`` column names in order of first appearance among the
        rows that were counted.
    """
    pmd_types = []
    known_types = set()       # O(1) membership check alongside the ordered list
    reported_classes = set()  # unknown classes that were already printed
    with open(project+'/pmd_analysis.csv', mode ='r', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header
        for row in reader:
            if len(row) < 3:
                continue
            java_class = row[0].replace('\\', '/')
            java_class = java_class.replace("C:/dev/"+project, '')
            column = 'pmd_' + row[1]
            colon = java_class.find(':')
            if colon > 0:
                # Drop the ":<line>..." suffix so all violations of one file
                # share the same key.
                java_class = java_class[:colon]
            if java_class not in java_classes:
                if not allow_new:
                    # Report each unknown class once. One message pair per
                    # violation row produces enough output to hit the
                    # notebook's IOPub data rate limit.
                    if java_class not in reported_classes:
                        reported_classes.add(java_class)
                        print("skipping class: ", row[0])
                        print("also known as: ", java_class)
                    continue
                java_classes[java_class] = {}
            counters = java_classes[java_class]
            counters[column] = counters.get(column, 0) + 1
            if column not in known_types:
                known_types.add(column)
                pmd_types.append(column)
    return java_classes, pmd_types

In [43]:
def get_sonar_violations(java_classes, project, allow_new=False):
    """Count Sonar violations per class from ``<project>/sonar_analysis.csv``.

    If the file does not exist, ``java_classes`` is returned untouched together
    with an empty rule list. Otherwise the first row is skipped as a header;
    in every other row column 0 is the class path (backslashes are normalised
    to forward slashes) and column 1 the rule. Rows with fewer than three
    cells are ignored. Each violation increments the counter stored under
    ``'sonar_' + <rule>`` in the class's dict.

    Args:
        java_classes: dict keyed by class path; updated in place.
        project: directory that may contain ``sonar_analysis.csv``.
        allow_new: when true, unknown classes are added with an empty dict
            (same meaning as in ``get_pmd_violations``); when false, their
            rows are skipped and, unless the path contains ``'test'``, the
            class is printed.

    Returns:
        A tuple ``(java_classes, sonar_rules)`` where ``sonar_rules`` lists the
        ``'sonar_...'`` column names in order of first appearance among the
        rows that were counted.
    """
    sonar_rules = []
    csv_path = project+'/sonar_analysis.csv'
    if not os.path.isfile(csv_path):
        return java_classes, sonar_rules
    known_rules = set()  # O(1) membership check alongside the ordered list
    with open(csv_path, mode ='r', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header
        for row in reader:
            if len(row) < 3:
                continue
            java_class = row[0].replace('\\', '/')
            column = 'sonar_' + row[1]
            if java_class not in java_classes:
                if allow_new:
                    java_classes[java_class] = {}
                else:
                    if 'test' not in java_class:
                        print("error here: ", java_class)
                    continue
            counters = java_classes[java_class]
            counters[column] = counters.get(column, 0) + 1
            if column not in known_rules:
                known_rules.add(column)
                sonar_rules.append(column)
    return java_classes, sonar_rules

In [45]:
all_projects = ['eclipse.platform', 'guava', 'guice', 'hadoop', 'spark', 'flow']
data = {}
for project in all_projects:
    print("project: ", project)

    # Collect the per-class values step by step; every loader returns the
    # (same) dict plus the names of the columns it contributed.
    java_classes, locs = get_loc({}, project)
    java_classes, comments = get_comments(java_classes, project, allow_new=False)
    java_classes, pmd_types = get_pmd_violations(java_classes, project, allow_new=False)
    java_classes, sonar_rules = get_sonar_violations(java_classes, project, allow_new=False)

    header = ["class"] + locs + comments + pmd_types + sonar_rules
    # Columns that must be present for a class to be written at all; a
    # missing violation column is written as 0 instead.
    mandatory_columns = locs + comments

    # NOTE(review): no encoding is given here, so the platform default is used
    # for data.csv while the input files are read with `encoding` - confirm
    # this is intended.
    with open(project + '/data.csv', mode='w', newline='') as output_file:
        writer = csv.writer(
            output_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writerow(header)
        for java_class, metrics in java_classes.items():
            row = [java_class]
            skip = False
            for item in header:
                if item == "class":
                    continue
                if item not in metrics:
                    if item in mandatory_columns:
                        print("class ", java_class, " is not in comments")
                        skip = True
                    else:
                        row.append(0)
                elif str(metrics[item]).lower() == 'null':
                    # A literal "null" value disqualifies the whole class.
                    skip = True
                else:
                    row.append(metrics[item])
            if not skip:
                writer.writerow(row)

project:  eclipse.platform
skipping class:  \ant\org.eclipse.ant.core\src_ant\org\eclipse\ant\internal\core\ant\InternalAntRunner.java:17:
also known as:  /ant/org.eclipse.ant.core/src_ant/org/eclipse/ant/internal/core/ant/InternalAntRunner.java
skipping class:  \ant\org.eclipse.ant.launching\remote\org\eclipse\ant\internal\launching\remote\InternalAntRunner.java:15:
also known as:  /ant/org.eclipse.ant.launching/remote/org/eclipse/ant/internal/launching/remote/InternalAntRunner.java
skipping class:  \ant\org.eclipse.ant.tests.core\tests\org\eclipse\ant\tests\core\AbstractAntTest.java:14:
also known as:  /ant/org.eclipse.ant.tests.core/tests/org/eclipse/ant/tests/core/AbstractAntTest.java
skipping class:  \debug\org.eclipse.debug.tests\src\org\eclipse\debug\tests\AutomatedSuite.java:16:
also known as:  /debug/org.eclipse.debug.tests/src/org/eclipse/debug/tests/AutomatedSuite.java
skipping class:  \debug\org.eclipse.debug.tests\src\org\eclipse\debug\tests\console\IOConsoleTests.java:14:

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



project:  spark
skipping class:  \common\network-common\src\test\java\org\apache\spark\network\RpcIntegrationSuite.java:18:
also known as:  /common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java
skipping class:  \common\network-common\src\test\java\org\apache\spark\network\sasl\SparkSaslSuite.java:18:
also known as:  /common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java
skipping class:  \common\network-shuffle\src\test\java\org\apache\spark\network\shuffle\AppIsolationSuite.java:18:
also known as:  /common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/AppIsolationSuite.java
skipping class:  \common\network-shuffle\src\test\java\org\apache\spark\network\shuffle\ExternalBlockHandlerSuite.java:18:
also known as:  /common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalBlockHandlerSuite.java
skipping class:  \common\network-shuffle\src\test\java\org\apache\spark\network\shuffle\RemoteBl