In [1]:
import os
import csv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
from statistics import mean, mode, median
from matplotlib import cm, colors
from distfit import distfit
from inflection import humanize, underscore
import json
from math import sqrt
import statistics


%matplotlib inline

In [2]:
def getData(metric:str, root_path:str):
    data = []
    for j in tqdm(range(len(os.listdir(root_path))), desc=metric):
        folder = os.listdir(root_path)[j]
        system_path = os.path.join(root_path, folder)
        metric_path = os.path.join(system_path, metric+'.csv')
        with open(metric_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            for line in csv_reader:
                if(line[-1] != 'NaN' and line[-1] != 'nan'):
                    if(float(line[-1]) >= 0 ):
                        data.append(float(line[-1]))
    data = np.sort(data)
    
    return data

In [3]:
def statistic_metric(data, porcentagem):
    if (len(data) == 0) :
        return [0,0,0,0,0,0,(0,0)]
    metric_dict = []
    avg = np.average(data)
    median = np.median(data)
    mode = statistics.mode(data)
    std = np.std(data)
    x = porcentagem
    k = sqrt(-1/(x - 1))
    limite_inf = avg - k*std
    limite_sup = avg + k*std
    
    percentile70 = np.percentile(data, 70)
    percentile90 = np.percentile(data, 90)
    
    metric_dict = [
        avg,
        std,
        median,
        mode,
        percentile70,
        percentile90,
        (limite_inf, limite_sup)
    ]
    
    return metric_dict

In [4]:
def aparar_arr(arr, porcentagem):
    index_inf = round(len(arr) * porcentagem) - 1
    index_sup = round(len(arr) * (1-porcentagem)) - 1
    return arr[index_inf:index_sup]

In [7]:
def plotHist(data, metric, path, title, ext):
    plt.hist(data, bins=np.unique(data), log=True)
    plt.title(title)
    plt.xlabel("Metric value")
    plt.ylabel("Frequency")
    plt.savefig(path+'/hist_'+metric+ ext+'.png')
    plt.close()

### Tabela com todos os dados

In [37]:
project_path = '../../Projetos/'
folder = os.listdir(project_path)
metrics = os.listdir(project_path + f'{folder[0]}/')
arr = []
for i in range(len(metrics)):
    metric = metrics[i].replace('.csv', '')
    data = getData(metric,project_path)
    values = [metric]
    values.extend(statistic_metric(data,0.6))
    arr.append(values)


abstractMethodsQty: 100%|██████████| 44/44 [00:21<00:00,  2.06it/s]
anonymousClassesQty: 100%|██████████| 44/44 [00:29<00:00,  1.50it/s]
assignmentsQty: 100%|██████████| 44/44 [00:26<00:00,  1.69it/s]
cbo: 100%|██████████| 44/44 [00:27<00:00,  1.61it/s]
comparisonsQty: 100%|██████████| 44/44 [01:46<00:00,  2.41s/it]
defaultFieldsQty: 100%|██████████| 44/44 [01:09<00:00,  1.59s/it]
defaultMethodsQty: 100%|██████████| 44/44 [01:23<00:00,  1.89s/it]
dit: 100%|██████████| 44/44 [00:45<00:00,  1.04s/it]
fanin: 100%|██████████| 44/44 [00:53<00:00,  1.21s/it]
fanout: 100%|██████████| 44/44 [02:33<00:00,  3.49s/it]
finalFieldsQty: 100%|██████████| 44/44 [02:19<00:00,  3.16s/it]
finalMethodsQty: 100%|██████████| 44/44 [00:56<00:00,  1.28s/it]
innerClassesQty: 100%|██████████| 44/44 [00:49<00:00,  1.12s/it]
lambdasQty: 100%|██████████| 44/44 [00:53<00:00,  1.21s/it]
lcc: 100%|██████████| 44/44 [01:14<00:00,  1.70s/it]
lcom: 100%|██████████| 44/44 [01:08<00:00,  1.55s/it]
loc: 100%|██████████| 44

In [38]:
df_data = pd.DataFrame(data=arr, columns=['metric', 'avg', 'std', 'median', 'mode', 'percentile70', 'percentile90', 'chebyshev'])
df_data

Unnamed: 0,metric,avg,std,median,mode,percentile70,percentile90,chebyshev
0,abstractMethodsQty,0.08069,0.835927,0.0,0.0,0.0,0.0,"(-1.2410270205896254, 1.402406021636085)"
1,anonymousClassesQty,0.189588,1.825292,0.0,0.0,0.0,0.0,"(-2.696452737186848, 3.0756281935594543)"
2,assignmentsQty,9.202378,32.831034,2.0,0.0,5.0,21.0,"(-42.708044272759885, 61.11279984972449)"
3,cbo,10.385757,32.535123,5.0,0.0,10.0,22.0,"(-41.05678970996809, 61.828304418537435)"
4,comparisonsQty,1.675236,9.543789,0.0,0.0,0.0,4.0,"(-13.414819790482754, 16.76529235829499)"
5,defaultFieldsQty,0.211576,3.070956,0.0,0.0,0.0,0.0,"(-4.644031348043464, 5.067183018226013)"
6,defaultMethodsQty,0.646554,3.844028,0.0,0.0,0.0,1.0,"(-5.431387731433958, 6.724496048295244)"
7,dit,1.72111,1.712809,1.0,1.0,2.0,3.0,"(-0.9870787026572885, 4.429299190008417)"
8,fanin,4.128229,28.923271,0.0,0.0,2.0,7.0,"(-41.6034787783144, 49.85993609771708)"
9,fanout,6.257529,12.837907,3.0,0.0,7.0,15.0,"(-14.040984150852532, 26.556041540019187)"


### Tabela com os dados > 1

In [45]:
project_path = '../../Projetos/'
folder = os.listdir(project_path)
metrics = os.listdir(project_path + f'{folder[0]}/')
arr = []
for i in range(len(metrics)):
    metric = metrics[i].replace('.csv', '')
    data = getData(metric,project_path)
    new_data = []
    for value in data:
        if value > 0:
            new_data.append(value)
    values = [metric]
    values.extend(statistic_metric(new_data,0.6))
    arr.append(values)

abstractMethodsQty: 100%|██████████| 44/44 [00:24<00:00,  1.79it/s]
anonymousClassesQty: 100%|██████████| 44/44 [00:24<00:00,  1.78it/s]
assignmentsQty: 100%|██████████| 44/44 [00:24<00:00,  1.79it/s]
cbo: 100%|██████████| 44/44 [00:39<00:00,  1.11it/s]
comparisonsQty: 100%|██████████| 44/44 [00:37<00:00,  1.18it/s]
defaultFieldsQty: 100%|██████████| 44/44 [01:05<00:00,  1.48s/it]
defaultMethodsQty: 100%|██████████| 44/44 [01:11<00:00,  1.64s/it]
dit: 100%|██████████| 44/44 [01:15<00:00,  1.72s/it]
fanin: 100%|██████████| 44/44 [00:44<00:00,  1.02s/it]
fanout: 100%|██████████| 44/44 [00:33<00:00,  1.33it/s]
finalFieldsQty: 100%|██████████| 44/44 [00:26<00:00,  1.66it/s]
finalMethodsQty: 100%|██████████| 44/44 [00:30<00:00,  1.45it/s]
innerClassesQty: 100%|██████████| 44/44 [00:37<00:00,  1.18it/s]
lambdasQty: 100%|██████████| 44/44 [00:35<00:00,  1.25it/s]
lcc: 100%|██████████| 44/44 [00:43<00:00,  1.02it/s]
lcom: 100%|██████████| 44/44 [01:05<00:00,  1.49s/it]
loc: 100%|██████████| 44

In [47]:
values

['wmc',
 10.923737434311935,
 50.50972142227971,
 4.0,
 1.0,
 8.0,
 23.0,
 (-68.93914440318974, 90.7866192718136)]

In [48]:
df_data1 = pd.DataFrame(data=arr, columns=['metric', 'avg', 'std', 'median', 'mode', 'percentile70', 'percentile90', 'chebyshev'])
df_data1

Unnamed: 0,metric,avg,std,median,mode,percentile70,percentile90,chebyshev
0,abstractMethodsQty,2.858761,4.100625,2.0,1.0,3.0,6.0,"(-3.6248962959034188, 9.342418732272405)"
1,anonymousClassesQty,2.79989,6.472626,1.0,1.0,2.0,5.0,"(-7.434230741729812, 13.034009964101944)"
2,assignmentsQty,14.583836,40.369864,5.0,1.0,10.0,32.0,"(-49.246524509976055, 78.41419587988234)"
3,cbo,13.605339,36.645273,8.0,3.0,13.0,26.0,"(-44.335925456360755, 71.54660294485531)"
4,comparisonsQty,6.285124,17.684837,2.0,1.0,5.0,13.0,"(-21.67705756469917, 34.24730652864769)"
5,defaultFieldsQty,2.9598,11.126354,2.0,1.0,3.0,5.0,"(-14.632510530038907, 20.552110855355508)"
6,defaultMethodsQty,3.215373,8.076226,1.0,1.0,2.0,6.0,"(-9.554261777741349, 15.985007497750438)"
7,dit,1.72111,1.712809,1.0,1.0,2.0,3.0,"(-0.9870787026572885, 4.429299190008417)"
8,fanin,9.256821,42.759247,3.0,1.0,5.0,15.0,"(-58.35148469117851, 76.86512604916871)"
9,fanout,8.523363,14.32401,6.0,2.0,9.0,18.0,"(-14.124884611846522, 31.171611046417635)"


### Tabela com dados aparados 5%

In [49]:
project_path = '../../Projetos/'
folder = os.listdir(project_path)
metrics = os.listdir(project_path + f'{folder[0]}/')
arr = []
for i in range(len(metrics)):
    metric = metrics[i].replace('.csv', '')
    data = getData(metric,project_path)
    new_data = aparar_arr(data, 0.05)
    values = [metric]
    values.extend(statistic_metric(new_data,0.6))
    arr.append(values)

abstractMethodsQty: 100%|██████████| 44/44 [00:42<00:00,  1.04it/s]
anonymousClassesQty: 100%|██████████| 44/44 [00:36<00:00,  1.22it/s]
assignmentsQty: 100%|██████████| 44/44 [01:22<00:00,  1.87s/it]
cbo: 100%|██████████| 44/44 [00:29<00:00,  1.50it/s]
comparisonsQty: 100%|██████████| 44/44 [00:25<00:00,  1.71it/s]
defaultFieldsQty: 100%|██████████| 44/44 [00:31<00:00,  1.39it/s]
defaultMethodsQty: 100%|██████████| 44/44 [00:50<00:00,  1.16s/it]
dit: 100%|██████████| 44/44 [02:12<00:00,  3.00s/it]
fanin: 100%|██████████| 44/44 [01:08<00:00,  1.56s/it]
fanout: 100%|██████████| 44/44 [00:29<00:00,  1.51it/s]
finalFieldsQty: 100%|██████████| 44/44 [00:35<00:00,  1.25it/s]
finalMethodsQty: 100%|██████████| 44/44 [00:37<00:00,  1.18it/s]
innerClassesQty: 100%|██████████| 44/44 [01:02<00:00,  1.41s/it]
lambdasQty: 100%|██████████| 44/44 [00:40<00:00,  1.09it/s]
lcc: 100%|██████████| 44/44 [00:32<00:00,  1.36it/s]
lcom: 100%|██████████| 44/44 [00:37<00:00,  1.18it/s]
loc: 100%|██████████| 44

In [50]:
df_data5 = pd.DataFrame(data=arr, columns=['metric', 'avg', 'std', 'median', 'mode', 'percentile70', 'percentile90', 'chebyshev'])
df_data5

Unnamed: 0,metric,avg,std,median,mode,percentile70,percentile90,chebyshev
0,abstractMethodsQty,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0)"
1,anonymousClassesQty,0.019674,0.138878,0.0,0.0,0.0,0.0,"(-0.1999115712265472, 0.23926006935786268)"
2,assignmentsQty,4.883057,7.490827,2.0,0.0,5.0,15.0,"(-6.960979844951196, 16.727094365322163)"
3,cbo,7.218092,7.24975,5.0,0.0,9.0,18.0,"(-4.244769332183674, 18.680952601689512)"
4,comparisonsQty,0.604868,1.375443,0.0,0.0,0.0,2.0,"(-1.5698980887061436, 2.779633704449234)"
5,defaultFieldsQty,0.023864,0.152625,0.0,0.0,0.0,0.0,"(-0.21745711826223604, 0.2651847055828654)"
6,defaultMethodsQty,0.22768,0.56267,0.0,0.0,0.0,1.0,"(-0.661979574345274, 1.1173387770398708)"
7,dit,1.504213,0.842605,1.0,1.0,2.0,3.0,"(0.1719371396152478, 2.836488089933038)"
8,fanin,1.444226,2.475302,0.0,0.0,1.0,5.0,"(-2.4695699591554288, 5.358021027304739)"
9,fanout,4.945301,5.107149,3.0,0.0,7.0,13.0,"(-3.1298111717112027, 13.020412380588606)"


### Tabela com dados aparados 10%

In [51]:
project_path = '../../Projetos/'
folder = os.listdir(project_path)
metrics = os.listdir(project_path + f'{folder[0]}/')
arr = []
for i in range(len(metrics)):
    metric = metrics[i].replace('.csv', '')
    data = getData(metric,project_path)
    new_data = aparar_arr(data, 0.1)
    values = [metric]
    values.extend(statistic_metric(new_data,0.6))
    arr.append(values)

abstractMethodsQty: 100%|██████████| 44/44 [00:31<00:00,  1.38it/s]
anonymousClassesQty: 100%|██████████| 44/44 [00:26<00:00,  1.65it/s]
assignmentsQty: 100%|██████████| 44/44 [00:26<00:00,  1.66it/s]
cbo: 100%|██████████| 44/44 [00:25<00:00,  1.70it/s]
comparisonsQty: 100%|██████████| 44/44 [00:26<00:00,  1.64it/s]
defaultFieldsQty: 100%|██████████| 44/44 [00:21<00:00,  2.05it/s]
defaultMethodsQty: 100%|██████████| 44/44 [00:24<00:00,  1.78it/s]
dit: 100%|██████████| 44/44 [00:27<00:00,  1.57it/s]
fanin: 100%|██████████| 44/44 [00:27<00:00,  1.62it/s]
fanout: 100%|██████████| 44/44 [00:32<00:00,  1.37it/s]
finalFieldsQty: 100%|██████████| 44/44 [00:25<00:00,  1.74it/s]
finalMethodsQty: 100%|██████████| 44/44 [00:28<00:00,  1.56it/s]
innerClassesQty: 100%|██████████| 44/44 [00:18<00:00,  2.35it/s]
lambdasQty: 100%|██████████| 44/44 [00:21<00:00,  2.06it/s]
lcc: 100%|██████████| 44/44 [00:24<00:00,  1.77it/s]
lcom: 100%|██████████| 44/44 [00:30<00:00,  1.43it/s]
loc: 100%|██████████| 44

In [52]:
df_data10 = pd.DataFrame(data=arr, columns=['metric', 'avg', 'std', 'median', 'mode', 'percentile70', 'percentile90', 'chebyshev'])
df_data10

Unnamed: 0,metric,avg,std,median,mode,percentile70,percentile90,chebyshev
0,abstractMethodsQty,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0)"
1,anonymousClassesQty,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0)"
2,assignmentsQty,3.707516,4.862352,2.0,0.0,4.0,11.0,"(-3.980536617932502, 11.395569060185291)"
3,cbo,6.469338,5.632936,5.0,0.0,9.0,15.0,"(-2.437116423327888, 15.375792779414054)"
4,comparisonsQty,0.352603,0.785381,0.0,0.0,0.0,1.0,"(-0.8891932701438121, 1.5943996028715566)"
5,defaultFieldsQty,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0)"
6,defaultMethodsQty,0.126348,0.332241,0.0,0.0,0.0,1.0,"(-0.39897161941958137, 0.6516682194714889)"
7,dit,1.393418,0.660188,1.0,1.0,1.0,2.0,"(0.34956985950404085, 2.437266372003875)"
8,fanin,1.039792,1.565162,0.0,0.0,1.0,3.0,"(-1.4349462151456462, 3.514530954309934)"
9,fanout,4.422419,4.028875,3.0,0.0,6.0,11.0,"(-1.9477920112073024, 10.792629799943352)"


In [54]:
df_data.to_csv('./Statistic_Data_Complete.csv', sep=';')
df_data1.to_csv('./Statistic_Data_Without_0.csv', sep=';')
df_data5.to_csv('./Statistic_Data_Percentage5.csv', sep=';')
df_data10.to_csv('./Statistic_Data_Percentage10.csv', sep=';')



### Histogramas de NOC

In [8]:
project_path = '../../Projetos/'
data = getData('noc',project_path)
project_path = './Hist/'
plotHist(data, 'noc', project_path,'Histograma NOC dataset completo', '')

noc: 100%|██████████| 44/44 [00:12<00:00,  3.40it/s]


In [13]:
data_0 = []
for value in data:
    if value > 0:
        data_0.append(value)
plotHist(data_0, 'noc', project_path, 'Histograma NOC dataset > 0', '0')

In [14]:
data_5 = aparar_arr(data, 0.05)
plotHist(data_5, 'noc', project_path,'Histograma NOC dataset 5% aparado', '5')

In [15]:
data_10 = aparar_arr(data, 0.1)
plotHist(data_10, 'noc', project_path, 'Histograma NOC dataset 10% aparado', '10')

### Tabela com dados
Sem o valor 0, caso a métrica seja discreta e o percentil 70 tenha sido 0 também.

In [5]:
metrics_without_zero = ['abstractMethodsQty', 'anonymousClassesQty', 'comparisonsQty','defaultFieldsQty', 'defaultMethodsQty','finalMethodsQty', 'innerClassesQty', 'lambdasQty', 'logStatementsQty', 'loopQty', 'mathOperationsQty', 'noc', 'parenthesizedExpsQty', 'privateMethodsQty', 'protectedFieldsQty', 'protectedMethodsQty', 'publicFieldsQty', 'staticFieldsQty', 'staticMethodsQty', 'synchronizedFieldsQty', 'synchronizedMethodsQty', 'tryCatchQty']

In [3]:
def statistic_metric(data, porcentagem):
    if (len(data) == 0) :
        return [0,0,0,0,0,0,(0,0)]
    metric_dict = []
    avg = np.average(data)
    median = np.median(data)
    mode = statistics.mode(data)
    std = np.std(data)
    x = porcentagem
    k = sqrt(-1/(x - 1))
    limite_inf = avg - k*std
    limite_sup = avg + k*std
    
    percentile70 = np.percentile(data, 70)
    percentile90 = np.percentile(data, 90)
    
    metric_dict = [
        avg,
        std,
        median,
        mode,
        percentile70,
        percentile90
    ]
    
    return metric_dict

In [6]:
project_path = '../../Projetos/'
folder = os.listdir(project_path)
metrics = os.listdir(project_path + f'{folder[0]}/')
arr = []
for i in range(len(metrics)):
    metric = metrics[i].replace('.csv', '')
    data = getData(metric,project_path)
    if (metric in metrics_without_zero):
        new_data = []
        for value in data:
            if value > 0:
                new_data.append(value)
        data = new_data
    values = [metric]
    values.extend(statistic_metric(data,0.6))
    arr.append(values)

abstractMethodsQty: 100%|██████████| 44/44 [00:48<00:00,  1.10s/it]
anonymousClassesQty: 100%|██████████| 44/44 [00:38<00:00,  1.13it/s]
assignmentsQty: 100%|██████████| 44/44 [00:45<00:00,  1.03s/it]
cbo: 100%|██████████| 44/44 [01:36<00:00,  2.19s/it]
comparisonsQty: 100%|██████████| 44/44 [00:43<00:00,  1.01it/s]
defaultFieldsQty: 100%|██████████| 44/44 [00:38<00:00,  1.14it/s]
defaultMethodsQty: 100%|██████████| 44/44 [00:43<00:00,  1.01it/s]
dit: 100%|██████████| 44/44 [00:44<00:00,  1.02s/it]
fanin: 100%|██████████| 44/44 [00:36<00:00,  1.19it/s]
fanout: 100%|██████████| 44/44 [00:39<00:00,  1.12it/s]
finalFieldsQty: 100%|██████████| 44/44 [00:38<00:00,  1.15it/s]
finalMethodsQty: 100%|██████████| 44/44 [00:35<00:00,  1.24it/s]
innerClassesQty: 100%|██████████| 44/44 [00:35<00:00,  1.23it/s]
lambdasQty: 100%|██████████| 44/44 [00:39<00:00,  1.12it/s]
lcc: 100%|██████████| 44/44 [01:02<00:00,  1.42s/it]
lcom: 100%|██████████| 44/44 [01:08<00:00,  1.56s/it]
loc: 100%|██████████| 44

In [7]:
df = pd.DataFrame(data= arr, columns=['metric', 'avg', 'std', 'median', 'mode', 'percentile70', 'percentile90', 'chebyshev'])
df

Unnamed: 0,metric,avg,std,median,mode,percentile70,percentile90,chebyshev
0,abstractMethodsQty,2.858761,4.100625,2.0,1.0,3.0,6.0,
1,anonymousClassesQty,2.79989,6.472626,1.0,1.0,2.0,5.0,
2,assignmentsQty,9.202378,32.831034,2.0,0.0,5.0,21.0,
3,cbo,10.385757,32.535123,5.0,0.0,10.0,22.0,
4,comparisonsQty,6.285124,17.684837,2.0,1.0,5.0,13.0,
5,defaultFieldsQty,2.9598,11.126354,2.0,1.0,3.0,5.0,
6,defaultMethodsQty,3.215373,8.076226,1.0,1.0,2.0,6.0,
7,dit,1.72111,1.712809,1.0,1.0,2.0,3.0,
8,fanin,4.128229,28.923271,0.0,0.0,2.0,7.0,
9,fanout,6.257529,12.837907,3.0,0.0,7.0,15.0,


In [12]:
df.to_csv('./Statistic_Data_New_Approach.csv', sep=';', decimal=",")