# Analytics

#### Date: 2021/01

#### SUMMARY:

- This notebook presents the project's quality analysis for the date shown above.

### TEAM:

##### Semester: 2021/01
##### Professor: Hilmer Neri

##### Members:

- Érico Maximiano Bandeira
- Henrique Martins de Messias
- João Vitor Moura Rosa
- Max Henrique Barbosa
- Victor Rodrigues Silva
- Antonio Igor Carvalho
- Gabriel Santos Silva Araújo
- João Paulo Lima da Silva
- Lucas Vieira de Jesus

### LIBRARIES

In [None]:
# Deal with data
import pandas as pd
import numpy as np
import json
from glob import glob
import os

# Deal with API request
import urllib3
from urllib3 import request

# Deal with visualization
import seaborn as sns
import matplotlib.pyplot as plt

### GRAPH SETTINGS

In [None]:
# Render inline figures in the 'retina' figure format.
%config InlineBackend.figure_format ='retina'
# Scale every seaborn font by 1.5.
sns.set(font_scale=1.5)
# 'darkgrid' base style with a few overrides: visible tick marks on the
# bottom/left axes, dashed grid lines and white axes borders.
sns.set_style('darkgrid',
              {'xtick.bottom' : True,
               'ytick.left': True,
               'grid.linestyle':'--',
               'font.monospace': ['Computer Modern Typewriter'],
               'axes.edgecolor' : 'white'})

### DATAFRAME SETTINGS

In [None]:
# Remove the row/column display limits so dataframes are shown in full.
pd.set_option("display.max_rows", None, "display.max_columns", None)

### SonarCloud

##### Path to the folder with all your JSON files

In [None]:
# Every *.json file in the raw-data folder. The pattern also matches the
# "-test.json" companion files; the dataframe builders in this notebook skip
# those while iterating.
jsons = glob('../analytics-raw-data/*.json') # add the path here

In [None]:
def read_json(json_path):
    """Load and return the parsed contents of a JSON file.

    Parameters
    ----------
    json_path : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding, which is not UTF-8 everywhere (e.g. on Windows).
    with open(json_path, encoding="utf-8") as json_file:
        return json.load(json_file)

def create_base_component_df(json_list):
    """Build one dataframe with the base-component measures of every JSON file.

    For each path in *json_list* that does not end in ``-test.json`` the
    ``['baseComponent']['measures']`` list is loaded into a dataframe and
    tagged with the file's base name. The ``repository`` and ``version``
    columns are then extracted from that file name.

    Parameters
    ----------
    json_list : list of str
        Paths of the JSON files to read.

    Returns
    -------
    pandas.DataFrame
        All measures, sorted by ``repository`` and ``version``. When no file
        is processed, an empty dataframe with the ``filename``,
        ``repository`` and ``version`` columns is returned.
    """
    frames = []

    for json_path in json_list:

        # The "-test.json" files are not read by this function.
        if json_path.endswith("-test.json"):
            continue

        base_component = read_json(json_path)

        base_component_df = pd.DataFrame(base_component['baseComponent']['measures'])
        base_component_df['filename'] = os.path.basename(json_path)

        frames.append(base_component_df)

    # Nothing to concatenate: return an empty, well-formed frame instead of
    # failing on the missing 'filename' column below.
    if not frames:
        return pd.DataFrame(columns=['filename', 'repository', 'version'])

    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids re-copying the frame on every iteration.
    df = pd.concat(frames, ignore_index=True)

    # Splitting on a pattern with two capture groups leaves the captured
    # repository and version in columns 1 and 2 of the expanded result.
    aux_df = df['filename'].str.split(r"Oraculo-(.*?)-*-(.*?).json", expand=True)

    df['repository'] = aux_df[1]
    df['version'] = aux_df[2]

    return df.sort_values(by=['repository', 'version'])

#### Create base component dataframe

In [None]:
base_component_df = create_base_component_df(jsons)

In [None]:
base_component_df.head(10)

#### Create dataframe per file

In [None]:
# Metric names used as the initial columns of the per-file dataframe
# (see generate_file_dataframe_per_release, which drops 'files' again).
metric_list = ['files',
               'functions',
               'complexity',
               'comment_lines_density',
               'duplicated_lines_density',
               'coverage',
               'ncloc',
               'security_rating',
               'tests',
               'test_success_density',
               'test_execution_time',
               'reliability_rating']

# Displayed as the cell output: how many metrics are tracked.
len(metric_list)

In [None]:
def metric_per_file(json):
    """Return the entries of ``json['components']`` whose qualifier is 'FIL'.

    Parameters
    ----------
    json : dict
        Parsed payload containing a ``'components'`` list; every component
        must carry a ``'qualifier'`` key.

    Returns
    -------
    list
        The matching components, in their original order.
    """
    return [entry for entry in json['components'] if entry['qualifier'] == 'FIL']

def generate_file_dataframe_per_release(metric_list, json, language_extension):
    """Pivot per-file measures into one row per file path.

    Parameters
    ----------
    metric_list : list of str
        Initial column names of the resulting dataframe.
    json : iterable of dict
        File components; each is expected to provide ``'language'``,
        ``'path'`` and a ``'measures'`` list of ``{'metric', 'value'}`` dicts.
    language_extension : str
        Only components whose ``'language'`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        One row per kept file with a ``path`` column followed by the metric
        columns. The ``files`` column is removed when present.
    """
    df = pd.DataFrame(columns=metric_list)

    for file in json:
        try:
            if file['language'] == language_extension:
                for measure in file['measures']:
                    df.at[file['path'], measure['metric']] = measure['value']
        except (KeyError, TypeError, ValueError):
            # Best effort: components that lack an expected key (or whose
            # measure cannot be stored) are skipped. Unlike a bare ``except``
            # this no longer hides KeyboardInterrupt/SystemExit.
            continue

    # The file paths were used as row labels; expose them as a regular column.
    df = df.reset_index().rename(columns={'index': 'path'})

    # errors='ignore' keeps this working for a metric_list without 'files'.
    return df.drop(columns=['files'], errors='ignore')

def create_file_df(json_list):
    """Build one dataframe with the per-file measures of every JSON file.

    For each path in *json_list* that does not end in ``-test.json`` the
    file-level components are extracted (``metric_per_file``), turned into a
    dataframe of 'js' files using the module-level ``metric_list``
    (``generate_file_dataframe_per_release``) and completed with data from
    the companion ``<name>-test.json`` file.

    Parameters
    ----------
    json_list : list of str
        Paths of the JSON files to read.

    Returns
    -------
    pandas.DataFrame
        All per-file rows, sorted by ``repository`` and ``version``. When no
        file is processed, an empty dataframe with the ``filename``,
        ``repository`` and ``version`` columns is returned.
    """
    frames = []

    for json_path in json_list:

        # Companion test files are read explicitly below, not iterated.
        if json_path.endswith("-test.json"):
            continue

        file_component = read_json(json_path)
        file_component_data = metric_per_file(file_component)
        file_component_df = generate_file_dataframe_per_release(
            metric_list, file_component_data, language_extension='js')

        test_json = read_json(json_path.replace(".json", "-test.json"))

        # Every row gets tests = 1 and the execution time reported by the
        # companion file of its release.
        file_component_df['tests'] = 1
        file_component_df['test_execution_time'] = test_json['test_execution_time']

        file_component_df['filename'] = os.path.basename(json_path)

        frames.append(file_component_df)

    # Nothing to concatenate: return an empty, well-formed frame instead of
    # failing on the missing 'filename' column below.
    if not frames:
        return pd.DataFrame(columns=['filename', 'repository', 'version'])

    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids re-copying the frame on every iteration.
    df = pd.concat(frames, ignore_index=True)

    # Splitting on a pattern with two capture groups leaves the captured
    # repository and version in columns 1 and 2 of the expanded result.
    aux_df = df['filename'].str.split(r"Oraculo-(.*?)-*-(.*?).json", expand=True)

    df['repository'] = aux_df[1]
    df['version'] = aux_df[2]

    return df.sort_values(by=['repository', 'version'])

In [None]:
file_component_df = create_file_df(jsons)

In [None]:
file_component_df.head(10)

#### Create dataframe per repository

In [None]:
# Example

# One per-file dataframe per repository, selected by the 'repository' column.
frontEnd_df = file_component_df[file_component_df['repository'] == 'FrontEnd']
tags_df = file_component_df[file_component_df['repository'] == 'Tags']
profile_df = file_component_df[file_component_df['repository'] == 'Profile']
# NOTE: the variable is named "processos" but it holds the 'Registros' repository.
processos_df = file_component_df[file_component_df['repository'] == 'Registros']

In [None]:
def _ncloc(df):
    """Return the sum of the ``ncloc`` column, converting each entry with ``int``."""
    return sum(int(value) for value in df['ncloc'])

### Measure calculations according to the Q-Rapids quality model

# Quality Aspect - Maintainability
## Factor - Code Quality

##### COMPLEXITY

In [None]:
def m1(df):
    """Return the share of rows whose complexity per function is below 10.

    ``complexity`` and ``functions`` are cast to float before dividing. Rows
    where the ratio is NaN (e.g. 0 / 0) never satisfy the comparison and are
    therefore not counted. Raises ``ZeroDivisionError`` for an empty *df*.
    """
    complexity = df['complexity'].astype(float)
    functions = df['functions'].astype(float)

    is_non_complex = (complexity / functions) < 10

    return int(is_non_complex.sum()) / len(df)

##### COMMENTS

In [None]:
def m2(df):
    """Return the share of rows with ``comment_lines_density`` strictly between 10 and 30.

    The column is cast to float first. Raises ``ZeroDivisionError`` for an
    empty *df*.
    """
    density = df['comment_lines_density'].astype(float)

    in_range = (density > 10) & (density < 30)

    return int(in_range.sum()) / len(df)

##### DUPLICATIONS

In [None]:
def m3(df):
    """Return the share of rows with ``duplicated_lines_density`` below 5.

    The column is cast to float first. Raises ``ZeroDivisionError`` for an
    empty *df*.
    """
    few_duplications = df['duplicated_lines_density'].astype(float) < 5

    return int(few_duplications.sum()) / len(df)

# Quality Aspect - Reliability
## Factor - Testing Status

##### Passed tests

In [None]:
def m4(df):
    """Return the median of ``test_success_density`` (cast to float) divided by 100."""
    success_density = df['test_success_density'].astype(float)

    return success_density.median() / 100

##### Fast test builds

In [None]:
def m5(df):
    """Return the number of rows with ``test_execution_time`` below 300 divided by the sum of ``tests``.

    Both columns are cast to float before use.
    """
    is_fast = df['test_execution_time'].astype(float) < 300
    total_tests = sum(df['tests'].astype(float))

    return int(is_fast.sum()) / total_tests

##### Test coverage

In [None]:
def m6(df):
    """Return the share of rows with ``coverage`` above 60.

    The column is cast to float first. Raises ``ZeroDivisionError`` for an
    empty *df*.
    """
    is_covered = df['coverage'].astype(float) > 60

    return int(is_covered.sum()) / len(df)

### Calculate m1, m2, m3, m4, m5 and m6 for each repository

In [None]:
def create_metrics_df(df):
    """Compute m1..m6 and the total ncloc for every version found in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-file rows; must contain ``version`` and ``repository`` plus the
        columns read by ``m1`` .. ``m6`` and ``_ncloc``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct version (in order of first appearance) with the
        columns m1..m6, repository, version and ncloc.
    """
    column_order = ('m1', 'm2', 'm3', 'm4', 'm5', 'm6',
                    'repository', 'version', 'ncloc')
    collected = {name: [] for name in column_order}

    for release in df['version'].unique():

        release_df = df[df['version'] == release]

        row = {
            'm1': m1(release_df),
            'm2': m2(release_df),
            'm3': m3(release_df),
            'm4': m4(release_df),
            'm5': m5(release_df),
            'm6': m6(release_df),
            'ncloc': _ncloc(release_df),
            # The repository label is taken from the first row of the release.
            'repository': release_df['repository'].iloc[0],
            'version': release,
        }

        for name in column_order:
            collected[name].append(row[name])

    return pd.DataFrame(collected)

In [None]:
frontEnd_metrics = create_metrics_df(frontEnd_df)
tags_metrics = create_metrics_df(tags_df)
profile_metrics = create_metrics_df(profile_df)
processos_metrics = create_metrics_df(processos_df)

### Data visualization

- You must do this for each of your repositories

In [None]:
# FrontEnd: one line per measure (m1..m6), plotted against the row position
# in frontEnd_metrics.
fig = plt.figure(figsize=(20, 10))

plt.plot(frontEnd_metrics['m1'], label='Complexity', linewidth=3, marker='o', markersize=10)
plt.plot(frontEnd_metrics['m2'], label='Comments', linewidth=3, marker='o', markersize=10)
plt.plot(frontEnd_metrics['m3'], label='Few duplications', linewidth=3, marker='o', markersize=10)
plt.plot(frontEnd_metrics['m4'], label='Passed tests', linewidth=3, marker='o', markersize=10)
plt.plot(frontEnd_metrics['m5'], label='Fast test builds', linewidth=3, marker='o', markersize=10)
plt.plot(frontEnd_metrics['m6'], label='Test coverage', linewidth=3, marker='o', markersize=10)

plt.legend(loc='upper left')

In [None]:
# Tags: one line per measure (m1..m6), plotted against the row position
# in tags_metrics.
fig = plt.figure(figsize=(20, 10))

plt.plot(tags_metrics['m1'], label='Complexity', linewidth=3, marker='o', markersize=10)
plt.plot(tags_metrics['m2'], label='Comments', linewidth=3, marker='o', markersize=10)
plt.plot(tags_metrics['m3'], label='Few duplications', linewidth=3, marker='o', markersize=10)
plt.plot(tags_metrics['m4'], label='Passed tests', linewidth=3, marker='o', markersize=10)
plt.plot(tags_metrics['m5'], label='Fast test builds', linewidth=3, marker='o', markersize=10)
plt.plot(tags_metrics['m6'], label='Test coverage', linewidth=3, marker='o', markersize=10)

plt.legend(loc='upper left')

In [None]:
# Profile: one line per measure (m1..m6), plotted against the row position
# in profile_metrics.
fig = plt.figure(figsize=(20, 10))

plt.plot(profile_metrics['m1'], label='Complexity', linewidth=3, marker='o', markersize=10)
plt.plot(profile_metrics['m2'], label='Comments', linewidth=3, marker='o', markersize=10)
plt.plot(profile_metrics['m3'], label='Few duplications', linewidth=3, marker='o', markersize=10)
plt.plot(profile_metrics['m4'], label='Passed tests', linewidth=3, marker='o', markersize=10)
plt.plot(profile_metrics['m5'], label='Fast test builds', linewidth=3, marker='o', markersize=10)
plt.plot(profile_metrics['m6'], label='Test coverage', linewidth=3, marker='o', markersize=10)

plt.legend(loc='upper left')

In [None]:
# Registros (held in processos_metrics): one line per measure (m1..m6),
# plotted against the row position in the dataframe.
fig = plt.figure(figsize=(20, 10))

plt.plot(processos_metrics['m1'], label='Complexity', linewidth=3, marker='o', markersize=10)
plt.plot(processos_metrics['m2'], label='Comments', linewidth=3, marker='o', markersize=10)
plt.plot(processos_metrics['m3'], label='Few duplications', linewidth=3, marker='o', markersize=10)
plt.plot(processos_metrics['m4'], label='Passed tests', linewidth=3, marker='o', markersize=10)
plt.plot(processos_metrics['m5'], label='Fast test builds', linewidth=3, marker='o', markersize=10)
plt.plot(processos_metrics['m6'], label='Test coverage', linewidth=3, marker='o', markersize=10)

plt.legend(loc='upper left')

### Sub characteristic aggregation

- You must do this for each of your repositories

In [None]:
# Weights of the aggregation:
#   psc1 / psc2 scale code_quality / testing_status below;
#   pc1 / pc2 are defined here and applied to those two columns further down
#   in the notebook;
#   pm1..pm3 weight m1..m3, pm4..pm6 weight m4..m6.
psc1, psc2 = 1, 1
pc1, pc2 = 0.5, 0.5
# NOTE(review): pm1 + pm2 + pm3 = 0.99, so code_quality tops out at 0.99
# rather than 1 — confirm this rounding is intended.
pm1 = 0.33
pm2 = 0.33
pm3 = 0.33
pm4 = 0.15
pm5 = 0.15
pm6 = 0.7

# code_quality = weighted sum of m1, m2 and m3 (same formula for each repository).
frontEnd_metrics['code_quality'] = ((frontEnd_metrics['m1']*pm1) + (frontEnd_metrics['m2']*pm2) + (frontEnd_metrics['m3']*pm3)) * psc1
tags_metrics['code_quality'] = ((tags_metrics['m1']*pm1) + (tags_metrics['m2']*pm2) + (tags_metrics['m3']*pm3)) * psc1
profile_metrics['code_quality'] = ((profile_metrics['m1']*pm1) + (profile_metrics['m2']*pm2) + (profile_metrics['m3']*pm3)) * psc1
processos_metrics['code_quality'] = ((processos_metrics['m1']*pm1) + (processos_metrics['m2']*pm2) + (processos_metrics['m3']*pm3)) * psc1

# testing_status = weighted sum of m4, m5 and m6 (same formula for each repository).
frontEnd_metrics['testing_status'] = ((frontEnd_metrics['m4']*pm4) + (frontEnd_metrics['m5']*pm5) + (frontEnd_metrics['m6']*pm6)) * psc2
tags_metrics['testing_status'] = ((tags_metrics['m4']*pm4) + (tags_metrics['m5']*pm5) + (tags_metrics['m6']*pm6)) * psc2
profile_metrics['testing_status'] = ((profile_metrics['m4']*pm4) + (profile_metrics['m5']*pm5) + (profile_metrics['m6']*pm6)) * psc2
processos_metrics['testing_status'] = ((processos_metrics['m4']*pm4) + (processos_metrics['m5']*pm5) + (processos_metrics['m6']*pm6)) * psc2

In [None]:
# code_quality of the four repositories on one figure, each plotted against
# the row position in its own metrics dataframe.
fig = plt.figure(figsize=(20, 10))


plt.plot(frontEnd_metrics['code_quality'], label='front', linewidth=3, marker='o', markersize=5)
plt.plot(tags_metrics['code_quality'], label='tags', linewidth=3, marker='o', markersize=5)
plt.plot(profile_metrics['code_quality'], label='profile', linewidth=3, marker='o', markersize=5)
plt.plot(processos_metrics['code_quality'], label='registros', linewidth=3, marker='o', markersize=5)

plt.legend(loc='upper left')

In [None]:
# testing_status of the four repositories on one figure, each plotted against
# the row position in its own metrics dataframe.
fig = plt.figure(figsize=(20, 10))


plt.plot(frontEnd_metrics['testing_status'], label='front', linewidth=3, marker='o', markersize=5)
plt.plot(tags_metrics['testing_status'], label='tags', linewidth=3, marker='o', markersize=5)
plt.plot(profile_metrics['testing_status'], label='profile', linewidth=3, marker='o', markersize=5)
plt.plot(processos_metrics['testing_status'], label='registros', linewidth=3, marker='o', markersize=5)

plt.legend(loc='upper left')

In [None]:
# Stack the four per-repository metric frames into one, renumbering the rows.
metrics_df = pd.concat([
frontEnd_metrics,
tags_metrics,
profile_metrics,
processos_metrics
], ignore_index=True)

# Each characteristic is its sub-characteristic scaled by pc1 / pc2;
# 'total' is the sum of the two.
metrics_df['maintainability'] = metrics_df['code_quality'] * pc1
metrics_df['Reliability'] = metrics_df['testing_status'] * pc2
metrics_df['total'] = metrics_df['maintainability'] + metrics_df['Reliability']

In [None]:
metrics_df

In [None]:
# maintainability and Reliability over the rows of the combined metrics_df
# (all repositories stacked one after another).
fig = plt.figure(figsize=(20, 10))


plt.plot(metrics_df['maintainability'], label='maintainability', linewidth=3, marker='o', markersize=5)
plt.plot(metrics_df['Reliability'], label='Reliability', linewidth=3, marker='o', markersize=5)

plt.legend(loc='upper left')

#plt.ylim(.45,.7)

In [None]:
# 'total' over the rows of the combined metrics_df.
fig = plt.figure(figsize=(20, 10))


plt.plot(metrics_df['total'], linewidth=3, marker='o', markersize=5)


#plt.ylim(.45,.7)


In [None]:
# ANALYSIS_DATE FORMAT: MM-DD-YYYY
from datetime import datetime

# The date stamp is the moment this cell runs (naive local time).
now = datetime.now()
now_str = now.strftime("%m-%d-%Y")

# Select the columns of the exported dataset; .copy() so that adding the
# date column below does not modify metrics_df.
df = metrics_df[[
    'm1',
    'm2',
    'm3',
    'm4',
    'm5',
    'm6',
    'maintainability',
    'Reliability',
    'total',
    'ncloc',
    'repository',
    'version'
]].copy()
# The 'data' column carries the same date string on every row.
df['data'] = now_str

# The export itself is disabled; uncomment one of the lines below to write the file.
#df.to_excel('data/fga-eps-mds-2021_1-Oraculo-DATASET-{}.xlsx'.format(now_str), index = False)
#df.to_csv('data/fga-eps-mds-2021_1-Oraculo-DATASET-{}.csv'.format(now_str), index = False)

## Analysis

### Helper functions

In [None]:
def element_or_list(x):
    """Return the sole element of a one-item sequence, otherwise *x* unchanged."""
    if len(x) == 1:
        return x[0]
    return x

### Functions

In [None]:
def descriptive_statistics(**kwargs):
    """Tabulate basic descriptive statistics for each series passed by keyword.

    Parameters
    ----------
    **kwargs : pandas.Series
        Series to summarise; each keyword becomes a column name.

    Returns
    -------
    pandas.DataFrame
        One row per statistic (mean, mode, median, standard deviation,
        variance, minimum, maximum — labelled in Portuguese) and one column
        per keyword. A mode cell holds a single value when the mode is
        unique and a list otherwise.
    """
    names = list(kwargs)
    series = list(kwargs.values())

    data = {
        'média': [s.mean() for s in series],
        'moda': [element_or_list(s.mode().values.tolist()) for s in series],
        'mediana': [s.median() for s in series],
        'desvio padrão': [s.std() for s in series],
        'variância': [s.var() for s in series],
        'mínimo': [s.min() for s in series],
        'máximo': [s.max() for s in series],
    }

    return pd.DataFrame.from_dict(data, orient='index', columns=names)

In [None]:
def percentile(**kwargs):
    """Tabulate selected percentiles for each series passed by keyword.

    Parameters
    ----------
    **kwargs : pandas.Series
        Series to summarise; each keyword becomes a column name.

    Returns
    -------
    pandas.DataFrame
        Rows '5th', '10th', '25th', '50th', '75th', '90th', '95th' and
        '99th' (computed with ``Series.quantile``), one column per keyword.
    """
    cut_points = (
        ('5th', 0.05),
        ('10th', 0.10),
        ('25th', 0.25),
        ('50th', 0.50),
        ('75th', 0.75),
        ('90th', 0.90),
        ('95th', 0.95),
        ('99th', 0.99),
    )
    series = list(kwargs.values())

    data = {label: [s.quantile(q) for s in series] for label, q in cut_points}

    return pd.DataFrame.from_dict(data, orient='index', columns=list(kwargs))

In [None]:
def correlation_matrix(**kwargs):
    """Return the pairwise correlation (``DataFrame.corr``) of the given series.

    Each keyword becomes a column of an intermediate dataframe; the result
    is the correlation matrix of those columns.
    """
    frame = pd.DataFrame(dict(kwargs))
    return frame.corr()

In [None]:
def box_plot(maintainability, reliability):
    """Draw box plots of the two given series on one axes and show the figure.

    The first box is *maintainability*, the second *reliability*.
    """
    _, axes = plt.subplots()
    axes.boxplot([maintainability, reliability])

    plt.show()

In [None]:
def scatter(frontEnd, profile, registros, tags):
    """Scatter code_quality against testing_status for the four repositories.

    Parameters
    ----------
    frontEnd, profile, registros, tags : pandas.DataFrame
        Metric frames; each must contain the ``code_quality`` and
        ``testing_status`` columns.

    All four series are drawn on the same axes, one colour per repository,
    and the figure is shown.
    """
    # Create the axes up front and hand them to every DataFrame.plot call.
    # Previously plt.figure(figsize=...) opened a figure that was never drawn
    # on: the first DataFrame.plot call (no ``ax``) created its own figure, so
    # the requested size was ignored and an extra empty figure was displayed.
    fig, ax = plt.subplots(figsize=(12, 8))

    layers = (
        (frontEnd, 'r', 'front'),
        (profile, 'g', 'profile'),
        (registros, 'b', 'registros'),
        (tags, 'black', 'tags'),
    )
    for frame, color, label in layers:
        frame.plot(kind='scatter', x='code_quality', y='testing_status',
                   color=color, label=label, ax=ax)

    plt.show()

In [None]:
def histogram(total):
    """Show a histogram of *total* with a dashed vertical line at its mean."""
    mean_value = total.mean()

    plt.hist(total, color='limegreen')
    plt.axvline(mean_value, linestyle='dashed', color='black')
    plt.show()

### FrontEnd

In [None]:
df = frontEnd_metrics.copy()

#### Maintainability

In [None]:
descriptive_statistics(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

In [None]:
percentile(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

#### Reliability

In [None]:
descriptive_statistics(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

In [None]:
percentile(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

#### Correlation Matrix

In [None]:
correlation_matrix(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3'],
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6'],
)

#### Box-Plot

In [None]:
box_plot(df['code_quality'], df['testing_status'])

### Profile

In [None]:
df = profile_metrics.copy()

#### Maintainability

In [None]:
descriptive_statistics(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

In [None]:
percentile(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

#### Reliability

In [None]:
descriptive_statistics(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

In [None]:
percentile(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

#### Correlation Matrix

In [None]:
correlation_matrix(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3'],
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6'],
)

#### Box-Plot

In [None]:
box_plot(df['code_quality'], df['testing_status'])

### Registros

In [None]:
df = processos_metrics.copy()

#### Maintainability

In [None]:
descriptive_statistics(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

In [None]:
percentile(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

#### Reliability

In [None]:
descriptive_statistics(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

In [None]:
percentile(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

#### Correlation Matrix

In [None]:
correlation_matrix(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3'],
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6'],
)

#### Box-Plot

In [None]:
box_plot(df['code_quality'], df['testing_status'])

### Tags

In [None]:
df = tags_metrics.copy()

#### Maintainability

In [None]:
descriptive_statistics(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

In [None]:
percentile(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3']
)

#### Reliability

In [None]:
descriptive_statistics(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

In [None]:
percentile(
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6']
)

#### Correlation Matrix

In [None]:
correlation_matrix(
    complexity=df['m1'],
    comments=df['m2'],
    duplications=df['m3'],
    passed_tests=df['m4'],
    fast_tests=df['m5'],
    test_coverage=df['m6'],
)

#### Box-Plot

In [None]:
box_plot(df['code_quality'], df['testing_status'])

### Total

In [None]:
scatter(
    frontEnd_metrics,
    profile_metrics,
    processos_metrics,
    tags_metrics,
)