# Analytics

#### Date: 2020/02

#### SUMMARY:

- This notebook presents the project's quality analysis for the period stated above.

### TEAM: SiGeD

##### Semester: 2020/02
##### Professor: Hilmer Neri

##### Members:

- Vitor Leal
- Gabriel Carvalho
- Ezequiel de Oliveira
- Gabriela Guedes
- Davi Marinho da Silva Campos
- João Pedro Alves da Silva Chaves
- Thiago França Vale Oliveira
- Lucas da Cunha Andrade
- Dafne Moretti Moreira
- Victor Yukio Cavalcanti Miki

### LIBRARIES

In [8]:
# Deal with data
import pandas as pd
import numpy as np
import json
from glob import glob
import os

# Deal with API request
import urllib3
from urllib3 import request

# Deal with visualization
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'pandas'

In [7]:
!pip install --upgrade pandas

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: pandas in /Users/gabibguedes/Library/Python/2.7/lib/python/site-packages (0.24.2)


### GRAPH SETTINGS

In [None]:
# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format ='retina'
# Global seaborn defaults: scale every font by 1.5.
sns.set(font_scale=1.5)
# Start from the 'darkgrid' style and override a few rc parameters:
# show tick marks on the bottom/left axes, draw the grid dashed, pick the
# monospace font family and paint the axes border white.
sns.set_style('darkgrid',
              {'xtick.bottom' : True,
               'ytick.left': True,
               'grid.linestyle':'--',
               'font.monospace': ['Computer Modern Typewriter'],
               'axes.edgecolor' : 'white'})

### DATAFRAME SETTINGS

In [None]:
# Lift pandas' display limits so DataFrames are shown without truncating
# rows or columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### SonarCloud

##### Path to the folder with all your JSON files

In [None]:
# Collect every release JSON file. glob() returns matches in arbitrary order,
# so the list is sorted to make the processing order identical on every run.
jsons = sorted(glob('data/release_jsons/*.json')) # add the path here

In [None]:
def read_json(json_path):
    """Load the JSON document stored at ``json_path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII content is read the same way
    on every machine.
    """
    with open(json_path, encoding='utf-8') as json_file:
        return json.load(json_file)

def create_base_component_df(json_list):
    """Build one DataFrame with the base-component measures of every release file.

    Parameters
    ----------
    json_list : list of str
        Paths of the release JSON files. Each file must contain
        ``['baseComponent']['measures']``.

    Returns
    -------
    pandas.DataFrame
        The measures of all files stacked together, plus three columns:
        ``filename`` (base name of the source file) and ``repository`` /
        ``version``, both parsed from a file name of the form
        ``...SiGeD-<repository>-date_<version>.json``. Rows are sorted by
        ``repository`` then ``version``. File names that do not follow the
        pattern get NaN in both parsed columns. An empty ``json_list`` yields
        an empty frame that still has those three columns.
    """
    frames = []

    for json_path in json_list:
        measures = read_json(json_path)['baseComponent']['measures']

        frame = pd.DataFrame(measures)
        frame['filename'] = os.path.basename(json_path)
        frames.append(frame)

    # Nothing to concatenate: return the expected columns instead of failing
    # later on a missing 'filename' column.
    if not frames:
        return pd.DataFrame(columns=['filename', 'repository', 'version'])

    # Concatenate once; DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every iteration.
    df = pd.concat(frames, ignore_index=True)

    # Capture groups: 0 -> repository, 1 -> version. The dot before "json" is
    # escaped so it only matches a literal '.'.
    name_parts = df['filename'].str.extract(r"SiGeD-(.*?)-date_(.*?)\.json")

    df['repository'] = name_parts[0]
    df['version'] = name_parts[1]

    return df.sort_values(by=['repository', 'version'])

#### Create base component dataframe

In [None]:
# Base-component measures of every release file collected in `jsons`.
base_component_df = create_base_component_df(jsons)

In [None]:
# Preview the first 10 rows.
base_component_df.head(10)

#### Create dataframe per file

In [None]:
# Metric names requested for every file; they become the columns of the
# per-file DataFrame (in this order).
metric_list = [
    'files', 'functions', 'complexity',
    'comment_lines_density', 'duplicated_lines_density', 'coverage',
    'ncloc', 'security_rating', 'tests',
    'test_success_density', 'test_execution_time', 'reliability_rating',
]

# Displayed as the cell output: how many metrics are tracked.
len(metric_list)

In [None]:
def metric_per_file(json):
    """Return the entries of ``json['components']`` whose qualifier is ``'FIL'``.

    The components are scanned in order and the matching ones are returned,
    unchanged, in a new list.
    """
    return [entry for entry in json['components'] if entry['qualifier'] == 'FIL']

def generate_file_dataframe_per_release(metric_list, json, language_extension):
    """Build a one-row-per-file DataFrame of metric values for one release.

    Parameters
    ----------
    metric_list : list of str
        Metric names that become the columns, in this order. The ``'files'``
        column is dropped from the result when present.
    json : iterable of dict
        File components; each is expected to carry ``'language'``, ``'path'``
        and ``'measures'`` (a list of ``{'metric': ..., 'value': ...}``).
    language_extension : str
        Only components whose ``'language'`` equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        A ``'path'`` column followed by the metric columns. Metrics reported
        by a file but absent from ``metric_list`` are appended at the end.
        Cells with no reported value are NaN. Files with no usable measure
        produce no row.
    """
    columns = list(metric_list)
    rows = {}  # path -> {metric: value}, in order of first appearance

    for component in json:
        # Skip incomplete or foreign-language components explicitly. The
        # former bare `except: pass` hid these cases together with every
        # other error.
        if 'language' not in component or 'path' not in component:
            continue
        if component['language'] != language_extension:
            continue

        for measure in component.get('measures', []):
            if 'metric' not in measure or 'value' not in measure:
                continue
            metric = measure['metric']
            if metric not in columns:
                columns.append(metric)
            rows.setdefault(component['path'], {})[metric] = measure['value']

    # Build the frame in one go instead of enlarging it cell by cell.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()))
    df = df.reindex(columns=columns)
    df = df.rename_axis('path').reset_index()

    # errors='ignore': do not fail when 'files' was never a column.
    return df.drop(columns=['files'], errors='ignore')

def create_file_df(json_list):
    """Build one DataFrame with the per-file metrics of every release file.

    For each path in ``json_list`` the JSON is read, its file components are
    extracted with ``metric_per_file`` and turned into a frame of
    ``metric_list`` values for the ``'js'`` language. The frames are stacked
    and three columns are added: ``filename`` (base name of the source file)
    and ``repository`` / ``version``, both parsed from a file name of the form
    ``...SiGeD-<repository>-date_<version>.json``.

    Returns the stacked frame sorted by ``repository`` then ``version``. File
    names that do not follow the pattern get NaN in both parsed columns. An
    empty ``json_list`` yields an empty frame that still has the three added
    columns.
    """
    frames = []

    for json_path in json_list:
        file_components = metric_per_file(read_json(json_path))

        frame = generate_file_dataframe_per_release(metric_list,
                                                    file_components,
                                                    language_extension='js')
        frame['filename'] = os.path.basename(json_path)
        frames.append(frame)

    # Nothing to concatenate: return the expected columns instead of failing
    # later on a missing 'filename' column.
    if not frames:
        return pd.DataFrame(columns=['filename', 'repository', 'version'])

    # Concatenate once; DataFrame.append was removed in pandas 2.0.
    df = pd.concat(frames, ignore_index=True)

    # The team prefix is "SiGeD", as in create_base_component_df. The template
    # placeholder "TeamName" never matched any file name. Capture groups:
    # 0 -> repository, 1 -> version.
    name_parts = df['filename'].str.extract(r"SiGeD-(.*?)-date_(.*?)\.json")

    df['repository'] = name_parts[0]
    df['version'] = name_parts[1]

    return df.sort_values(by=['repository', 'version'])

In [None]:
# Per-file metrics of every release file collected in `jsons`.
file_component_df = create_file_df(jsons)

In [None]:
# Preview the first 10 rows.
file_component_df.head(10)

In [None]:
# Persist the per-file metrics to an Excel sheet, without the index column.
file_component_df.to_excel('data/data.xlsx', index = False)

#### Create dataframe per repository

In [None]:
# Example: one frame per repository, selected through the 'repository' column.
# NOTE(review): 'repo1' / 'repo2' look like template placeholders - replace
# them with the repository names actually present in
# file_component_df['repository'] and add one line per repository.

repo1_df = file_component_df[file_component_df['repository'] == 'repo1']
repo2_df = file_component_df[file_component_df['repository'] == 'repo2']
...

### Metric calculations

##### COMPLEXITY

In [None]:
def m1(df):
    """Fraction of rows in ``df`` whose complexity / functions ratio is below 10.

    Both columns are cast to float before dividing. The count of matching
    rows is divided by the total number of rows.
    """
    complexity_per_function = df['complexity'].astype(float) / df['functions'].astype(float)
    non_complex_rows = df[complexity_per_function < 10]

    return len(non_complex_rows) / len(df)

##### COMMENTS

In [None]:
def m2(df):
    """Fraction of rows in ``df`` whose comment density lies strictly between 10 and 30.

    ``comment_lines_density`` is cast to float. Both bounds are exclusive.
    """
    density = df['comment_lines_density'].astype(float)
    within_band = (density > 10) & (density < 30)

    return len(df[within_band]) / len(df)

##### DUPLICATIONS

In [None]:
def m3(df):
    """Fraction of rows in ``df`` whose duplicated-lines density is below 5.

    ``duplicated_lines_density`` is cast to float before the comparison.
    """
    duplicated = df['duplicated_lines_density'].astype(float)
    low_duplication_rows = df[duplicated < 5]

    return len(low_duplication_rows) / len(df)

### Calculate m1, m2 and m3 for each repository

In [None]:
def create_metrics_df(df):
    """Compute m1, m2 and m3 for every distinct version found in ``df``.

    ``df`` needs the ``version`` and ``repository`` columns plus whatever
    columns m1/m2/m3 read. The result has one row per version, in order of
    first appearance, with the columns ``m1``, ``m2``, ``m3``, ``repository``
    (taken from the first row of that version) and ``version``.
    """
    versions = df['version'].unique()

    # One slice of the input per version, in the same order as `versions`.
    per_version = [df[df['version'] == version] for version in versions]

    return pd.DataFrame({'m1': [m1(chunk) for chunk in per_version],
                         'm2': [m2(chunk) for chunk in per_version],
                         'm3': [m3(chunk) for chunk in per_version],
                         'repository': [chunk['repository'].iloc[0] for chunk in per_version],
                         'version': list(versions)})

In [None]:
# m1/m2/m3 per version, one metrics frame per repository frame.
# Add one line per repository.
repo1_metrics = create_metrics_df(repo1_df)
repo2_metrics = create_metrics_df(repo2_df)
...

### Data visualization

- You must do this for each of your repositories

In [None]:
# m1 for each version of repo1. The metrics live in `repo1_metrics`; the
# bare name `repo1` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1_metrics['m1'], linewidth=3, marker='o', markersize=10)
plt.title('repo1 - m1')
plt.xlabel('release index')
plt.ylabel('m1');

In [None]:
# m2 for each version of repo1. The metrics live in `repo1_metrics`; the
# bare name `repo1` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1_metrics['m2'], linewidth=3, marker='o', markersize=10)
plt.title('repo1 - m2')
plt.xlabel('release index')
plt.ylabel('m2');

In [None]:
# m3 for each version of repo1. The metrics live in `repo1_metrics`; the
# bare name `repo1` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1_metrics['m3'], linewidth=3, marker='o', markersize=10)
plt.title('repo1 - m3')
plt.xlabel('release index')
plt.ylabel('m3');

In [None]:
# m1, m2 and m3 of repo1 on the same axes. The metrics live in
# `repo1_metrics`; the bare name `repo1` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1_metrics['m1'], linewidth=3, marker='o', markersize=10, label='m1')
plt.plot(repo1_metrics['m2'], linewidth=3, marker='o', markersize=10, label='m2')
plt.plot(repo1_metrics['m3'], linewidth=3, marker='o', markersize=10, label='m3')
plt.title('repo1 - m1, m2 and m3')
plt.xlabel('release index')
plt.ylabel('metric value')
plt.legend();

### Sub characteristic aggregation

- You must do this for each of your repositories

In [None]:
# Weight of the sub characteristic (psc1) and of each metric (pm1..pm3).
# NOTE(review): the three metric weights add up to 0.99, not 1 - confirm
# this is intended.
psc1 = 1
pm1 = 0.33
pm2 = 0.33
pm3 = 0.33

# The aggregated value is stored on the `*_metrics` frames: those are the
# frames that exist in this notebook (`repo1` / `repo2` are never defined)
# and the ones concatenated later, where the 'asc1' column is read.
# Add one line per repository.
repo1_metrics['asc1'] = ((repo1_metrics['m1']*pm1)+(repo1_metrics['m2']*pm2)+(repo1_metrics['m3']*pm3))*psc1
repo2_metrics['asc1'] = ((repo2_metrics['m1']*pm1)+(repo2_metrics['m2']*pm2)+(repo2_metrics['m3']*pm3))*psc1
...

In [None]:
# asc1 for each version of repo1. The column is read from `repo1_metrics`;
# the bare name `repo1` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1_metrics['asc1'], linewidth=3, marker='o', markersize=10)
plt.title('repo1 - asc1')
plt.xlabel('release index')
plt.ylabel('asc1');

In [None]:
# asc1 for each version of repo2. The column is read from `repo2_metrics`;
# the bare name `repo2` is never defined in this notebook.
fig = plt.figure(figsize=(20, 10))

plt.plot(repo2_metrics['asc1'], linewidth=3, marker='o', markersize=10)
plt.title('repo2 - asc1')
plt.xlabel('release index')
plt.ylabel('asc1');

In [None]:
# asc1 of every repository on the same axes. The columns are read from the
# `*_metrics` frames; `repo1` / `repo2` are never defined in this notebook.
# Add one plt.plot line per repository.
fig = plt.figure(figsize=(20, 10))


plt.plot(repo1_metrics['asc1'], linewidth=3, marker='o', markersize=5, label='repo1')
plt.plot(repo2_metrics['asc1'], linewidth=3, marker='o', markersize=5, label='repo2')
plt.title('asc1 per repository')
plt.xlabel('release index')
plt.ylabel('asc1')
plt.legend()
...

In [None]:
# Stack the per-repository metric frames; add the remaining *_metrics frames
# to this list. A literal `...` inside the list is an Ellipsis object, which
# pd.concat rejects with a TypeError, so the placeholder is kept as a comment.
metrics_df = pd.concat([repo1_metrics, repo2_metrics], ignore_index=True)

# With a single sub characteristic, both aggregates equal asc1 (weight 1).
metrics_df['ac1'] = metrics_df['asc1'] * 1
metrics_df['total'] = metrics_df['asc1'] * 1

In [None]:
# Display the aggregated metrics of all repositories.
metrics_df

In [None]:
# Persist the aggregated metrics to an Excel sheet, without the index column.
metrics_df.to_excel('data/metrics_df.xlsx', index = False)