In [194]:
import more_itertools as mit
from strip_ansi import strip_ansi
import pandas as pd
import glob

## Generate Cyclomatic Complexity data

In [200]:
# convert the release date to datetime
def convert_release_date_to_datetime():
    raw_final_df['date_time'] = pd.to_datetime(raw_final_df["release_date"], format='%a %b %d %H:%M:%S %Y')
    raw_final_df.set_index('date_time', inplace=True)
    raw_final_df.info()


def save_data_to_file(dataframe, file_path): 
    # save the metrics 
    dataframe.to_csv(file_path) 


def get_cc_metrics_dict(line): 
    line = line.split(" ")
    block_type = line[0]
    name = line[2] 
    cc_score = line[4]
    return {"cc_score": cc_score, "block_name": name, "block_type": block_type  } 
    
def get_release_version_date(file_name): 
    tmp_lst = file_name.split("_")[2:7]
    version_date = " ".join(tmp_lst)
    return version_date
    

def get_release_version_name(file_name): 
    tmp_lst = file_name.split("_")[1]
    version_date = "".join(tmp_lst)
    return version_date   


def convert_cc_metrics_to_dataframe(all_metrics_list):
    df = pd.DataFrame(all_metrics_list)
    return df

def convert_mi_metrics_to_dataframe(all_metrics_list): 
    df = pd.DataFrame(all_metrics_list)
    return df

In [196]:
def create_dataframe(file_name,template_metrics_dict): 
    release_version_date = get_release_version_date(file_name)
    cnt = 0 
    metric_code = "cyclomatic_complexity"
    all_metrics_list = [] 
    FIRST = True
    with open(file_name, 'r', encoding="utf-8") as f:
        release_version_date = get_release_version_date(file_name)
        version_name =  get_release_version_name(file_name)
        start = True
        lines = f.readlines()
        for line in lines: 
            if "../" in line:
                if FIRST: 
                    metrics_dict = {}
                    FIRST = False
                else:     
                    all_metrics_list.append(metrics_dict)
                    metrics_dict = {}
                # create a new dict for this files
                metrics_dict = {} 
                metrics_dict.update(template_metrics_dict)
                metrics_dict["parent_file"] = line.replace("\n","") 
                metrics_dict["release_date"] = release_version_date
                metrics_dict["version_name"] = version_name 
                
            else: 
                # just append the other lines to
                # this list 
                line = strip_ansi(line).strip()
                metrics_dict.update(get_cc_metrics_dict(line))
                metrics_dict["metrics_code"] = metric_code
                
            
    cc_metrics_df = convert_cc_metrics_to_dataframe(all_metrics_list) 
    return cc_metrics_df



 
           


In [239]:
folder_path = "cc_*.txt"
df_lst = [] 
template_metrics_dict = {} 
for file_name in glob.glob(folder_path):
    df = create_dataframe(file_name,template_metrics_dict)
    df_lst.append(df)
    

cc_final_df = pd.concat(df_lst) 


In [240]:
cc_final_df['date_time'] = pd.to_datetime(cc_final_df["release_date"], format='%a %b %d %H:%M:%S %Y')
cc_final_df.set_index('date_time', inplace=True)
cc_final_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 235944 entries, 2022-05-24 17:21:45 to 2022-08-02 19:50:01
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   parent_file   235944 non-null  object
 1   release_date  235944 non-null  object
 2   version_name  235944 non-null  object
 3   cc_score      235944 non-null  object
 4   block_name    235944 non-null  object
 5   block_type    235944 non-null  object
 6   metrics_code  235944 non-null  object
dtypes: object(7)
memory usage: 14.4+ MB


In [241]:
# save the metrics 
cc_metrics_file_path="cyclomatic_complexity_dataset.csv"
cc_final_df.to_csv(cc_metrics_file_path) 

## Generate Maintainability Index data

In [176]:
def create_mi_dataframe(file_name,metrics_dict): 
    release_version_date = get_release_version_date(file_name)
    cnt = 0 
    metric_code = "maintainability_index"
    all_metrics_list = [] 
    
    with open(file_name, 'r', encoding="utf-8") as f:
        release_version_date = get_release_version_date(file_name)
        version_name =  get_release_version_name(file_name)
        start = True
        lines = f.readlines()
        for line in lines: 
            line = strip_ansi(line).strip()
            parent_file = line.split("-")[0]
            mi_score = line.split("-")[1]
            metrics_dict.update(template_metrics_dict)
            metrics_dict["parent_file"] = parent_file 
            metrics_dict["release_date"] = release_version_date
            metrics_dict["version_name"] = version_name 
            metrics_dict["metrics_code"] = metric_code
            
           
            
            metrics_dict["mi_score"]  = mi_score
            all_metrics_list.append(metrics_dict)    
            
    mi_metrics_df = convert_mi_metrics_to_dataframe(all_metrics_list) 
    return mi_metrics_df
    

In [177]:
folder_path = "mi_*.txt"
mi_df_lst = [] 
template_metrics_dict = {} 
for file_name in glob.glob(folder_path):
    df = create_mi_dataframe(file_name,template_metrics_dict)
    mi_df_lst.append(df)
    

mi_final_df = pd.concat(mi_df_lst) 


In [237]:
mi_final_df['date_time'] = pd.to_datetime(mi_final_df["release_date"], format='%a %b %d %H:%M:%S %Y')
mi_final_df.set_index('date_time', inplace=True)
mi_final_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 231603 entries, 2022-02-24 11:37:42 to 2022-08-02 19:50:01
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   parent_file   231603 non-null  object
 1   release_date  231603 non-null  object
 2   version_name  231603 non-null  object
 3   metrics_code  231603 non-null  object
 4   mi_score      231603 non-null  object
dtypes: object(5)
memory usage: 10.6+ MB


In [201]:
save_data_to_file(mi_final_df, "maintainability_index_dataset.csv")

## Generate Raw Metrics data

In [191]:
def create_raw_dataframe(file_name,template_metrics_dict): 
    release_version_date = get_release_version_date(file_name)
    version_name =  get_release_version_name(file_name)
   
    
    cnt = 0 
    metric_code = "raw_metrics"
    all_metrics_list = [] 
    FIRST = True
    
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f: 
            if "../" in line:
                if FIRST: 
                    metrics_dict = {}
                    FIRST = False
                else:     
                    all_metrics_list.append(metrics_dict)
                    metrics_dict = {}
                line = strip_ansi(line).strip()
                parent_file = line.split("-")[0]
                metrics_dict["parent_file"] = parent_file.split("/")[-1]
                metrics_dict["release_date"] = release_version_date
                metrics_dict["version_name"] = version_name 
            else:
                if "-" not in line:
                    lst = line.split(":")
                    metric_name = lst[0].replace("\n", "").strip()
                    metric_score = lst[1].replace("\n", "").strip()
                    metrics_dict[metric_name] = metric_score
                    metrics_dict["metrics_code"] = metric_code
           
    df = convert_mi_metrics_to_dataframe(all_metrics_list)        
    return df         
                
        
folder_path = "raw_*.txt"
raw_df_lst = [] 
template_metrics_dict = {} 
for file_name in glob.glob(folder_path):
    df = create_raw_dataframe(file_name,template_metrics_dict)
    raw_df_lst.append(df)
  
raw_final_df = pd.concat(raw_df_lst) 

In [211]:
raw_final_df["release_date"].head(1)

0    Thu Sep 1 00:58:22 2016
Name: release_date, dtype: object

In [210]:
import datetime
from datetime import date

date_time_str = '2018-06-29 08:15:27.243860'
date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S.%f')
date_time_obj

datetime.datetime(2018, 6, 29, 8, 15, 27, 243860)

In [235]:
# convert the release date to datetime
raw_final_df['date_time'] = pd.to_datetime(raw_final_df["release_date"], format='%a %b %d %H:%M:%S %Y')
raw_final_df.set_index('date_time', inplace=True)
raw_final_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 257823 entries, 2016-09-01 00:58:22 to 2020-01-14 09:05:04
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   parent_file      257823 non-null  object
 1   release_date     257823 non-null  object
 2   version_name     257823 non-null  object
 3   LOC              257823 non-null  object
 4   metrics_code     257823 non-null  object
 5   LLOC             257823 non-null  object
 6   SLOC             257823 non-null  object
 7   Comments         257823 non-null  object
 8   Single comments  257823 non-null  object
 9   Multi            257823 non-null  object
 10  Blank            257823 non-null  object
 11  (C % L)          257823 non-null  object
 12  (C % S)          257823 non-null  object
 13  (C + M % L)      257823 non-null  object
dtypes: object(14)
memory usage: 29.5+ MB


In [236]:
save_data_to_file(raw_final_df, "raw_metrics_datadataset.csv")

## Generate Halstead complexity metrics data 

In [139]:
def create_hal_dataframe(file_name,template_metrics_dict):
    
    release_version_date = get_release_version_date(file_name)
    version_name =  get_release_version_name(file_name)
    cnt = 0 
    metric_code = "Halstead complexity metrics"
    all_metrics_list = [] 
    FIRST = True
    
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f: 
            if "../" in line:
                if FIRST: 
                    metrics_dict = {}
                    FIRST = False
                else:     
                    all_metrics_list.append(metrics_dict)
                    metrics_dict = {}
                line = strip_ansi(line).strip()
                parent_file = line.split("-")[0]
                metrics_dict["parent_file"] = parent_file.split("/")[-1]
                metrics_dict["release_date"] = release_version_date
                metrics_dict["version_name"] = version_name 
            else: 
                lst = line.split(":")
                metric_name = lst[0].replace("\n", "").strip()
                metric_score = lst[1].replace("\n", "").strip()
                metrics_dict[metric_name] = metric_score
                
        
    df = convert_mi_metrics_to_dataframe(all_metrics_list)        
    return df  
    

In [183]:
folder_path = "hal_*.txt"
hal_df_lst = [] 
template_metrics_dict = {} 
for file_name in glob.glob(folder_path):
    df = create_hal_dataframe(file_name,template_metrics_dict)
    raw_df_lst.append(df)
    
    
raw_final_df = pd.concat(raw_df_lst) 


In [186]:
raw_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232359 entries, 0 to 3182
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   parent_file        232359 non-null  object
 1   release_date       232359 non-null  object
 2   version_name       232359 non-null  object
 3   h1                 232359 non-null  object
 4   h2                 232359 non-null  object
 5   N1                 232359 non-null  object
 6   N2                 232359 non-null  object
 7   vocabulary         232359 non-null  object
 8   length             232359 non-null  object
 9   calculated_length  232359 non-null  object
 10  volume             232359 non-null  object
 11  difficulty         232359 non-null  object
 12  effort             232359 non-null  object
 13  time               232359 non-null  object
 14  bugs               232359 non-null  object
dtypes: object(15)
memory usage: 28.4+ MB


In [160]:
raw_final_df.columns

Index(['parent_file', 'release_date', 'version_name', 'LOC', 'metrics_code',
       'LLOC', 'SLOC', 'Comments', 'Single comments', 'Multi', 'Blank',
       '(C % L)', '(C % S)', '(C + M % L)'],
      dtype='object')