This notebook analyses the data extracted from the studies of the TOM Project.

# Settings

## Install Libs

Install some useful Libs.

In [None]:
!pip install tikzplotlib==0.9.9 -q
!pip install pyvis -q
!pip install network2tikz -q

[?25l[K     |██████▏                         | 10 kB 15.1 MB/s eta 0:00:01[K     |████████████▎                   | 20 kB 17.8 MB/s eta 0:00:01[K     |██████████████████▍             | 30 kB 19.3 MB/s eta 0:00:01[K     |████████████████████████▌       | 40 kB 10.9 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 51 kB 11.3 MB/s eta 0:00:01[K     |████████████████████████████████| 53 kB 1.2 MB/s 
[?25h  Building wheel for pyvis (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 41 kB 80 kB/s 
[?25h

## Import Libs

Import the libs used for data analysis

In [None]:
import pandas as pd
import numpy as np
import altair as altair
import matplotlib.pyplot as plt
plt.style.use("default")

import urllib


from pyvis.network import Network

from IPython.core.display import display, HTML, display_latex
from network2tikz import plot

import networkx as nx

import itertools

# Preparation

## Importing and Exploring the Data

We import the data.

In [None]:
# Name of the worksheet to read from the Google Sheets document.
sheet_name = "DATA"
# Download that worksheet as CSV through the gviz export URL; the sheet name is
# URL-quoted before it is inserted into the query string.
data = pd.read_csv('https://docs.google.com/spreadsheets/d/13RhQTl8ku2OCeXvp3U5zI_W3kWQdXDolXR3G-XT3_DI/gviz/tq?tqx=out:csv&sheet={}'.format(urllib.parse.quote(sheet_name)))


# Names of the three data columns that the rest of the notebook refers to.
metrics_col_name = "Metric"
anomalies_col_name = "Anomaly"
actions_col_name = "Suggested Action"


def data_info(data = data):
  '''
  Print how many distinct metrics, anomalies, actions and studies a data set holds.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a "Study" column and the columns named by `metrics_col_name`,
      `anomalies_col_name` and `actions_col_name`.
      NOTE: the default value is bound once, when this cell is executed, to the
      frame that `data` names at that moment. Re-assigning `data` later does not
      change the default, so pass the frame explicitly.

  Returns
  -------
  None
      The summary is printed, not returned.
  '''
  metrics_count = data[metrics_col_name].nunique()
  anomalies_count = data[anomalies_col_name].nunique()
  actions_count = data[actions_col_name].nunique()
  n = data["Study"].nunique()

  # "Data points" is the sum of the three distinct counts above.
  print()
  print("This data includes {0} Metrics, {1} Anomalies and {2} Actions from {3} studies. In total, {4} data points.".
      format(metrics_count, anomalies_count, actions_count, n, sum([metrics_count, anomalies_count, actions_count])))

data_info(data)

# Data Analysis

## Preliminary Study Clustering

### Distribution of studies by country of origin based on first author’s affiliation

Distribution

In [None]:
# One row per (country, study) pair, then the number of studies per country,
# largest first.
(
    data[['Country', "Study"]]
    .drop_duplicates()
    .groupby(["Country"])
    .count()
    .sort_values("Study", ascending=False)
)

Unnamed: 0_level_0,Study
Country,Unnamed: 1_level_1
USA,16
India,12
China,4
Finland,4
Brazil,3
Canada,3
England,3
Australia,2
Germany,2
Belgium,1


Country Count

In [None]:
# Number of distinct values in the Country column.
len(data['Country'].unique())

18

### Distribution of studies by year of publication

Distribution

In [None]:
# One row per (year, study) pair, then the number of studies per publication
# year, most recent year first.
(
    data[['Publication year', "Study"]]
    .drop_duplicates()
    .groupby(["Publication year"])
    .count()
    .sort_values("Publication year", ascending=False)
)

Unnamed: 0_level_0,Study
Publication year,Unnamed: 1_level_1
2020,5
2019,2
2018,4
2017,4
2016,1
2015,2
2014,2
2013,5
2012,2
2011,2


Study count and percentage in last 10 years

In [None]:
# Number of distinct studies in the data set.
study_total_count = data['Study'].nunique()

# One row per (year, study) pair so that every study is counted once.
studies_by_year = data[['Publication year', "Study"]].drop_duplicates()

# The window is selected by year value: the most recent publication year and the
# ten years before it. Slicing the first 11 rows of the per-year table only
# gives this window when every year in it has at least one study.
# NOTE(review): the window spans 11 calendar years, as in the original slice — confirm.
latest_year = studies_by_year['Publication year'].max()
in_window = studies_by_year['Publication year'] >= latest_year - 10
study_count_last_10_years = int(studies_by_year.loc[in_window, 'Study'].count())

study_total_count, study_count_last_10_years, study_count_last_10_years / study_total_count

(58, 33, 0.5689655172413793)

### Distribution of studies by type of publication

Distribution

In [None]:
# One row per (publication type, study) pair, then the number of studies per
# publication type.
(
    data[['Publication type', "Study"]]
    .drop_duplicates()
    .groupby(["Publication type"])
    .count()
    .sort_values("Publication type", ascending=False)
)

Unnamed: 0_level_0,Study
Publication type,Unnamed: 1_level_1
Journal,39
Conference,19


Publication Type percentage

In [None]:
study_total_count = data['Study'].nunique()

# Number of studies per publication type.
publication_type_data = (
    data[['Publication type', "Study"]]
    .drop_duplicates()
    .groupby(["Publication type"])
    .count()
    .sort_values("Publication type", ascending=False)
)

# Share of all studies, rounded to two decimals.
publication_type_data['Percentage'] = (publication_type_data['Study'] / study_total_count).round(2)
publication_type_data

Unnamed: 0_level_0,Study,Percentage
Publication type,Unnamed: 1_level_1,Unnamed: 2_level_1
Journal,39,0.67
Conference,19,0.33


## Individual Data analysis

### Excluding data from Study "S53"

We exclude study S53 from the data used in the rest of the analysis.

In [None]:
# Keep the full data set (S53 included) reachable as `data1`, then rebind `data`
# to a copy without the S53 rows.
# NOTE(review): `data1 = data` is an alias, not a copy. Re-running this cell after
# `data` has been filtered makes `data1` lose S53 as well — run it once per kernel.
data1 = data
data = data1.drop(data[data['Study']=='S53'].index)
# data = data.drop(data[data['Study']=='S'].index)


# data = data.drop(data.loc[data['Anomaly'].str.contains('not specified').dropna().index[np.where(data['Anomaly'].str.contains('not specified').dropna())]].index)
# data = data.drop(data.loc[data['Suggested Action'].str.contains('not specified').dropna().index[np.where(data['Suggested Action'].str.contains('not specified').dropna())]].index)


# data
# Print the summary of the filtered data set.
data_info(data)


This data includes 384 Metrics, 374 Anomalies and 494 Actions from 57 studies. In total, 1252 data points.


### Generate Stats Table

Citation Info of studies

In [None]:
tom_citation = '''
hendis1981quantifying
nassar1986software
kitchenham1989quantitative
fox1989performance
mays1990experiences
card1988software
porter1990empirically
mays1990applications
boehm1991software
bhandari1993case
zage1995avoiding
lane1997intergrating
travassos1999detecting
ebert1999technical
mohapatra2001defect
leszak2002classification
younessi2003managing
wallace2004software
jalote2005using
wang2006bsr
chambers2006hazard
liu2006intelligent
white2006external
song2006software
shenvi2009defect
tsunoda2010modeling
bjorndal2010global
kapova2010evaluating
kumaresh2010defect
chowdhury2011using
nayak2012reliable
kumaresh2012experimental
kumaresh2012defect
dalal2013empirical
oyetoyan2013study
chang2013integrating
kumaresh2013software
fatima2013improving
lehtinen2014perceived
korkala2014waste
lavallee2015good
lehtinen2015development
yan2016automatically
kumar2017empirical
lehtinen2017recurring
shi2017metric
hu2017defect
freire2018bayesian
treude2018unusual
saini2018change
brito2018investigating
zong2019classification
caulo2020taxonomy
anastassiu2020resistance
mas2020supporting
saidani2020predicting
kolahdouz2020technical
abaei2020fuzzy
'''



# The literal starts and ends with a newline, so the first and last pieces of
# the split are empty strings and are dropped.
citation = tom_citation.split("\n")[1:-1]

# Study identifiers S01, S02, ... in the same order as the citation keys.
studies = ["S{:02d}".format(position) for position in range(1, len(citation) + 1)]
study_mapping = dict(zip(studies, citation))
study_mapping

{'S01': 'hendis1981quantifying',
 'S02': 'nassar1986software',
 'S03': 'kitchenham1989quantitative',
 'S04': 'fox1989performance',
 'S05': 'mays1990experiences',
 'S06': 'card1988software',
 'S07': 'porter1990empirically',
 'S08': 'mays1990applications',
 'S09': 'boehm1991software',
 'S10': 'bhandari1993case',
 'S11': 'zage1995avoiding',
 'S12': 'lane1997intergrating',
 'S13': 'travassos1999detecting',
 'S14': 'ebert1999technical',
 'S15': 'mohapatra2001defect',
 'S16': 'leszak2002classification',
 'S17': 'younessi2003managing',
 'S18': 'wallace2004software',
 'S19': 'jalote2005using',
 'S20': 'wang2006bsr',
 'S21': 'chambers2006hazard',
 'S22': 'liu2006intelligent',
 'S23': 'white2006external',
 'S24': 'song2006software',
 'S25': 'shenvi2009defect',
 'S26': 'tsunoda2010modeling',
 'S27': 'bjorndal2010global',
 'S28': 'kapova2010evaluating',
 'S29': 'kumaresh2010defect',
 'S30': 'chowdhury2011using',
 'S31': 'nayak2012reliable',
 'S32': 'kumaresh2012experimental',
 'S33': 'kumaresh2012

In [None]:
def study2citation(list_of_studies):
  '''
  Returns the citation of studies.

  Each study identifier in `list_of_studies` is looked up in the notebook-level
  `study_mapping` dictionary; the citation keys are returned as a list in the
  same order. An identifier missing from `study_mapping` raises KeyError.
  '''
  return [study_mapping[study_id] for study_id in list_of_studies]


study2citation(['S01','S02'])

['hendis1981quantifying', 'nassar1986software']

In [None]:
def stats(data1, latex_file_name = "stats.tex"):
  '''
  Returns statistics dataframe about the connections among all data types.

  Parameters
  ----------
  data1 : pandas.DataFrame
      Frame with a 'Study' column and the columns named by the notebook-level
      `metrics_col_name`, `anomalies_col_name` and `actions_col_name`.
  latex_file_name : str or None
      File the table is exported to with `DataFrame.to_latex`. A falsy value
      skips the export.

  Returns
  -------
  pandas.DataFrame
      One row per study, labelled with a LaTeX citet command built from
      `study_mapping` (the raw study ID is kept when it has no mapping), plus:
      - "Total": column-wise sum of the per-study rows;
      - "Count": number of distinct values / combinations over all studies.
      Cells equal to 0 are replaced by the empty string.
  '''
  value_cols = [metrics_col_name, anomalies_col_name, actions_col_name]

  # NOTE(review): 'Study' takes part in dropna(how="all"), so a row is only
  # dropped when the study ID is missing too — confirm this is intended.
  data = data1[['Study'] + value_cols].dropna(how="all").drop_duplicates()
  study_ids = list(data['Study'].unique())

  df = pd.DataFrame(
      columns = ['Samples','Metrics', 'Anomalies', 'Actions', 'Metric-Anomaly', 'Anomaly-Action', 'Metric-Action', 'Metric-Anomaly-Action'],
      index = study_ids + ["Total", "Count"])

  for study in study_ids:
    # Select the rows of the study once instead of once per statistic.
    rows = data[data['Study']==study]
    has_metric = rows[metrics_col_name].notnull()
    has_anomaly = rows[anomalies_col_name].notnull()
    has_action = rows[actions_col_name].notnull()

    per_study = [
        len(rows),
        rows[metrics_col_name].nunique(),
        rows[anomalies_col_name].nunique(),
        rows[actions_col_name].nunique(),
        # Connection columns: number of rows in which all listed kinds are present.
        int((has_metric & has_anomaly).sum()),
        int((has_anomaly & has_action).sum()),
        int((has_metric & has_action).sum()),
        int((has_metric & has_anomaly & has_action).sum()),
    ]
    for col, value in zip(df.columns, per_study):
      df.at[study, col] = value

  def _distinct(columns, complete_only = True):
    # Number of distinct value combinations in `columns`. Comparing the columns
    # themselves avoids the collisions of string concatenation ("ab"+"c" == "a"+"bc").
    subset = data[columns]
    if complete_only:
      subset = subset.dropna()
    return len(subset.drop_duplicates())

  counts = [
      # Samples: distinct (metric, anomaly, action) rows, missing values allowed.
      _distinct(value_cols, complete_only = False),
      _distinct([metrics_col_name]),
      _distinct([anomalies_col_name]),
      _distinct([actions_col_name]),
      _distinct([metrics_col_name, anomalies_col_name]),
      _distinct([anomalies_col_name, actions_col_name]),
      _distinct([metrics_col_name, actions_col_name]),
      _distinct(value_cols),
  ]

  # "Total" and "Count" are still empty here, so the sum covers the study rows only.
  for col in df.columns:
    df.at["Total", col] = df[col].sum()

  # Whole-row assignment needs .loc; .at is for single cells.
  df.loc["Count", :] = counts

  df = df.replace(0, "")

  def _row_label(study):
    # Summary rows keep their plain name; study rows are labelled through the
    # study's own ID, so a missing or excluded study cannot shift the labels.
    if study in ("Total", "Count"):
      return study
    citation_key = study_mapping.get(study)
    if citation_key is None:
      return study
    return r'\citet{' + citation_key + '}'

  df.index = [_row_label(study) for study in df.index]

  if (latex_file_name):
    df.to_latex(latex_file_name, escape = False, na_rep = "0")

  return df


df = stats(data, "stats.tex")


df

Unnamed: 0,Samples,Metrics,Anomalies,Actions,Metric-Anomaly,Anomaly-Action,Metric-Action,Metric-Anomaly-Action
\citet{hendis1981quantifying},9,2.0,7.0,,,,,
\citet{nassar1986software},14,4.0,,10.0,,,1.0,
\citet{kitchenham1989quantitative},46,29.0,4.0,17.0,,17.0,,
\citet{fox1989performance},4,1.0,4.0,4.0,1.0,4.0,1.0,1.0
\citet{mays1990experiences},18,,4.0,18.0,,18.0,,
\citet{card1988software},20,15.0,1.0,5.0,,1.0,,
\citet{porter1990empirically},4,4.0,1.0,,4.0,,,
\citet{mays1990applications},2,,2.0,2.0,,2.0,,
\citet{boehm1991software},10,,10.0,10.0,,10.0,,
\citet{bhandari1993case},9,9.0,7.0,7.0,9.0,9.0,9.0,9.0


In [None]:
assert not any(data['Study']=='S53'), "Study S53 should be removed since it is an outlier"

### Metrics Data

#### Metrics

Metric distribution in studies

In [None]:
# The 15 metrics reported by the largest number of distinct studies.
temp = (
    data[['Metric','Study']]
    .drop_duplicates()
    .dropna()
    .groupby("Metric")
    .count()
    .sort_values("Study", ascending=False)
    .head(15)
)

temp['Studies'] = None
temp['Citation'] = None

# Attach, per metric, the IDs and the citation keys of the studies that report it.
for metric in temp.index:
  study_ids = data[data['Metric']==metric]['Study'].unique()
  temp.at[metric, 'Studies'] = study_ids
  temp.at[metric, 'Citation'] = study2citation(study_ids)

print(temp.to_string())

                                       Study                                                       Studies                                                                                                                                                                                                                                                      Citation
Metric                                                                                                                                                                                                                                                                                                                                                                  
loc                                       12  [S02, S03, S06, S07, S12, S14, S28, S30, S44, S49, S52, S58]  [nassar1986software, kitchenham1989quantitative, card1988software, porter1990empirically, lane1997intergrating, ebert1999technical, kapova2010evaluating, chowdhury2011usi

Studies of "loc" metric

In [None]:
# Distinct (study, metric) rows for the 'loc' metric, then the citation key of
# each of those studies.
loc_rows = data.loc[data['Metric']=='loc', ['Study', "Metric"]].drop_duplicates()
studies_loc_metric = [study_mapping[study_id] for study_id in loc_rows['Study']]
studies_loc_metric

['nassar1986software',
 'kitchenham1989quantitative',
 'card1988software',
 'porter1990empirically',
 'lane1997intergrating',
 'ebert1999technical',
 'kapova2010evaluating',
 'chowdhury2011using',
 'kumar2017empirical',
 'treude2018unusual',
 'zong2019classification',
 'abaei2020fuzzy']

Number of studies that provide metrics

In [None]:
# Studies with at least one metric; S53 is not part of `data` and is not counted.
data.dropna(subset=['Metric', 'Study'])['Study'].nunique()


42

Metric count of S53 (Caulo et al. 2020)

In [None]:
# Distinct raw metrics of S53, read from the unfiltered frame `data1`.
data1.loc[data1['Study'] == 'S53', 'Raw metric'].nunique()

509

Total Metric count

In [None]:
# Number of distinct metrics, shown as a one-entry Series.
data.loc[:, ['Metric']].nunique()

Metric    384
dtype: int64

Metric category count

In [None]:
# Number of distinct metrics per metric category.
metric_category_dist = data[['Metric category','Metric']].dropna(how='all').drop_duplicates().groupby("Metric category").count()

# Read the count from the column itself; `[0]` on the label-indexed result of
# DataFrame.nunique() relies on deprecated positional access.
metrics_count = data['Metric'].nunique()

# Share of all distinct metrics. The same metric is counted in every category it
# appears under, so the shares can add up to more than 1.
metric_category_dist['Percentage'] = metric_category_dist['Metric'].apply(lambda x: round(x / metrics_count, 2))

metric_category_dist

Unnamed: 0_level_0,Metric,Percentage
Metric category,Unnamed: 1_level_1,Unnamed: 2_level_1
process,260,0.68
product,125,0.33
resources,4,0.01


Metrics used as action triggers or anomaly indicators

In [None]:
# Rows that carry a metric together with an anomaly and/or a suggested action;
# the result is the number of distinct values per column among those rows.
linked_cols = ['Anomaly', 'Suggested Action']
has_metric = data['Metric'].notna()
has_link = data[linked_cols].notna().any(axis=1)
data.loc[has_metric & has_link, ['Metric'] + linked_cols].nunique()

Metric              231
Anomaly             124
Suggested Action    141
dtype: int64

Anomaly-Metric-Action Connection components

In [None]:
# Distinct complete (metric, anomaly, action) triples.
temp = data[['Metric','Anomaly','Suggested Action']].dropna().drop_duplicates()

# Every column is complete after dropna(), so the number of triples is the row
# count; this avoids positional `[0]` on a label-indexed Series.
print(len(temp))

temp.nunique()

112


Metric              49
Anomaly             85
Suggested Action    89
dtype: int64

#### Metrics used as anomaly indicators

Distribution of software metrics used as anomaly indicators

In [None]:
# Distinct (study, metric) pairs among the rows that have both a metric and an anomaly.
metric_anomaly_rows = data.dropna(subset=['Metric', 'Anomaly'])
temp3 = metric_anomaly_rows[['Study','Metric']].dropna().drop_duplicates()

# Number of studies per metric, largest first.
metric_study_count = temp3.groupby("Metric").count().sort_values("Study", ascending = False)

temp2 = data.loc[temp3.index]

# `temp` is the same object as `metric_study_count`; the columns are added to both.
temp = metric_study_count

temp['Studies'] = None
temp['Citation'] = None

for metric in temp.index:
  study_ids = temp2.loc[temp2['Metric']==metric, 'Study'].unique()
  temp.at[metric, 'Studies'] = study_ids
  temp.at[metric, 'Citation'] = study2citation(study_ids)

print(temp.head(10).to_string())

                                       Study                         Studies                                                                                                                  Citation
Metric                                                                                                                                                                                                
loc                                        6  [S07, S12, S30, S44, S49, S58]  [porter1990empirically, lane1997intergrating, chowdhury2011using, kumar2017empirical, treude2018unusual, abaei2020fuzzy]
cyclomatic complexity                      4            [S07, S30, S44, S58]                                           [porter1990empirically, chowdhury2011using, kumar2017empirical, abaei2020fuzzy]
# change requests                          3                 [S40, S41, S49]                                                                   [korkala2014waste, lavallee2015good, treude2018unusual]
# def

Studies which used 'loc' as anomaly indicator

In [None]:
# Distinct (study, metric) pairs taken from every row that has both a metric and
# an anomaly. The rows are filtered directly: de-duplicating (Metric, Anomaly)
# pairs first keeps only the first study that reports a pair and drops any later
# study reporting the same pair.
has_metric_and_anomaly = data['Metric'].notna() & data['Anomaly'].notna()
studies_metric_anomaly = data.loc[has_metric_and_anomaly, ['Study','Metric']].dropna().drop_duplicates().sort_values("Metric")

# Studies that used 'loc', with their citation keys.
t = studies_metric_anomaly[studies_metric_anomaly['Metric']=='loc'].sort_values("Study")
t['Mapping'] = t['Study'].map(lambda x: study_mapping[x])
t

Unnamed: 0,Study,Metric,Mapping
116,S07,loc,porter1990empirically
143,S12,loc,lane1997intergrating
497,S30,loc,chowdhury2011using
742,S44,loc,kumar2017empirical
828,S49,loc,treude2018unusual
1608,S58,loc,abaei2020fuzzy


Number of studies that used metrics as anomaly indicators

In [None]:
# Distinct studies among the rows that have a study, a metric and an anomaly.
data.dropna(subset=['Study', 'Metric', 'Anomaly'])['Study'].nunique()

27

Metric count used as anomaly indicators

In [None]:
# Distinct metrics among the rows that have both a metric and an anomaly.
data.dropna(subset=['Metric', 'Anomaly'])['Metric'].nunique()

155

Anomaly count detected by metrics

In [None]:
# Distinct anomalies among the rows that have both a metric and an anomaly.
data.dropna(subset=['Metric', 'Anomaly'])['Anomaly'].nunique()

124

Top software metrics ranked by number of anomalies detected

In [None]:
# Distinct (metric, anomaly) pairs.
temp1 = data[['Metric','Anomaly']].dropna().drop_duplicates()

# Every row that has both a metric and an anomaly. The studies are read from
# all of these rows: reading them only from the first row of each distinct pair
# misses studies that report a pair already reported by an earlier study.
temp2 = data[data['Metric'].notna() & data['Anomaly'].notna()]

# The 10 metrics linked to the largest number of distinct anomalies.
temp3 = temp1.groupby("Metric").count().sort_values("Anomaly", ascending = False).head(10)


temp = temp3

temp['Studies'] = None
temp['Citation'] = None

for metric in temp.index:
  study_ids = temp2[temp2['Metric']==metric]['Study'].unique()
  temp.at[metric, 'Studies'] = study_ids
  temp.at[metric, 'Citation'] = study2citation(study_ids)


print(temp.to_string())

                                              Anomaly                         Studies                                                                                                                  Citation
Metric                                                                                                                                                                                                         
defect injection metric                            30                           [S33]                                                                                                      [kumaresh2012defect]
frequnce of interactions, number of meetings       10                           [S41]                                                                                                        [lavallee2015good]
defect density                                     10                      [S26, S29]                                                                                 [t

In [None]:
temp

Unnamed: 0_level_0,Anomaly,Studies,Citation
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
defect injection metric,30,[S33],[kumaresh2012defect]
"frequnce of interactions, number of meetings",10,[S41],[lavallee2015good]
defect density,10,"[S26, S29]","[tsunoda2010modeling, kumaresh2010defect]"
loc,7,"[S07, S12, S30, S44, S49, S58]","[porter1990empirically, lane1997intergrating, ..."
cyclomatic complexity,4,"[S07, S30, S44, S58]","[porter1990empirically, chowdhury2011using, ku..."
# change requests,3,"[S40, S41, S49]","[korkala2014waste, lavallee2015good, treude201..."
# defects,3,"[S10, S16, S26]","[bhandari1993case, leszak2002classification, t..."
efforts (time spent) in coding,2,[S34],[dalal2013empirical]
coupling between object classes (cbo),2,"[S30, S44]","[chowdhury2011using, kumar2017empirical]"
depth of inheritance tree (dit),2,"[S30, S44]","[chowdhury2011using, kumar2017empirical]"


Studies of 'loc' used as anomaly indicator

In [None]:
# Metric of every row that has both a metric and an anomaly. The rows are not
# de-duplicated on (Metric, Anomaly) first, because that keeps only the first
# study reporting a pair and drops later studies reporting the same pair.
has_metric_and_anomaly = data['Metric'].notna() & data['Anomaly'].notna()
temp = data.loc[has_metric_and_anomaly, 'Metric']


# Distinct studies that used 'loc', with their citation keys.
temp2 = data.loc[temp[temp=='loc'].index][['Study','Metric']].dropna().drop_duplicates()
temp2['citation'] = [study_mapping[k] for k in temp2['Study']]

temp2


Unnamed: 0,Study,Metric,citation
116,S07,loc,porter1990empirically
143,S12,loc,lane1997intergrating
497,S30,loc,chowdhury2011using
742,S44,loc,kumar2017empirical
828,S49,loc,treude2018unusual
1608,S58,loc,abaei2020fuzzy


Studies of 'cyclomatic complexity' used as anomaly indicator

In [None]:
# Metric of every row that has both a metric and an anomaly. The rows are not
# de-duplicated on (Metric, Anomaly) first, because that keeps only the first
# study reporting a pair and drops later studies reporting the same pair.
has_metric_and_anomaly = data['Metric'].notna() & data['Anomaly'].notna()
temp = data.loc[has_metric_and_anomaly, 'Metric']


# Distinct studies that used 'cyclomatic complexity', with their citation keys.
temp2 = data.loc[temp[temp=='cyclomatic complexity'].index][['Study','Metric']].dropna().drop_duplicates()
temp2['citation'] = [study_mapping[k] for k in temp2['Study']]

temp2


Unnamed: 0,Study,Metric,citation
114,S07,cyclomatic complexity,porter1990empirically
491,S30,cyclomatic complexity,chowdhury2011using
750,S44,cyclomatic complexity,kumar2017empirical
1609,S58,cyclomatic complexity,abaei2020fuzzy


#### Metrics used as action triggers

Metric count used as action triggers

In [None]:
# Distinct metrics among the rows that have both a suggested action and a metric.
temp = data.loc[data[['Suggested Action','Metric']].dropna().drop_duplicates().index][['Metric']].dropna().drop_duplicates()

# One row per distinct metric, so the row count is the answer; this avoids
# positional `[0]` on a label-indexed Series.
len(temp)

125

Studies that used metrics as action triggers

In [None]:
# Distinct studies among every row that has both a suggested action and a
# metric. The rows are not de-duplicated on (action, metric) first, because that
# drops a study whose pairs were all reported by earlier studies.
temp = data.dropna(subset=['Suggested Action','Metric'])[['Study']].dropna().drop_duplicates()

temp = [study_mapping[k] for k in temp['Study']]

temp, len(temp)

(['nassar1986software',
  'fox1989performance',
  'bhandari1993case',
  'zage1995avoiding',
  'lane1997intergrating',
  'ebert1999technical',
  'mohapatra2001defect',
  'leszak2002classification',
  'jalote2005using',
  'wang2006bsr',
  'tsunoda2010modeling',
  'kumaresh2010defect',
  'kumaresh2012experimental',
  'kumaresh2012defect',
  'oyetoyan2013study',
  'korkala2014waste',
  'lavallee2015good',
  'hu2017defect',
  'brito2018investigating'],
 19)

Number of actions triggered by metrics

In [None]:
# Distinct suggested actions among the rows that have both an action and a metric.
temp = data.loc[data[['Suggested Action','Metric']].dropna().drop_duplicates().index][['Suggested Action']].dropna().drop_duplicates()

# One row per distinct action, so the row count is the answer; this avoids
# positional `[0]` on a label-indexed Series.
len(temp)

141

Categories of the actions triggered by metrics

In [None]:
# First row of each distinct suggested action among the rows that have both an
# action and a metric.
action_metric_rows = data.dropna(subset=['Suggested Action','Metric'])
temp = action_metric_rows.drop_duplicates(subset=['Suggested Action'])[['Suggested Action']]

# Category recorded on that first row, then the number of actions per category.
temp1 = data.loc[temp.index][['Suggested Action', 'Action category']]
temp2 = temp1.groupby("Action category").count()

temp2

Unnamed: 0_level_0,Suggested Action
Action category,Unnamed: 1_level_1
corrective,30
enhancement,56
preventive,54


Metric categories used as action triggers

In [None]:
# First row of each distinct metric among the rows that have both an action and a metric.
action_metric_rows = data.dropna(subset=['Suggested Action','Metric'])
temp = action_metric_rows.drop_duplicates(subset=['Metric'])[['Metric']]

# Category recorded on that first row, then the number of metrics per category.
(
    data.loc[temp.index][["Metric category", "Metric"]]
    .drop_duplicates()
    .dropna()
    .groupby("Metric category")
    .count()
)


Unnamed: 0_level_0,Metric
Metric category,Unnamed: 1_level_1
process,109
product,16


Distribution of metrics used as action triggers of category preventive or corrective

In [None]:
# Keep the rows whose action category is known and is not 'enhancement',
# i.e. the 'preventive' and 'corrective' actions.
temp01 = data['Action category'].dropna()
temp0 = data.loc[temp01[temp01 != 'enhancement'].index]

# Distinct (study, metric) pairs among those rows that have both an action and a metric.
temp1 = (
    data.loc[temp0.dropna(subset=['Suggested Action','Metric']).index][['Study', 'Metric']]
    .drop_duplicates()
    .dropna()
)

temp2 = data.loc[temp1.index]

# The 10 metrics used by the largest number of studies.
temp3 = temp1.groupby("Metric").count().sort_values("Study", ascending=False).head(10)

temp = temp3

temp['Studies'] = None
temp['Citation'] = None

for metric in temp.index:
  study_ids = temp2.loc[temp2['Metric']==metric, 'Study'].unique()
  temp.at[metric, 'Studies'] = study_ids
  temp.at[metric, 'Citation'] = study2citation(study_ids)

print(temp.to_string())
# temp0

                                              Study          Studies                                                           Citation
Metric                                                                                                                                 
# defects                                         3  [S10, S16, S26]  [bhandari1993case, leszak2002classification, tsunoda2010modeling]
# change requests                                 2       [S40, S41]                               [korkala2014waste, lavallee2015good]
defect density                                    2       [S26, S29]                          [tsunoda2010modeling, kumaresh2010defect]
# capability defects                              1            [S10]                                                 [bhandari1993case]
number of 'team historians'                       1            [S41]                                                 [lavallee2015good]
defects in release                              

<!-- No commonly used metrics as action triggers -->

Commonly used metrics as action triggers exist

Distribution of metrics used as action triggers of category enhancement

In [None]:
# data.loc[data.loc[data[data.isna()['Anomaly']].index][['Suggested Action','Metric']].dropna().index][['Study', 'Metric']].drop_duplicates().dropna().groupby("Metric").count().sort_values("Study")

# Take into account only 'enhancement' actions: rows with any other action category are left out.
temp0 = data['Action category'].dropna()
temp01 = data.loc[temp0.index]['Action category']
temp0 = temp01[temp01=='enhancement']
temp0 = data.loc[temp0.index]
# temp0 = data

# Select the distinct (study, metric) pairs of the rows which have both metrics and actions.
temp1 = data.loc[temp0[['Suggested Action','Metric']].dropna().index][['Study', 'Metric']].drop_duplicates().dropna()


# Count the distinct enhancement rows of one specific metric.
# NOTE(review): the printed text hard-codes "in 2 studies"; the code does not compute that number.
aa = temp0.drop_duplicates()
print("Number of enhancement actions triggered by metric 'productivity in function points' in 2 studies > ", len(aa[aa['Metric'] == 'productivity in function points']))
# print()

temp2 = data.loc[temp1.index]

# Group by Metric and count studies; keep the 10 metrics with the most studies.
temp3 = temp1.groupby("Metric").count().sort_values("Study",ascending  = False).head(10)


temp = temp3

temp['Studies'] = None
temp['Citation'] = None

# Attach, per metric, the IDs and the citation keys of the studies that use it.
for s in temp.index:
  temp.at[s, 'Studies'] = temp2[temp2['Metric']==s]['Study'].unique()
  temp.at[s, 'Citation'] = study2citation(temp2[temp2['Metric']==s]['Study'].unique())


print(temp.to_string())


Number of enhancement actions triggered by metric 'productivity in function points' in 2 studies >  14
                                 Study     Studies                                        Citation
Metric                                                                                            
productivity in function points      2  [S12, S51]  [lane1997intergrating, brito2018investigating]
actual procurement time              1       [S51]                        [brito2018investigating]
review duration                      1       [S51]                        [brito2018investigating]
review speed                         1       [S51]                        [brito2018investigating]
review rate                          1       [S51]                        [brito2018investigating]
review preparation rate              1       [S51]                        [brito2018investigating]
review performance                   1       [S51]                        [brito2018investigating]
review

Top software metrics ranked by number of actions suggested

In [None]:

# Rows whose action category is known and is not 'enhancement'.
temp2 = data['Action category'].dropna()
temp3 = data[data['Action category'].notna() & (data['Action category'] != 'enhancement')]

# Rows with a metric, an action and a category.
# NOTE(review): rows are not de-duplicated, so the counts below are row counts,
# not distinct-action counts — confirm this is intended.
temp4 = temp3[['Metric','Suggested Action',"Action category"]].dropna()

# Number of rows per (metric, action category), largest first.
temp5 = (
    temp4
    .groupby(["Metric","Action category"])
    .count()
    .sort_values(["Suggested Action","Action category"], ascending=False)
)

temp5.head(10)

# study2citation(data.loc[temp4.index]['Study'].unique())

Unnamed: 0_level_0,Unnamed: 1_level_0,Suggested Action
Metric,Action category,Unnamed: 2_level_1
defect injection metric,preventive,30
defect density,preventive,19
"frequnce of interactions, number of meetings",corrective,10
defect density,corrective,4
# defects,corrective,3
# change requests,corrective,2
# identified errors,preventive,1
# identified faults,preventive,1
# lapses,preventive,1
# number of human mistakes,preventive,1


### Anomaly Data

Anomaly count

In [None]:
# Number of distinct anomalies.
data['Anomaly'].nunique()

374

Root Causes count

In [None]:
# Distinct root causes recorded on the first row of each distinct anomaly.
data.dropna(subset=['Anomaly']).drop_duplicates(subset=['Anomaly'])['Root causes'].nunique()


142

Number of studies that provide anomaly data

In [None]:
# Distinct studies among the rows that have both a study and an anomaly.
data.dropna(subset=['Study', 'Anomaly'])[['Study']].nunique()

Study    45
dtype: int64

Top software anomalies ranked by the number of suggested actions

In [None]:
# Complete (anomaly, action, category) rows.
temp1 = data[['Anomaly', 'Suggested Action', "Action category"]].dropna()

# Distinct triples, so an action is counted once per anomaly and category.
temp2 = temp1.drop_duplicates()
#"Action category"
# Number of distinct triples per anomaly.
temp3 = temp2.groupby(["Anomaly"]).count().sort_values(["Suggested Action"], ascending = False)

# Number of distinct actions per (anomaly, action category), largest first.
temp4 = temp2.groupby(["Anomaly", "Action category"]).count().sort_values(["Suggested Action"], ascending = False)


# Not the last expression of the cell, so this line displays nothing.
temp3.head(10)
# temp4.index = temp4.index.swaplevel(0, 1)
# temp4.sort_index(axis = 1, level=['Anomaly', 'Action category']).sortlevel()

# Move the category out of the index: temp5 is indexed by anomaly only.
temp5 = temp4.reset_index(1)

# Same table with the anomaly as an ordinary column, used for the per-anomaly sum.
temp6 = temp5.reset_index()

# Per-anomaly total over all categories; the assignment aligns on the anomaly
# index, so every category row of an anomaly receives the same total.
temp5['Total action'] = temp6[['Anomaly','Suggested Action']].groupby("Anomaly").sum()
# temp5[['Anomaly','Suggested Action']].groupby("Anomaly").sum()
# temp5

temp5.head(10)

Unnamed: 0_level_0,Action category,Suggested Action,Total action
Anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
resistance to change,preventive,27,29
inaccurate estimation of efforts,corrective,15,15
systemic defects into software.,preventive,11,12
insufficient development work outcome,corrective,10,10
a lot of bugs encountered,corrective,10,10
oversight issue,preventive,8,8
the technical activities needed in a particular stage of development appear to have been inadequately performed.,corrective,7,7
casual defects of operations and others,preventive,5,5
lack of information on how the system should work,corrective,5,5
vague customer needs,preventive,4,4


Top software anomalies ranked by number of metrics used in detection

In [None]:
# Unique (anomaly, metric) pairs; rows missing either value are dropped.
temp1 = data[['Anomaly','Metric']].dropna().drop_duplicates()

# Number of distinct metrics per anomaly, largest first.
temp2 = (
    temp1
    .groupby("Anomaly")
    .count()
    .sort_values("Metric", ascending=False)
    .reset_index()
)
print(temp2.head(15).to_string())

                                                                                                                                                                                                      Anomaly  Metric
0                                                                                                                                                                                            ci build failure      33
1                                                                                                                                                           the class in objected oriented software is faulty      18
2                                                                                                                                                                             vulnerability files in software      13
3                                                                                                                                               

### Actions Data

Action count

In [None]:
# Number of distinct suggested actions in the dataset (missing values are not counted).
data['Suggested Action'].nunique()


494

In [None]:
# First row of every distinct action, reduced to the (action, category) pair.
# Actions whose first row has no category are dropped by dropna().
categorised_actions = (
    data.loc[
        data['Suggested Action'].dropna().drop_duplicates().index,
        ['Suggested Action', 'Action category'],
    ]
    .dropna()
    .drop_duplicates()
)

# Series with the number of distinct actions and distinct categories.
action_count = categorised_actions.nunique()

# Number of actions in each category.
temp = categorised_actions.groupby("Action category").count()

# Look the denominator up by label: positional `[0]` on a label-indexed
# Series is deprecated in recent pandas releases.
temp['Percentage'] = temp['Suggested Action'] / action_count['Suggested Action']

temp

Unnamed: 0_level_0,Suggested Action,Percentage
Action category,Unnamed: 1_level_1,Unnamed: 2_level_1
corrective,175,0.35497
enhancement,68,0.137931
preventive,250,0.507099


Action count in CMMI key process areas

In [None]:
# Unique (action, CMMI process area) pairs; rows missing either value are dropped.
temp1 = data[['Suggested Action', 'CMMI']].dropna().drop_duplicates()

# Number of distinct actions per CMMI process area, ordered by area name.
temp1.groupby("CMMI").count().sort_index()

Unnamed: 0_level_0,Suggested Action
CMMI,Unnamed: 1_level_1
casual analysis and resolution,3
configuration management,16
integrated project management,96
measurement and analysis,4
organizational innovation and deployment,1
organizational process definition,13
organizational process focus,39
organizational process performance,12
organizational training,36
process and product quality assurance,23


Top preventive and corrective actions ranked by the number of metrics used in triggering the action

In [None]:
# Rows whose action category is known and is not "enhancement".
temp = data['Action category'].dropna()
temp = temp.loc[temp.ne('enhancement')]

# Unique (action, metric) pairs among those rows.
temp1 = (
    data.loc[temp.index, ['Suggested Action', 'Metric']]
    .dropna()
    .drop_duplicates()
)

# Number of distinct metrics per action, largest first.
temp2 = (
    temp1
    .groupby("Suggested Action")
    .count()
    .sort_values("Metric", ascending=False)
)
print(temp2.head(10).reset_index().to_string())

                                                                                                                                                                         Suggested Action  Metric
0                                                                                                                                   train developers to understand the requirement errors       6
1                                                                                                                                      emphasize nonfunctional areas in succeeding phases       3
2                                                                                                                                reduce or avoid cyclic dependencies in depelopment stage       3
3  team members should make sure that knowledge is appropriately distributed amongst them.  for example, pair programming, code review are practices which can promote knowledge sharing.       3
4                             

Number of actions suggested for fixing anomalies

In [None]:
# Unique (anomaly, action) pairs; rows missing either value are dropped.
temp1 = data[['Anomaly', 'Suggested Action']].dropna().drop_duplicates()

# Number of pairs. After dropna() every column has the same count, so the row
# count is used directly instead of the deprecated positional `count()[0]`.
print(len(temp1))

# Distinct anomalies and distinct actions among those pairs.
temp1.nunique()

409


Anomaly             224
Suggested Action    360
dtype: int64

# Display Data as a Graph

## Build Graph

Get all data types

In [None]:

col = metrics_col_name


def getvalues(data=data, cols=[metrics_col_name]):
    """Return the `cols` columns of `data`, keeping only complete rows.

    Parameters
    ----------
    data : DataFrame
        Table to read from; defaults to the notebook-level `data`.
    cols : list of str
        Columns to keep. A row is dropped when any of these columns is missing.

    Returns
    -------
    DataFrame restricted to `cols`, without missing values.
    """
    selected = data[cols]
    return selected.dropna()

# Non-missing values of each column as plain lists (duplicates are kept).
metrics, anomalies, actions = (
    getvalues(data, [column_name])[column_name].tolist()
    for column_name in (metrics_col_name, anomalies_col_name, actions_col_name)
)

len(metrics)

588

Get only connected data

In [None]:
# Rows in which a metric, an anomaly and an action are all present.
connected = getvalues(
    data=data,
    cols=[metrics_col_name, anomalies_col_name, actions_col_name],
)

connected

Unnamed: 0,Metric,Anomaly,Suggested Action
71,ratio of the performance gain to the effort af...,"incorrect, inexact or overconstraining interpr...",active participation of project system enginee...
129,# defects,lack of communication between subteams,"repeat inspections, teach-the-team sessions du..."
130,# capability defects,nonfunctional areas of design not adequately a...,emphasize nonfunctional areas in succeeding ph...
131,# performance defects,nonfunctional areas of design not adequately a...,emphasize nonfunctional areas in succeeding ph...
132,# usability defects,nonfunctional areas of design not adequately a...,emphasize nonfunctional areas in succeeding ph...
...,...,...,...
800,# identified faults,developer forgets a goal in the middle of a se...,train developers to understand the requirement...
801,# slips,developer commits planning errors such that s/...,train developers to understand the requirement...
802,# lapses,cognitive bias among developers (people errors),train developers to understand the requirement...
803,# number of human mistakes,the team selects the inappropriate process for...,train developers to understand the requirement...


## Draw Graph

In [None]:
def multilayered_graph(cols, *subset_sizes):
    """Add the nodes and edges described by the columns `cols` to the global graph `G`.

    Rows of the notebook-level `data` that have a value in every column of
    `cols` are selected with `getvalues`. Every value becomes a node tagged with
    the layer/group of its column (metric = 0, anomaly = 1, action = 2), and
    every row links the values of consecutive columns with an edge.

    Parameters
    ----------
    cols : list of str
        Column names in the order in which they are chained together. Each one
        must be the metric, anomaly or action column name.
    *subset_sizes
        Not used by the function; accepted so that existing calls keep working.

    Returns
    -------
    The global graph `G`, which is modified in place.
    """
    layer_of = {metrics_col_name: 0, anomalies_col_name: 1, actions_col_name: 2}

    # Only complete rows are returned, so no per-cell missing-value check is needed below.
    connected = getvalues(data, cols)

    for position, column in enumerate(cols):
        G.add_nodes_from(
            connected.iloc[:, position],
            layer=layer_of[column],
            group=layer_of[column],
        )

    # Chain each row left to right; the has_edge guard keeps repeated rows from
    # adding the same edge again.
    for row in connected.itertuples(index=False, name=None):
        for source, target in zip(row, row[1:]):
            if not G.has_edge(source, target):
                G.add_edge(source, target)

    return G



import matplotlib.patches as mpatches

subset_sizes = [metrics, anomalies, actions]

# Report only the list sizes; printing the full lists floods the cell output.
print(len(metrics), len(anomalies), len(actions))

# Node colour per layer index: 0 = metric, 1 = anomaly, 2 = action.
subset_color = [
    "blue",
    "red",
    "green",
]

G = nx.Graph()

# Build the graph from every column combination, so that rows holding only two
# of the three values still contribute their nodes and edges.
for cols in (
    [metrics_col_name, anomalies_col_name, actions_col_name],
    [metrics_col_name, anomalies_col_name],
    [anomalies_col_name, actions_col_name],
    [metrics_col_name, actions_col_name],
):
    G = multilayered_graph(cols, *subset_sizes)

color = [subset_color[d["layer"]] for v, d in G.nodes(data=True) if 'layer' in d]

# Spring-layout distance parameter and a fixed seed, so that the layout (and
# therefore the saved figure) is the same on every run.
k = 0.06
layout_seed = 42
layout = nx.spring_layout(G, k=k, seed=layout_seed)

blue_patch = mpatches.Patch(color='blue', label='Metrics')
red_patch = mpatches.Patch(color='red', label='Anomalies')
green_patch = mpatches.Patch(color='green', label='Actions')

plt.figure(figsize=(100, 100))

plt.legend(handles=[blue_patch, red_patch, green_patch], prop={'size': 80})

nx.draw(G, pos=layout, node_color=color, with_labels=False, node_size=600)

plt.savefig("Graph.svg", format="SVG", bbox_inches='tight')

plt.show()

Output hidden; open in https://colab.research.google.com to view.

### Graph for LaTeX

In [None]:
graph = G

# Nodes may be plain identifiers or dict records carrying a 'label' key.
# The type is checked explicitly: `'label' in node` on a string node name would
# be a substring test and could select the dict branch by mistake.
nodes = [
    node['label'] if isinstance(node, dict) and 'label' in node else node
    for node in graph.nodes
]

# Edges may be (source, target) tuples or dict records with 'from' / 'to' keys.
edges = [
    (edge['from'], edge['to'])
    if isinstance(edge, dict) and 'from' in edge
    else (edge[0], edge[1])
    for edge in graph.edges
]

# Group index of every node, in node order; it selects the node colour below.
node_colors = list(dict(G.nodes(data='group')).values())

style = {
    'node_opacity': .5,
    'edge_curved': .1,
    'vertex_size': .01,
    'vertex_label': {},
    'node_color': [subset_color[g] for g in node_colors],
}

plot((nodes, edges), 'network.tex', **style)

### Show graph in HTML

In [None]:
def show_graph(graph, w = '500px', h = '500px', title="sample.html"):
    """Render `graph` with pyvis, write it to the file `title` and display it.

    Parameters
    ----------
    graph : networkx graph
        Graph to convert with `Network.from_nx`.
    w, h : str
        Width and height of the rendered network (CSS sizes such as '500px').
    title : str
        Name of the HTML file that pyvis writes; the same value is then handed
        to IPython's `HTML` display object.
    """
    # pyvis takes height as its first positional parameter, so both are passed
    # by keyword to avoid swapping width and height.
    net = Network(height=h, width=w)
    net.from_nx(graph)
    net.show(title)

    display(HTML(title))


show_graph(graph, title="index.html")