In [7]:
import pandas as pd

# SpotBugs bug categories, spelled exactly as they appear in the study sheet's
# category column; the tuple order is the order used for every per-category report.
categories = (
    'Bad practice (BAD_PRACTICE)',
    'Correctness (CORRECTNESS)',
    'Experimental (EXPERIMENTAL)',
    'Internationalization (I18N)',
    'Malicious code vulnerability (MALICIOUS_CODE)',
    'Multithreaded correctness (MT_CORRECTNESS)',
    'Bogus random noise (NOISE)',
    'Performance (PERFORMANCE)',
    'Security (SECURITY)',
    'Dodgy code (STYLE)',
)

# Values the analysis looks for in the regex-feasibility column.
labels = (
    'No for regex limitation',
    'No for multi-line',
    'Yes but no time',
    'Yes and implemented',
)

def load_excel(path='Study_of_Spotbugs.xlsx', sheet_name='Study'):
    """Read the SpotBugs study sheet into a DataFrame with fixed column names.

    Parameters
    ----------
    path : str or path-like, optional
        Workbook to read. Defaults to ``'Study_of_Spotbugs.xlsx'`` in the
        current working directory, which is what the notebook used originally.
    sheet_name : str or int, optional
        Sheet inside the workbook. Defaults to ``'Study'``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``column_names`` below (17 in total), in that order.

    Raises
    ------
    Whatever ``pandas.read_excel`` raises, e.g. ``FileNotFoundError`` when
    ``path`` does not exist.
    """
    # Names assigned to the selected spreadsheet columns, in positional order.
    column_names = [
        'pattern', 'category', 'regex_feasibility',
        'inter-class',
        'class-import', 'class-signature', 'class-field', 'class-body',
        'method-signature', 'method-local', 'method-field', 'method-body',
        'statement',
        'type', 'annotation', 'java-version', 'techniques',
    ]
    # Spreadsheet columns 0..18 with positions 2 and 4 skipped -> 17 columns,
    # matching the 17 names above.
    column_positions = [i for i in range(19) if i not in (2, 4)]

    return pd.read_excel(
        path,
        sheet_name=sheet_name,
        usecols=column_positions,
        names=column_names,
    )

# Load the study sheet once at notebook level; the analysis cells read this `df`.
df = load_excel()

## 统计方法
### 方法1
```py
len(df[df['inter-class'] == 'Y'])
```
### 方法2
```py
inter_class_series = df.apply(lambda x: True if x['inter-class'] == 'Y' else False , axis=1)
print(len(inter_class_series[inter_class_series == True].index))
```

In [8]:
import json

# Regex feasibility in each category
# Nested mapping: category -> feasibility label -> number of rows carrying both values.
stat_dic = {
    cat: {
        label: int(((df['category'] == cat) & (df['regex_feasibility'] == label)).sum())
        for label in labels
    }
    for cat in categories
}

print(json.dumps(stat_dic, indent=4))


print('--------- data for table 3 -------------')
# Implemented patterns summed over every category.
total = sum(per_label['Yes and implemented'] for per_label in stat_dic.values())
print(total)

# Categories reported individually; everything else is lumped into 'others'.
target = (
    'Bad practice (BAD_PRACTICE)',
    'Correctness (CORRECTNESS)',
    'Malicious code vulnerability (MALICIOUS_CODE)',
    'Multithreaded correctness (MT_CORRECTNESS)',
    'Dodgy code (STYLE)',
    'Performance (PERFORMANCE)',
)
ssum = 0
for cat in target:
    implemented_count = stat_dic[cat]['Yes and implemented']
    print(cat, implemented_count)
    ssum += implemented_count

# Remainder: implemented patterns in categories not listed in `target`.
print('others', total - ssum)

{
    "Bad practice (BAD_PRACTICE)": {
        "No for regex limitation": 16,
        "No for multi-line": 52,
        "Yes but no time": 0,
        "Yes and implemented": 23
    },
    "Correctness (CORRECTNESS)": {
        "No for regex limitation": 58,
        "No for multi-line": 50,
        "Yes but no time": 0,
        "Yes and implemented": 37
    },
    "Experimental (EXPERIMENTAL)": {
        "No for regex limitation": 0,
        "No for multi-line": 3,
        "Yes but no time": 0,
        "Yes and implemented": 0
    },
    "Internationalization (I18N)": {
        "No for regex limitation": 0,
        "No for multi-line": 2,
        "Yes but no time": 0,
        "Yes and implemented": 0
    },
    "Malicious code vulnerability (MALICIOUS_CODE)": {
        "No for regex limitation": 4,
        "No for multi-line": 12,
        "Yes but no time": 0,
        "Yes and implemented": 1
    },
    "Multithreaded correctness (MT_CORRECTNESS)": {
        "No for regex limitation": 4,


In [3]:
# Count the rows flagged 'Y' at each level. For the class and method levels,
# the "total" line counts rows flagged in at least one of the level's columns.

# Inter-class Level
print('inter-class', int((df['inter-class'] == 'Y').sum()))
print()

# Class Level, then Method Level
level_groups = (
    ('class total', ['class-import', 'class-signature', 'class-field', 'class-body']),
    ('method total', ['method-signature', 'method-local', 'method-field', 'method-body']),
)
for total_label, level_columns in level_groups:
    flagged_any = (df[level_columns] == 'Y').any(axis=1)
    print(total_label, int(flagged_any.sum()))
    for column in level_columns:
        print(column, int((df[column] == 'Y').sum()))
    print()

# Statement Level
print('statement', int((df['statement'] == 'Y').sum()))

inter-class 41

class total 79
class-import 4
class-signature 14
class-field 49
class-body 48

method total 181
method-signature 60
method-local 20
method-field 21
method-body 149

statement 137


In [16]:
# ------- number of implemented patterns ------------
print('------- number of implemented patterns ------------')
# Row mask reused by every count below: feasibility is 'Yes and implemented'.
is_implemented = df['regex_feasibility'] == 'Yes and implemented'

# Inter-class Level
inter_total = int(((df['inter-class'] == 'Y') & is_implemented).sum())
print('inter-class', inter_total)
print()

# Class Level: the total counts rows flagged in at least one class-* column.
class_columns = ['class-import', 'class-signature', 'class-field', 'class-body']
class_total = int(((df[class_columns] == 'Y').any(axis=1) & is_implemented).sum())
print('class total', class_total)
for column in class_columns:
    print(column, int(((df[column] == 'Y') & is_implemented).sum()))
print()

# Method Level: the total counts rows flagged in at least one method-* column.
method_columns = ['method-signature', 'method-local', 'method-field', 'method-body']
method_total = int(((df[method_columns] == 'Y').any(axis=1) & is_implemented).sum())
print('method total', method_total)
for column in method_columns:
    print(column, int(((df[column] == 'Y') & is_implemented).sum()))
print()

# Statement Level
statement_total = int(((df['statement'] == 'Y') & is_implemented).sum())
print('statement', statement_total)

# NOTE(review): the four level masks are independent, so a row flagged 'Y' at
# more than one level contributes once per level to this sum.
print('total', inter_total + class_total + method_total + statement_total)

------- number of implemented patterns ------------
inter-class 0

class total 13
class-import 0
class-signature 5
class-field 8
class-body 1

method total 34
method-signature 8
method-local 8
method-field 3
method-body 16

statement 42
total 89


In [10]:
# Techniques
# Flag columns: number of rows marked 'Y', each followed by a blank line.
for flag_column in ('type', 'annotation', 'java-version'):
    print(flag_column, int((df[flag_column] == 'Y').sum()))
    print()

# Free-text column: rows whose 'techniques' cell contains the technique name
# as a literal substring (missing cells count as no match).
techs = ('call graph', 'data flow', 'control flow', 'inheritance graph')
for t in techs:
    mentions_technique = df['techniques'].str.contains(t, regex=False, na=False)
    print(t, int(mentions_technique.sum()))
    print()
    print()

type 194

annotation 22

java-version 5

call graph 14

data flow 41

control flow 41

inheritance graph 78



In [11]:
# ------- number of patterns in each category
# Collect (category, row count) pairs first, then report them and their sum.
counts_by_category = [
    (cat, int((df['category'] == cat).sum()))
    for cat in categories
]
for cat, num in counts_by_category:
    print(cat, num)

total = sum(num for _, num in counts_by_category)
print(total)


Bad practice (BAD_PRACTICE) 91
Correctness (CORRECTNESS) 148
Experimental (EXPERIMENTAL) 9
Internationalization (I18N) 2
Malicious code vulnerability (MALICIOUS_CODE) 17
Multithreaded correctness (MT_CORRECTNESS) 46
Bogus random noise (NOISE) 4
Performance (PERFORMANCE) 37
Security (SECURITY) 11
Dodgy code (STYLE) 86
451
