In [1]:
import pandas as pd

# Labels applied, in file order, to the 17 CSV columns that load_data() keeps.
new_names = (
    'pattern', 'category', 'regex_feasibility',
    'inter-class', 'class-import', 'class-signature',
    'class-field', 'class-body', 'method-signature', 'method-local',
    'method-field', 'method-body', 'statement', 'type', 'annotation',
    'java-version', 'techniques',
)

def load_data(path='Study_of_Spotbugs.csv'):
    """Read the study CSV into a DataFrame whose columns are ``new_names``.

    Parameters
    ----------
    path : str, path-like or file-like, default 'Study_of_Spotbugs.csv'
        Source handed unchanged to ``pandas.read_csv``. The default keeps the
        original zero-argument call ``load_data()`` working as before.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``new_names``; the file's own header row is
        consumed and not returned as data.
    """
    # Of the first 19 CSV columns, positions 2 and 4 are skipped; the 17 that
    # remain line up one-to-one with new_names.
    kept_positions = [i for i in range(19) if i not in (2, 4)]
    return pd.read_csv(
        path,
        header=0,         # row 0 is the file's own header: consume it ...
        names=new_names,  # ... and label the kept columns with new_names instead
        usecols=kept_positions,
    )

In [2]:
# Read the study CSV once via load_data(); the counting cells below all query this `df`.
df = load_data()

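## Counting methods
Two equivalent ways to count the rows whose `inter-class` column equals `'Y'`.
### Method 1
Filter the frame with a boolean mask and take the length of the result (this is the form used in the cells below):
```py
len(df[df['inter-class'] == 'Y'])
```
### Method 2
Build a per-row boolean Series with `apply`, then count its `True` entries:
```py
inter_class_series = df.apply(lambda x: True if x['inter-class'] == 'Y' else False , axis=1)
print(len(inter_class_series[inter_class_series == True].index))
```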

In [3]:
# Per-level counts: a row counts for a column when that cell is exactly 'Y'.

def _rows_flagged(column):
    """Count the rows of the notebook-level ``df`` whose ``column`` cell equals 'Y'."""
    return int((df[column] == 'Y').sum())


def _rows_flagged_in_any(columns):
    """Count the rows of ``df`` that hold 'Y' in at least one of ``columns``."""
    return int(df[list(columns)].eq('Y').any(axis=1).sum())


# Inter-class Level
print('inter-class', _rows_flagged('inter-class'))
print()

# Class Level, then Method Level: a "total" line (rows flagged in any of the
# group's columns) followed by one line per column and a blank separator.
_level_groups = (
    ('class total',
     ('class-import', 'class-signature', 'class-field', 'class-body')),
    ('method total',
     ('method-signature', 'method-local', 'method-field', 'method-body')),
)
for _total_label, _group_columns in _level_groups:
    print(_total_label, _rows_flagged_in_any(_group_columns))
    for _column in _group_columns:
        print(_column, _rows_flagged(_column))
    print()

# Statement Level
print('statement', _rows_flagged('statement'))

inter-class 41

class total 79
class-import 4
class-signature 14
class-field 49
class-body 48

method total 181
method-signature 60
method-local 20
method-field 21
method-body 149

statement 137


In [4]:
# Techniques
# The three flag columns are counted the same way: rows whose cell is exactly 'Y'.
# Each count is followed by a blank line.
for flag_column in ('type', 'annotation', 'java-version'):
    print(flag_column, int((df[flag_column] == 'Y').sum()))
    print()

# 'techniques' is matched by literal substring (regex=False); rows with a
# missing value are treated as non-matching (na=False).
techs = ('call graph', 'data flow', 'control flow', 'inheritance graph')
for t in techs:
    mentions = df['techniques'].str.contains(t, regex=False, na=False)
    print(t, int(mentions.sum()))
    print()

type 194

annotation 22

java-version 5

call graph 15

data flow 42

control flow 42

inheritance graph 79



In [5]:
# Regex feasibility in each category
# For every category, print how many of its rows carry each feasibility label.
# When the per-label counts do not add up to the category's row count, print a
# diagnostic line with the difference.
categories = (
    'Bad practice (BAD_PRACTICE)',
    'Correctness (CORRECTNESS)',
    'Experimental (EXPERIMENTAL)',
    'Internationalization (I18N)',
    'Malicious code vulnerability (MALICIOUS_CODE)',
    'Multithreaded correctness (MT_CORRECTNESS)',
    'Bogus random noise (NOISE)',
    'Performance (PERFORMANCE)',
    'Security (SECURITY)',
    'Dodgy code (STYLE)',
)

labels = (
    'No for regex limitation',
    'No for multi-line',
    'Yes but no time',
    'Yes and implemented',
)

total_cat_sum = 0
total_label_sum = 0
for cat in categories:
    print(cat)

    # Slice the feasibility column down to this category once, then count on the slice.
    feasibility = df.loc[df['category'] == cat, 'regex_feasibility']
    per_label = [int((feasibility == label).sum()) for label in labels]
    for label, number in zip(labels, per_label):
        print('\t' + label,  number)

    label_sum = sum(per_label)
    cat_sum = len(feasibility)
    total_cat_sum += cat_sum
    total_label_sum += label_sum
    if cat_sum != label_sum:
        print(f'\t------ category sum = {cat_sum}\tlabel_sum = {label_sum}\tdiff = {cat_sum - label_sum} ------')

print('total cat sum', total_cat_sum, 'total label sum', total_label_sum)

Bad practice (BAD_PRACTICE)
	No for regex limitation 16
	No for multi-line 51
	Yes but no time 2
	Yes and implemented 22
Correctness (CORRECTNESS)
	No for regex limitation 57
	No for multi-line 51
	Yes but no time 6
	Yes and implemented 31
	------ category sum = 148	label_sum = 145	diff = 3 ------
Experimental (EXPERIMENTAL)
	No for regex limitation 0
	No for multi-line 3
	Yes but no time 0
	Yes and implemented 0
	------ category sum = 9	label_sum = 3	diff = 6 ------
Internationalization (I18N)
	No for regex limitation 0
	No for multi-line 2
	Yes but no time 0
	Yes and implemented 0
Malicious code vulnerability (MALICIOUS_CODE)
	No for regex limitation 4
	No for multi-line 12
	Yes but no time 0
	Yes and implemented 1
Multithreaded correctness (MT_CORRECTNESS)
	No for regex limitation 4
	No for multi-line 37
	Yes but no time 3
	Yes and implemented 2
Bogus random noise (NOISE)
	No for regex limitation 0
	No for multi-line 0
	Yes but no time 0
	Yes and implemented 0
	------ category sum =