In [14]:
import random
import math
import numpy as np
import pandas as pd
import os

from IPython.display import display
from sklearn.utils import resample
from sklearn import linear_model
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import NearestNeighbors

In [2]:
def get_data():
    path = os.getcwd()+'\\data'
    name = os.listdir(path)
    l = []
    for i in name:
        if '.csv' in i:
            l.append(pd.read_csv(path+'\\'+i,engine='python'))
        else:
            new_path = path+'\\'+i
            output = []
            for j in os.listdir(new_path):
                data = pd.read_csv(new_path+'\\'+j,engine='python')
                output.append(data)
            output = pd.concat(output,axis=0)
            l.append(output)
    return name,l

In [3]:
def smote(modules_input, ratio):
    modules, char, bug = seperateData(modules_input, -1)
    normal_modules, normal_char = seperateData(modules_input, 1)
    
    n = round((ratio*normal_modules.shape[0]-modules.shape[0])/modules.shape[0])
    k = 5
    
    if n<=0:
        return modules_input
    
    # 训练模型，取邻近的k个点（可修改邻近点数）
    neigh = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', n_jobs=-1)
    neigh.fit(char)
    index = neigh.kneighbors(n_neighbors=k,return_distance=False)
    # result结果为narray类型的索引矩阵
    a, b = index.shape

    # 此处的用法详见书P83
    axis0, axis1 = np.ogrid[:a, :b]
    sort_axis = np.zeros(b,dtype=int)
    for i in range(a):
        temp = np.arange(b)
        # 从k个邻近中随机抽取n个邻近
        np.random.shuffle(temp)
        sort_axis = np.vstack((sort_axis,temp))
    # index_rand就是最终过采样得到矩阵的 下标
    sort_axis = sort_axis[1:]
    index_rand = index[axis0, sort_axis]

    flag = 0
    new_list = []
    for i in range(a):
        for j in range(n):
            p = index_rand[i][np.random.randint(0,k)]

#             p = index_rand[i][j]
            # 计算新的模块的各项特征
            new = char.iloc[i]+(char.iloc[p]-char.iloc[i])*np.random.rand()
            #计算原两个模块与新模块之间的欧氏距离
            d1 = np.linalg.norm(new-char.iloc[i])
            d2 = np.linalg.norm(new-char.iloc[p])
            if d1 == 0 and d2 == 0:
                break
            # 计算新模块的缺陷个数
            bug_new = (d2*modules.iloc[i].loc['bug']+d1*modules.iloc[p].loc['bug'])/(d1+d2)
            bug_new = float(round(bug_new))
            # 将新模块的各项特征和缺陷个数合并
            new['bug'] = bug_new
            new_list.append(new)
            flag += 1
    # 将缺陷模块数据集和正常模块数据集合并

    modules = pd.concat([modules,pd.concat(new_list,axis=1).T],axis=0)
    # modules_new的样式分为三部分，最上面时旧的缺陷数据集，中间时新合成的缺陷数据集，下面时正常数据集
    modules_new = pd.concat([modules, normal_modules], axis=0)
#     modules_new = modules_new.dropna(axis=0)
    
    return modules_new

In [4]:
def FPA(testBug, testPre):
    K = len(testBug)
    N = np.sum(testBug)
    
    sort_axis = np.argsort(testPre)
    testBug=np.array(testBug)
    testBug = testBug[sort_axis]
    P = sum(np.sum(testBug[m:])/N for m in range(K+1))/K
    return P

In [5]:
def RegressionModel(train_data_X, train_data_y, test_data_X):
    dtr = DecisionTreeRegressor().fit(train_data_X, train_data_y)
    lr = linear_model.LinearRegression().fit(train_data_X, train_data_y)
    bayes = BayesianRidge().fit(train_data_X, train_data_y)
    return dtr.predict(test_data_X),lr.predict(test_data_X),bayes.predict(test_data_X)

In [84]:
def bootstrap_resample(original_data):
    l = original_data.shape[0]
    original_data = original_data.set_index(np.arange(0,l))
#     index = np.random.randint(0,l,l)
    index0 = np.arange(0,l)
    index = resample(index0)
    train_data = original_data.iloc[index]
    test_data = original_data.drop(original_data.index[np.unique(index)])
    return train_data, test_data

In [85]:
def seperateData(modules, Type):
    if Type==-1:
        rare_modules = modules[modules.bug!=0]
        rare_char = rare_modules.iloc[:, :-1]
        rare_bug = rare_modules.iloc[:, -1]
        return rare_modules, rare_char, rare_bug
    elif Type==1:
        normal_modules = modules[modules.bug==0]    
        normal_char = normal_modules.iloc[:, :-1]
        return normal_modules, normal_char
    else:
        char = modules.iloc[:, :-1]
        bug = modules.iloc[:, -1]
        return char, bug

In [86]:
def NoSmote_NoBagging(dataset):
    train_data, test_data = bootstrap_resample(dataset)
    train_data_X, train_data_y = seperateData(train_data, 0)
    test_data_X, test_data_y = seperateData(test_data, 0)
    result = []
    for i in RegressionModel(train_data_X, train_data_y, test_data_X):
        result.append(FPA(test_data_y,i))
        reg = ['dtr','lr','bayes']
#     for (i,j) in zip(reg, output):
#         print('{:<6}: {:>8.4f}'.format(i,j))   
    return result

In [96]:
def NoSmote_Bagging(dataset):
    train_data, test_data = bootstrap_resample(dataset)
    train_data_X, train_data_y = seperateData(train_data, 0)
    test_data_X, test_data_y = seperateData(test_data, 0)
    result_sum = []
    for i in range(20):
        result = []
        for i in RegressionModel(train_data_X, train_data_y, test_data_X):
            result.append(FPA(test_data_y,i))
        result_sum.append(result)
    output = mean(list(zip(*result_sum)))
    reg = ['dtr','lr','bayes']
#     for (i,j) in zip(reg, output):
#         print('{:<6}: {:>8.4f}'.format(i,j))   
#     print('NoSmote:{}'.format(type(output)))
    return output

In [95]:
def Smote_NoBagging(dataset):
    train_data, test_data = bootstrap_resample(dataset)
    train_data = smote(train_data,1)
    train_data_X, train_data_y = seperateData(train_data, 0)
    test_data_X, test_data_y = seperateData(test_data, 0)
    result = []
    for i in RegressionModel(train_data_X, train_data_y, test_data_X):
        result.append(FPA(test_data_y,i))
    reg = ['dtr','lr','bayes']
#     for (i,j) in zip(reg, output):
#         print('{:<6}: {:>8.4f}'.format(i,j))   
    return result

In [89]:
def Smote_Bagging(dataset):
    train_data, test_data = bootstrap_resample(dataset)
    test_data_X, test_data_y = seperateData(test_data, 0)
    result_sum = []
    for i in range(20):
        train_data = smote(train_data,1)
        train_data_X, train_data_y = seperateData(train_data, 0)
        result = []
        for i in RegressionModel(train_data_X, train_data_y, test_data_X):
            result.append(FPA(test_data_y,i))
        result_sum.append(result)
    output = mean(list(zip(*result_sum)))
    reg = ['dtr','lr','bayes']
#     for (i,j) in zip(reg, output):
#         print('{:<6}: {:>8.4f}'.format(i,j))   
    return output

In [90]:
def mean(l):
    return list(sum(list(x))/len(x) for x in l)

In [91]:
SMOTERE1 = ['NoSmote_NoBagging','NoSmote_Bagging','Smote_NoBagging','Smote_Bagging']
def SMOTERE(n,dataset):
    if n==0:
        return NoSmote_NoBagging(dataset)
    elif n==1:
        return NoSmote_Bagging(dataset)
    elif n==2:
        return Smote_NoBagging(dataset)
    else:
        return Smote_Bagging(dataset)

In [97]:
colums = ['dataset','RAW','Bagging','SMOTE','SMOTE_Bagging']
pd.DataFrame(result_DTR,colums=columns=)

[0.70207399103138846, 0.80834626882833061, 0.80935236288375589]
[0.63090446946932344, 0.78965487061294726, 0.80097441433030914]
[0.60181821568956428, 0.80645441730782219, 0.81057148357209763]
[0.6562113787940651, 0.78260514495712785, 0.78826051449571277]
[0.68799709674599951, 0.74050077331438346, 0.74138998985381144]
[0.68389925575206234, 0.74668685884045816, 0.73834888362809359]
[0.66529154733054674, 0.71737350370859798, 0.71540537563339957]
[0.67478243487092926, 0.70910996574713347, 0.7109944136492794]
[0.84415584415584455, 0.89610389610389629, 0.88961038961038996]
[0.6795000000000001, 0.89000000000000024, 0.79500000000000004]
[0.80000000000000038, 0.82500000000000029, 0.87500000000000011]
[0.76400000000000012, 0.86400000000000055, 0.85600000000000009]
[0.77082856488899176, 0.76221675440604153, 0.78141451132982398]
[0.64927424108458598, 0.67197170645446525, 0.71577512525788289]
[0.72329059829059761, 0.73132183908046211, 0.73349543177129317]
[0.67390133925566975, 0.7853152971263192, 0

In [102]:
print(result_DTR)

[[0.70207399103138846, 0.63090446946932344, 0.60181821568956428, 0.6562113787940651], [0.68799709674599951, 0.68389925575206234, 0.66529154733054674, 0.67478243487092926], [0.84415584415584455, 0.6795000000000001, 0.80000000000000038, 0.76400000000000012], [0.77082856488899176, 0.64927424108458598, 0.72329059829059761, 0.67390133925566975], [0.70031555374592724, 0.78662472118719406, 0.77806404495521408, 0.77047167150225693], [0.67848101265822813, 0.66552982895285795, 0.64971287940935207, 0.72616509926854722], [0.62909486469142772, 0.64416466499253899, 0.61123654502494096, 0.6651806006493507], [0.66038781579851824, 0.66630033397025179, 0.67295360124864101, 0.67457500805336201], [0.60759916094584276, 0.61574773939253402, 0.60841694537346713, 0.68162571357656909], [0.54633896569380003, 0.49233706385780085, 0.63392592592592578, 0.64301639344262385], [0.56828249411236909, 0.61200401213116817, 0.66849978310454339, 0.6515971771576895], [0.55809016300011405, 0.52354372201751231, 0.533786323116

In [104]:
colums = ['RAW','Bagging','SMOTE','SMOTE_Bagging']
# pd.DataFrame(result_DTR,columns=colums).to_csv('DTR.csv')
# pd.DataFrame(result_LR,columns=colums).to_csv('LR.csv')
pd.DataFrame(result_BRR,columns=colums).to_csv('BRR.csv')