#### Import the libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pycaret.classification import *

In [4]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('../data/final/defects_smells.csv', index_col=0)
# Keep the 'Type' column aside: it is dropped from df further down and
# re-attached once the correlation filter has run.
df_type = df['Type']
# Display (rows, columns) as the cell output.
df.shape

(19085, 85)

#### Check for duplicate rows

In [3]:
# Rows that are exact copies of an earlier row (the first occurrence is not
# flagged). This cell only reports how many there are; df is left untouched.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
duplicate_rows.shape[0]

61

##### Drop the columns the model does not need: the identifier/location metadata, the method-level code smells, and every target other than the one being modeled. As written, this cell is set up for the Refused Bequest (`rb`) model, so `rb` is the only target column that is kept. To train a model for another smell, drop `rb` as well and keep the column of the smell you want to model instead.

In [4]:
# Identifier / source-location columns ('Type' is set aside in df_type when
# the data is loaded and re-attached after the correlation filter).
metadata_columns = [
    'LongName', 'Name', 'Parent', 'Component', 'Path',
    'Line', 'EndLine', 'Column', 'EndColumn', 'Type',
]

# Presumably the method-level code-smell flags mentioned in the section text
# above -- TODO confirm against the dataset description.
method_smell_columns = ['fe', 'dico', 'ic', 'lpl', 'mc', 'ss', 'lm']

# Targets that are NOT being modeled in this run.
# 'rb' (Refused Bequest) is deliberately absent: it is the target of this
# notebook. To model another smell, add 'rb' to this list and remove the
# column of the smell you want to predict.
other_target_columns = ['cdsbp', 'dacl', 'lc', 'sc', 'sg', 'gc', 'bug']

# One drop call with the explicit `columns=` keyword. The previous
# `df.drop(label, 1)` form passed `axis` positionally, which pandas deprecated
# in 1.3 and rejects with a TypeError from 2.0 onwards.
df = df.drop(
    columns=metadata_columns + method_smell_columns + other_target_columns
)

#### Check the correlations and remove the highly correlated features from the dataset

In [5]:
# Pairwise correlation of the numeric/boolean columns only. Selecting them
# explicitly keeps the matrix labels authoritative: positions in `corr` are
# always translated to names through `corr.columns`, never `df.columns`, so a
# non-numeric column in df can no longer shift or break the mask.
corr = df.select_dtypes(include=['number', 'bool']).corr()
threshold = 0.99

# columns[k] stays True while the k-th column of `corr` is kept.
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        # Only the upper triangle is scanned; of each highly correlated pair
        # the later column (j) is the one that gets discarded.
        if corr.iloc[i,j] >= threshold:
            print(corr.columns[i], corr.columns[j])
            columns[j] = False

# Names of the discarded columns, and the surviving columns in original order.
high_corr = set(corr.columns[~columns])
selected_columns = df.columns[~df.columns.isin(high_corr)]
# .copy() gives an independent frame rather than a selection of the old one.
df = df[selected_columns].copy()

LDC LLDC
CLOC TCLOC
NA TNA
NLPA TNLPA
NPA TNPA


In [6]:
# Re-attach the 'Type' column that was set aside when the data was loaded
# (values are aligned on the row index).
df = df.assign(Type=df_type)

##### Move the target to the end of the dataframe. Here, as an example, the target is the presence of the Refused Bequest (`rb`) smell.

In [7]:
# Pull the target values out, remove the column, then append it again so that
# 'rb' ends up as the last column of the frame.
results = df['rb'].values
df = df.drop(columns=['rb']).assign(rb=results)

#### Setup the model creation with pycaret

In [8]:
# Initialise the PyCaret experiment for the binary target 'rb'.
#
# session_id pins the random seed. Without it PyCaret draws a fresh seed on
# every run, so the recorded metrics cannot be reproduced. 3426 is the
# session id reported by the run whose outputs are stored in this notebook.
#
# The remaining options are unchanged: Boruta feature selection, removal of
# features correlated above 0.85, class re-balancing (fix_imbalance) and
# 10-fold cross-validation.
s = setup(data=df, target='rb',
          session_id=3426,
          feature_selection=True,
          feature_selection_method='boruta',
          remove_multicollinearity=True,
          multicollinearity_threshold=0.85,
          fix_imbalance=True, fold=10)

Unnamed: 0,Description,Value
0,session_id,3426
1,Target,rb
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(19085, 57)"
5,Missing Values,False
6,Numeric Features,51
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
# Rebind df to the experiment's 'X' object from PyCaret.
# NOTE(review): presumably the feature matrix after setup()'s preprocessing and
# feature selection -- confirm against the installed PyCaret version.
# The pre-setup frame is no longer reachable through `df` after this line.
df = get_config('X')

#### Save the selected features

In [11]:
# Column names of the experiment's 'X' frame, written one per line.
selected_features = get_config('X').columns.tolist()
features_text = "\n".join(selected_features)
with open('../../results/features.csv', 'w') as features_file:
    features_file.write(features_text)

##### Train the models. In this code the model was trained for the Refused Bequest smell. From all models, we selected the top-5 in terms of F1.

In [12]:
# Train and cross-validate PyCaret's candidate classifiers, rank them by F1
# and keep the five best (returned as a list because n_select > 1).
top5_models = compare_models(n_select=5, sort='f1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9725,0.9524,0.6348,0.903,0.7441,0.7301,0.7435,1.161
rf,Random Forest Classifier,0.9718,0.9473,0.6219,0.9047,0.7357,0.7214,0.7363,1.454
lightgbm,Light Gradient Boosting Machine,0.9616,0.9153,0.5476,0.7827,0.6426,0.6231,0.6351,0.375
dt,Decision Tree Classifier,0.9433,0.7923,0.6195,0.5482,0.5808,0.5505,0.5522,0.153
gbc,Gradient Boosting Classifier,0.9287,0.8561,0.4876,0.4459,0.4651,0.4271,0.4279,3.436
knn,K Neighbors Classifier,0.843,0.8413,0.7373,0.25,0.3731,0.3075,0.3671,0.673
ada,Ada Boost Classifier,0.8461,0.7806,0.51,0.209,0.2963,0.2266,0.2554,0.646
lr,Logistic Regression,0.6582,0.7324,0.6796,0.1187,0.202,0.1052,0.1708,3.369
ridge,Ridge Classifier,0.648,0.0,0.6832,0.1159,0.1981,0.1004,0.1659,0.065
lda,Linear Discriminant Analysis,0.6479,0.7326,0.6832,0.1159,0.1981,0.1003,0.1658,0.181


#### Tune the models

In [13]:
# Tune every shortlisted model with Optuna (30 trials each), optimising F1.
# choose_better=True keeps the untuned model whenever tuning does not improve
# the score, so each entry is at least as good as its input.
tuned_top5 = [
    tune_model(
        candidate,
        optimize="f1",
        n_iter=30,
        search_library='optuna',
        early_stopping=True,
        choose_better=True,
    )
    for candidate in top5_models
]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9783,0.9693,0.7412,0.9,0.8129,0.8015,0.8058
1,0.9716,0.9485,0.6706,0.8507,0.75,0.7351,0.741
2,0.9663,0.916,0.5529,0.8704,0.6763,0.6594,0.6784
3,0.9768,0.9302,0.6824,0.9355,0.7891,0.7772,0.788
4,0.9708,0.9158,0.6118,0.8966,0.7273,0.7124,0.727
5,0.9701,0.9143,0.5765,0.9245,0.7101,0.6953,0.7169
6,0.976,0.9315,0.6824,0.9206,0.7838,0.7714,0.7811
7,0.9805,0.9331,0.7294,0.9538,0.8267,0.8166,0.8248
8,0.9716,0.929,0.6118,0.9123,0.7324,0.718,0.734
9,0.976,0.9529,0.6548,0.9483,0.7746,0.7624,0.777


#### Ensemble the top-5 models

In [14]:
# Combine the five tuned models into a single ensemble, judged on F1.
# NOTE(review): with choose_better=True the returned object may be one of the
# input models rather than the blend if that scores higher -- confirm in the
# PyCaret documentation for the installed version.
blended_models = blend_models(
  tuned_top5,
  choose_better=True,
  optimize="f1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9783,0.9642,0.7294,0.9118,0.8105,0.7991,0.8047
1,0.9746,0.9511,0.6824,0.8923,0.7733,0.7601,0.7678
2,0.9656,0.9396,0.5529,0.8545,0.6714,0.6541,0.6714
3,0.979,0.9556,0.6824,0.9831,0.8056,0.7949,0.8097
4,0.9708,0.9465,0.6,0.9107,0.7234,0.7087,0.7259
5,0.9701,0.9378,0.5647,0.9412,0.7059,0.6911,0.7163
6,0.9768,0.9603,0.6824,0.9355,0.7891,0.7772,0.788
7,0.9783,0.9593,0.6706,0.9828,0.7972,0.7862,0.8022
8,0.9686,0.9551,0.5647,0.9057,0.6957,0.68,0.7012
9,0.973,0.9642,0.6071,0.9444,0.7391,0.7256,0.7454


In [15]:
# Score the selected model. No `data` argument is passed, so PyCaret presumably
# evaluates on the hold-out split created by setup() -- TODO confirm.
pred = predict_model(blended_models)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.9742,0.9524,0.6313,0.903,0.7431,0.7299,0.7429


In [16]:
# Capture the most recent PyCaret score grid (presumably the predict_model
# metrics shown just above) as a DataFrame so it can be saved.
result = pull()

#### Save the evaluation metrics of the ensemble model

In [17]:
# Persist the metrics table captured by pull() to disk.
result.to_csv('../../results/models.csv')

#### Finalize and save the ensemble model

In [18]:
# Produce the final version of the selected model.
# NOTE(review): finalize_model presumably refits on the full dataset including
# the hold-out split -- confirm against the PyCaret documentation.
final_best = finalize_model(blended_models)
# Save the preprocessing pipeline together with the model.
save_model(final_best, '../../results/rb/model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='rb',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy=...
                                             max_features=0.9779201704921807,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=3.746660669697568e-05,
                                        

#### Interpret the best tuned model

In [20]:
# Interpret the first tuned model (tuned_top5[0]), not the blended ensemble.
# save=True asks PyCaret to write the resulting plot to a file as well.
interpret_model(tuned_top5[0], save=True)
