# **COMPARISON OF NORMALIZATION AND STANDARDIZATION TECHNIQUES**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Code smell severity/merged dataset_FE_LM_GC_DC.csv',
                    sep=',', encoding='iso-8859-1')

number_class = 13

# **DATA PREPROCESSING**

## **Transforming nominal categorical variables into ordinal categorical variabless**

In [27]:
df2 = pd.DataFrame.copy(df)

In [28]:
df2['modifier_type'].replace('abstract', 0.0, inplace=True)
df2['modifier_type'].replace('final', 1.0, inplace=True)
df2['modifier_type'].replace('other', 2.0, inplace=True)
df2['visibility_type'].replace('public', 0.0, inplace=True)
df2['visibility_type'].replace('private', 1.0, inplace=True)
df2['visibility_type'].replace('protected', 2.0, inplace=True)
df2['visibility_type'].replace('package', 3.0, inplace=True)

## **Predictor and Target Attributes**

In [29]:
predictors = df2.iloc[:, 8:92].values

In [30]:
target = df2.iloc[:, 7].values

## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [31]:
from sklearn.preprocessing import StandardScaler
predictors_stand = StandardScaler().fit_transform(predictors)

In [32]:
from sklearn.preprocessing import Normalizer
predictors_norm = Normalizer().fit_transform(predictors)

# **NAIVE BAYES**

:https://scikit-learn.org/stable/modules/naive_bayes.html

## **sklearn.naive_bayes.BernoulliNB**
Naive Bayes classifier for multivariate Bernoulli models.

Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.DistanceMetric.html

In [41]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

from sklearn.naive_bayes import BernoulliNB
naive = BernoulliNB(force_alpha=True)
naive.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_naive = naive.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_naive) * 100.0))

Acurácia: 60.97%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [42]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=1, C=1)
svm.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_svm = svm.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_svm) * 100.0))

Accuracy: 67.66%


# **LOGISTIC REGRESSION**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTICS REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [46]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

from sklearn.linear_model import LogisticRegression

logistica = LogisticRegression(random_state=1, max_iter=2000, penalty="l2", tol=0.0001,
                               multi_class="multinomial", C=1,solver="saga")
logistica.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_logistic = logistica.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_logistic) * 100.0))

Accuracy: 72.49%


# **K-NEAREST NEIGHBORS(KNN)**

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [49]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
knn.fit(x_train, y_train)


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_knn = knn.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_knn) * 100.0))

Accuracy: 67.29%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

In [50]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 0)

In [51]:
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    min_samples_split=[2,3,4,5],
    min_samples_leaf=[1,2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

# Configuring the search with Grid search
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Configuring the best hyperparameters
grid_search.fit(x_train, y_train)

# The best hyperparameters
print(grid_search.best_params_)

{'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [52]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=8, min_samples_leaf= 4, min_samples_split = 2)
tree.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_tree = tree.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_tree) * 100.0))


Accuracy: 73.23%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [53]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    n_estimators=[50,100,150,200,250],
    min_samples_split=[2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

# Configuring the search with Grid search
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Configuring the best hyperparameters
grid_search.fit(x_train, y_train)

# The best hyperparameters
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150}


In [56]:
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_split=5, n_estimators=150)
random.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_random) * 100.0))

Accuracy: 77.70%


# **XGBOOST**

https://xgboost.readthedocs.io/en/stable/

In [58]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = XGBClassifier(objective='multi:softprob', num_class=13, random_state=3)

param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1,2,3],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5]
    )

# Configuring the search with Grid search
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Configuring the best hyperparameters
grid_search.fit(x_train, y_train)

# The best hyperparameters
print(grid_search.best_params_)

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150}


In [70]:
xg = XGBClassifier(learning_rate=0.05, max_depth=3, n_estimators=150, objective='multi:softprob', num_class=13, random_state=3)
xg.fit(x_train,y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_xg) * 100.0))

print(classification_report(y_test, predictors_xg))

confusion_matrix(y_test, predictors_xg)

Accuracy: 78.81%
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92       129
         1.0       1.00      0.33      0.50         6
         2.0       0.63      0.67      0.65        18
         3.0       0.80      1.00      0.89         4
         4.0       0.00      0.00      0.00         2
         5.0       0.67      0.70      0.68        23
         6.0       0.40      1.00      0.57         4
         7.0       0.60      0.38      0.46         8
         8.0       0.70      0.76      0.73        25
         9.0       0.62      0.67      0.64        12
        10.0       0.00      0.00      0.00         0
        11.0       0.33      0.10      0.15        10
        12.0       0.76      0.93      0.84        28

    accuracy                           0.79       269
   macro avg       0.57      0.57      0.54       269
weighted avg       0.79      0.79      0.78       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[117,   0,   2,   0,   0,   2,   2,   2,   1,   1,   0,   0,   2],
       [  3,   2,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  12,   0,   0,   4,   2,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   4,   1,   0,  16,   2,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   4,   0,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   3,   3,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   0,  19,   4,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   4,   8,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   6],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,  26]])

### **Cross-Validation**

In [62]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [63]:
# Separating data into folds
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

In [65]:
# Creating the model
model = XGBClassifier(learning_rate=0.05, max_depth=3, n_estimators=150, objective='multi:softprob', num_class=13, random_state=3)
result = cross_val_score(model, predictors, target, cv = kfold)
result


array([0.73333333, 0.73333333, 0.77777778, 0.84444444, 0.78651685,
       0.87640449, 0.78651685, 0.79775281, 0.86516854, 0.86516854])

In [66]:
# We use the mean accuracy and standard deviation
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 80.66%
Standard Deviation: 5.06%


# **CATBOOST**

https://catboost.ai/en/docs/

In [67]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = CatBoostClassifier(task_type='CPU', random_state = 5)

param_grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.1, 0.2, 0.5],
    depth=[4,5,6,7],
    )

# Configuring the search with Grid search
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Configuring the best hyperparameters
grid_search.fit(x_train, y_train)

# The best hyperparameters
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
51:	learn: 0.7343989	total: 19.8s	remaining: 37.3s
52:	learn: 0.7204733	total: 20.1s	remaining: 36.8s
53:	learn: 0.7106934	total: 20.4s	remaining: 36.3s
54:	learn: 0.7021731	total: 20.8s	remaining: 35.9s
55:	learn: 0.6939378	total: 21.1s	remaining: 35.5s
56:	learn: 0.6839873	total: 21.5s	remaining: 35s
57:	learn: 0.6727083	total: 21.8s	remaining: 34.6s
58:	learn: 0.6623514	total: 22.1s	remaining: 34.1s
59:	learn: 0.6532110	total: 22.5s	remaining: 33.7s
60:	learn: 0.6456102	total: 22.8s	remaining: 33.3s
61:	learn: 0.6398739	total: 23.1s	remaining: 32.8s
62:	learn: 0.6343918	total: 23.5s	remaining: 32.4s
63:	learn: 0.6259860	total: 23.8s	remaining: 32s
64:	learn: 0.6168476	total: 24.1s	remaining: 31.6s
65:	learn: 0.6101741	total: 24.5s	remaining: 31.1s
66:	learn: 0.6021500	total: 24.8s	remaining: 30.7s
67:	learn: 0.5934971	total: 25.1s	remaining: 30.3s
68:	learn: 0.5868928	total: 25.4s	remaining: 29.9s
69:	learn: 0.5792601	

In [69]:
from catboost import CatBoostClassifier
catboost = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.2, depth = 5, random_state = 5, eval_metric="Accuracy")
catboost.fit( x_train, y_train, plot=True, eval_set=(x_test, y_test))

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_cat) * 100.0))

print(classification_report(y_test, predictors_cat))

confusion_matrix(y_test, predictors_cat)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6000000	test: 0.6133829	best: 0.6133829 (0)	total: 116ms	remaining: 23.1s
1:	learn: 0.6128000	test: 0.6133829	best: 0.6133829 (0)	total: 217ms	remaining: 21.5s
2:	learn: 0.6352000	test: 0.6245353	best: 0.6245353 (2)	total: 304ms	remaining: 20s
3:	learn: 0.6480000	test: 0.6245353	best: 0.6245353 (2)	total: 375ms	remaining: 18.4s
4:	learn: 0.6752000	test: 0.6691450	best: 0.6691450 (4)	total: 447ms	remaining: 17.4s
5:	learn: 0.6832000	test: 0.6542751	best: 0.6691450 (4)	total: 525ms	remaining: 17s
6:	learn: 0.6960000	test: 0.6617100	best: 0.6691450 (4)	total: 598ms	remaining: 16.5s
7:	learn: 0.7264000	test: 0.6877323	best: 0.6877323 (7)	total: 767ms	remaining: 18.4s
8:	learn: 0.7360000	test: 0.7100372	best: 0.7100372 (8)	total: 848ms	remaining: 18s
9:	learn: 0.7392000	test: 0.6988848	best: 0.7100372 (8)	total: 913ms	remaining: 17.3s
10:	learn: 0.7520000	test: 0.7174721	best: 0.7174721 (10)	total: 982ms	remaining: 16.9s
11:	learn: 0.7664000	test: 0.7174721	best: 0.7174721 (10)	

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[115,   0,   2,   0,   0,   4,   0,   1,   1,   2,   2,   2],
       [  2,   0,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  14,   0,   0,   3,   1,   0,   0,   0,   0,   0],
       [  0,   0,   1,   3,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0],
       [  1,   0,   5,   1,   0,  16,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   4,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   3,   3,   0,   0,   0],
       [  3,   0,   0,   0,   0,   0,   0,   0,  20,   2,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   2,  10,   0,   0],
       [  3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,   4],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,  27]])

### **Cross-Validation**

In [71]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [72]:
# Separating data into folds
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

In [73]:
# Creating the model
model = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.2, depth = 5, random_state = 5, eval_metric="Accuracy")
result = cross_val_score(model, predictors, target, cv = kfold)

# We use the mean accuracy and standard deviation
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation:: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.6380597	total: 193ms	remaining: 38.4s
1:	learn: 0.6492537	total: 288ms	remaining: 28.5s
2:	learn: 0.7201493	total: 368ms	remaining: 24.2s
3:	learn: 0.7164179	total: 448ms	remaining: 21.9s
4:	learn: 0.7201493	total: 546ms	remaining: 21.3s
5:	learn: 0.7313433	total: 639ms	remaining: 20.6s
6:	learn: 0.7375622	total: 705ms	remaining: 19.4s
7:	learn: 0.7338308	total: 865ms	remaining: 20.8s
8:	learn: 0.7412935	total: 972ms	remaining: 20.6s
9:	learn: 0.7500000	total: 1.08s	remaining: 20.5s
10:	learn: 0.7599502	total: 1.2s	remaining: 20.7s
11:	learn: 0.7699005	total: 1.34s	remaining: 21s
12:	learn: 0.7736318	total: 1.45s	remaining: 20.8s
13:	learn: 0.7835821	total: 1.56s	remaining: 20.7s
14:	learn: 0.7848259	total: 1.71s	remaining: 21s
15:	learn: 0.7898010	total: 1.78s	remaining: 20.5s
16:	learn: 0.8009950	total: 1.86s	remaining: 20s
17:	learn: 0.8022388	total: 1.94s	remaining: 19.6s
18:	learn: 0.8097015	total: 2.02s	remaining: 19.3s
19:	learn: 0.8121891	total: 2.08s	remaining: 18.