# **COMPARISON OF NORMALIZATION AND STANDARDIZATION TECHNIQUES**

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from
# this mount point further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Read the merged dataset (comma-separated, ISO-8859-1 encoded) from Drive.
dataset_path = '/content/drive/MyDrive/Code-smell-severity-classification-main/merged dataset_FE_LM_GC_DC.csv'
df = pd.read_csv(dataset_path, sep=',', encoding='iso-8859-1')

# Presumably the number of target classes (same value as the num_class=13
# passed to XGBoost later in the notebook) -- TODO confirm.
number_class = 13

# **DATA PREPROCESSING**

## **Transforming nominal categorical variables into ordinal categorical variables**

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df2 = df.copy()

In [None]:
# Map each textual category to a numeric code.
#
# The result is assigned back to the column instead of calling
# `df2[col].replace(..., inplace=True)`: an inplace method on a selected column
# is chained assignment, which triggers a FutureWarning in pandas 2.2 and no
# longer modifies `df2` once Copy-on-Write is active.
ordinal_codes = {
    'modifier_type': {'abstract': 0.0, 'final': 1.0, 'other': 2.0},
    'visibility_type': {'public': 0.0, 'private': 1.0, 'protected': 2.0, 'package': 3.0},
}

for column_name, codes in ordinal_codes.items():
    # Values that are not keys of `codes` are left unchanged by replace().
    # infer_objects() turns the resulting object column into a numeric dtype
    # when every value was mapped.
    df2[column_name] = df2[column_name].replace(codes).infer_objects()

## **Predictor and Target Attributes**

In [None]:
# Feature matrix: the columns at positions 8 to 91 of the encoded frame, as an array.
feature_columns = slice(8, 92)
predictors = df2.iloc[:, feature_columns].values

In [None]:
# Target vector: the column at position 7 of the encoded frame, as an array.
target_column = 7
target = df2.iloc[:, target_column].values

## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the complete feature matrix, i.e. before
# any train/test split, so held-out rows contribute to the mean and variance.
standard_scaler = StandardScaler()
predictors_stand = standard_scaler.fit_transform(predictors)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: rescale every feature column to the [0, 1] range using
# that column's minimum and maximum, as described in the text above.
# The previous `Normalizer` does something different: it rescales each *row*
# (sample) to unit norm and does not use per-feature minima/maxima at all.
predictors_norm = MinMaxScaler().fit_transform(predictors)

# **NAIVE BAYES**

https://scikit-learn.org/stable/modules/naive_bayes.html

## **sklearn.naive_bayes.BernoulliNB**
Naive Bayes classifier for multivariate Bernoulli models.

Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Hold out 30% of the unscaled data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

# fit() returns the estimator itself, so `naive` is the trained classifier.
naive = BernoulliNB(force_alpha=True).fit(x_train, y_train)

# Accuracy on the held-out split.
predictions_naive = naive.predict(x_test)
naive_accuracy = accuracy_score(y_test, predictions_naive)
print("Accuracy: %.2f%%" % (naive_accuracy * 100.0))

Accuracy: 54.65%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 30% of the standardized data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_stand, target, test_size=0.3, random_state=0
)

# RBF-kernel support vector classifier with regularization strength C=1.
svm = SVC(C=1, kernel='rbf', random_state=1)
svm.fit(x_train, y_train)

# Accuracy on the held-out split.
predictions_svm = svm.predict(x_test)
svm_accuracy = accuracy_score(y_test, predictions_svm)
print("Accuracy: %.2f%%" % (svm_accuracy * 100.0))

Accuracy: 68.77%


# **LOGISTIC REGRESSION**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTIC REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_stand, target, test_size = 0.3, random_state = 0)

from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression trained with the 'saga' solver.
# `multi_class="multinomial"` is intentionally not passed: the parameter is
# deprecated in recent scikit-learn releases, and with its default setting a
# non-liblinear solver such as 'saga' already fits the multinomial
# (cross-entropy) model when there are more than two classes.
logistica = LogisticRegression(random_state=1, max_iter=10000, penalty="l2", tol=0.0001,
                               C=1, solver="saga")
logistica.fit(x_train, y_train)

# Accuracy on the held-out 30% split.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_logistic = logistica.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_logistic) * 100.0))

Accuracy: 71.38%


# **K-NEAREST NEIGHBORS (KNN)**

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the standardized data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_stand, target, test_size=0.3, random_state=0
)

# 5 nearest neighbours; Minkowski metric with p=1 (Manhattan distance).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
knn.fit(x_train, y_train)

# Accuracy on the held-out split.
predictions_knn = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, predictions_knn)
print("Accuracy: %.2f%%" % (knn_accuracy * 100.0))

Accuracy: 65.80%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 30% of the unscaled data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned below.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Candidate values for each tuned hyperparameter.
param_grid = {
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Exhaustive search over the grid, ranking candidates by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Run the search on the training split.
grid_search.fit(x_train, y_train)

# Report the best combination found.
print(grid_search.best_params_)

{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the hyperparameters printed by the grid search above.
tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
)
tree.fit(x_train, y_train)

# Accuracy on the held-out split.
predictions_tree = tree.predict(x_test)
tree_accuracy = accuracy_score(y_test, predictions_tree)
print("Accuracy: %.2f%%" % (tree_accuracy * 100.0))


Accuracy: 72.86%


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Same tree configuration as the single-split evaluation, scored on every fold
# of the standardized data.
model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
)
result = cross_val_score(model, predictors_stand, target, cv=kfold)

# Summarise the fold scores by their mean and standard deviation.
mean_accuracy = result.mean() * 100.0
std_accuracy = result.std() * 100.0
print("Mean Accuracy: %.2f%%" % mean_accuracy)
print("Standard Deviation: %.2f%%" % std_accuracy)

Mean Accuracy: 76.97%
Standard Deviation: 4.66%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the unscaled data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned below.
model = RandomForestClassifier(criterion='entropy', random_state=0)

# Candidate values for each tuned hyperparameter.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Exhaustive search over the grid, ranking candidates by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Run the search on the training split.
grid_search.fit(x_train, y_train)

# Report the best combination found.
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 50}


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest configured with the hyperparameters printed by the grid search
# (max_depth=10, min_samples_split=4, n_estimators=50). The cell previously used
# min_samples_split=2 and n_estimators=100, which matched neither the search
# result nor the model used in the cross-validation cell.
# NOTE(review): the variable name `random` shadows the standard-library module
# of the same name; it is kept because other cells may refer to it.
random = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_split=4, n_estimators=50)
random.fit(x_train, y_train)

# Accuracy on the held-out 30% split.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_random) * 100.0))

Accuracy: 79.55%


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Random forest with the grid-search hyperparameters, scored on every fold of
# the standardized data.
model = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=10,
    min_samples_split=4,
    n_estimators=50,
)
result = cross_val_score(model, predictors_stand, target, cv=kfold)

# Summarise the fold scores by their mean and standard deviation.
mean_accuracy = result.mean() * 100.0
std_accuracy = result.std() * 100.0
print("Mean Accuracy: %.2f%%" % mean_accuracy)
print("Standard Deviation: %.2f%%" % std_accuracy)

Mean Accuracy: 79.99%
Standard Deviation: 3.16%


# **XGBOOST**

https://xgboost.readthedocs.io/en/stable/

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Hold out 30% of the standardized data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_stand, target, test_size=0.3, random_state=0
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base multi-class XGBoost estimator whose hyperparameters are tuned below.
model = XGBClassifier(objective='multi:softprob', num_class=13, random_state=3)

# Candidate values for each tuned hyperparameter.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [1, 2, 3],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
}

# Exhaustive search over the grid, ranking candidates by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Run the search on the training split.
grid_search.fit(x_train, y_train)

# Report the best combination found.
print(grid_search.best_params_)

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}


In [None]:
# XGBoost model with the hyperparameters printed by the grid search above.
xg = XGBClassifier(learning_rate=0.2, max_depth=3, n_estimators=200, objective='multi:softprob', num_class=13, random_state=3)
xg.fit(x_train,y_train)

# Accuracy on the held-out 30% split.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_xg) * 100.0))

# zero_division=0 reports 0.00 for classes that have no predicted or no true
# samples in this split (the same values as before) without emitting an
# undefined-metric warning for each of them.
print(classification_report(y_test, predictions_xg, zero_division=0))

# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_test, predictions_xg)

Accuracy: 79.18%
              precision    recall  f1-score   support

         0.0       0.92      0.91      0.91       129
         1.0       0.50      0.17      0.25         6
         2.0       0.50      0.50      0.50        12
         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         2
         5.0       0.73      0.76      0.75        29
         6.0       0.54      1.00      0.70         7
         7.0       0.60      0.38      0.46         8
         8.0       0.76      0.76      0.76        25
         9.0       0.69      0.75      0.72        12
        10.0       0.00      0.00      0.00         0
        11.0       0.60      0.30      0.40        10
        12.0       0.76      0.93      0.84        28

    accuracy                           0.79       269
   macro avg       0.51      0.50      0.48       269
weighted avg       0.79      0.79      0.78       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[117,   0,   1,   1,   0,   3,   1,   2,   0,   0,   1,   0,   3],
       [  2,   1,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   6,   0,   0,   3,   2,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0],
       [  2,   0,   2,   0,   0,  22,   3,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   7,   0,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   3,   3,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   0,  19,   4,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   3,   9,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   3,   5],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,  26]])

### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Split the data into 10 shuffled folds (fixed seed for repeatability).
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=5,
)

In [None]:
# XGBoost with the grid-search hyperparameters, scored on every fold of the
# standardized data.
model = XGBClassifier(
    objective='multi:softprob',
    num_class=13,
    random_state=3,
    learning_rate=0.2,
    max_depth=3,
    n_estimators=200,
)
result = cross_val_score(model, predictors_stand, target, cv=kfold)

# Display the per-fold scores.
result


array([0.78888889, 0.77777778, 0.83333333, 0.77777778, 0.83146067,
       0.86516854, 0.82022472, 0.78651685, 0.87640449, 0.83146067])

In [None]:
# Summarise the fold scores by their mean and standard deviation.
mean_accuracy = result.mean() * 100.0
std_accuracy = result.std() * 100.0
print("Mean Accuracy: %.2f%%" % mean_accuracy)
print("Standard Deviation: %.2f%%" % std_accuracy)

Mean Accuracy: 81.89%
Standard Deviation: 3.36%


# **CATBOOST**

https://catboost.ai/en/docs/

In [None]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Hold out 30% of the standardized data for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_stand, target, test_size=0.3, random_state=0
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base CatBoost estimator whose hyperparameters are tuned below.
# verbose=0 silences the per-iteration training log: the search fits one model
# per parameter combination and fold, and with logging enabled the output is so
# long that the notebook truncates it.
model = CatBoostClassifier(task_type='CPU', random_state = 5, verbose=0)

# Candidate values for each tuned hyperparameter.
param_grid = dict(
    iterations=[100, 150, 200],
    learning_rate=[0.1, 0.2, 0.5],
    depth=[4,5,6,7],
    )

# Exhaustive search over the grid, ranking candidates by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')

# Run the search on the training split.
grid_search.fit(x_train, y_train)

# Report the best combination found.
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 2.0706412	total: 174ms	remaining: 12.8s
2:	learn: 1.9369232	total: 259ms	remaining: 12.7s
3:	learn: 1.8076430	total: 283ms	remaining: 10.3s
4:	learn: 1.7011177	total: 382ms	remaining: 11.1s
5:	learn: 1.6190203	total: 468ms	remaining: 11.2s
6:	learn: 1.5397580	total: 559ms	remaining: 11.4s
7:	learn: 1.4609331	total: 670ms	remaining: 11.9s
8:	learn: 1.4018353	total: 761ms	remaining: 11.9s
9:	learn: 1.3549692	total: 845ms	remaining: 11.8s
10:	learn: 1.3162477	total: 937ms	remaining: 11.8s
11:	learn: 1.2773732	total: 1.02s	remaining: 11.8s
12:	learn: 1.2316494	total: 1.11s	remaining: 11.7s
13:	learn: 1.1914325	total: 1.2s	remaining: 11.7s
14:	learn: 1.1521953	total: 1.29s	remaining: 11.6s
15:	learn: 1.1212605	total: 1.38s	remaining: 11.6s
16:	learn: 1.0857291	total: 1.48s	remaining: 11.6s
17:	learn: 1.0533581	total: 1.56s	remaining: 11.5s
18:	learn: 1.0208652	total: 1.67s	remaining: 11.5s
19:	learn: 0.9960447	total:

In [None]:
from catboost import CatBoostClassifier
catboost = CatBoostClassifier(task_type='CPU', depth = 6, iterations=200, learning_rate=0.2, random_state = 5, eval_metric="Accuracy")

# The test split is passed as eval_set only so that its accuracy curve is
# plotted. use_best_model=False keeps all trained iterations; when an eval_set
# is supplied CatBoost otherwise keeps the iteration that scores best on that
# set, i.e. the model would be selected on the very data used to report the
# accuracy below.
catboost.fit( x_train, y_train, plot=True, eval_set=(x_test, y_test), use_best_model=False)

# Accuracy on the held-out 30% split.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_cat) * 100.0))

# zero_division=0 reports 0.00 for classes without predicted or true samples
# instead of emitting an undefined-metric warning for each of them.
print(classification_report(y_test, predictions_cat, zero_division=0))

# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_test, predictions_cat)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6880000	test: 0.6654275	best: 0.6654275 (0)	total: 103ms	remaining: 20.5s
1:	learn: 0.6656000	test: 0.6579926	best: 0.6654275 (0)	total: 202ms	remaining: 20s
2:	learn: 0.6720000	test: 0.6765799	best: 0.6765799 (2)	total: 310ms	remaining: 20.3s
3:	learn: 0.6864000	test: 0.6802974	best: 0.6802974 (3)	total: 404ms	remaining: 19.8s
4:	learn: 0.6928000	test: 0.6914498	best: 0.6914498 (4)	total: 500ms	remaining: 19.5s
5:	learn: 0.7312000	test: 0.7323420	best: 0.7323420 (5)	total: 597ms	remaining: 19.3s
6:	learn: 0.7424000	test: 0.7397770	best: 0.7397770 (6)	total: 688ms	remaining: 19s
7:	learn: 0.7504000	test: 0.7286245	best: 0.7397770 (6)	total: 780ms	remaining: 18.7s
8:	learn: 0.7616000	test: 0.7434944	best: 0.7434944 (8)	total: 856ms	remaining: 18.2s
9:	learn: 0.7680000	test: 0.7472119	best: 0.7472119 (9)	total: 955ms	remaining: 18.1s
10:	learn: 0.7824000	test: 0.7695167	best: 0.7695167 (10)	total: 1.06s	remaining: 18.3s
11:	learn: 0.7920000	test: 0.7732342	best: 0.7732342 (11

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[117,   0,   2,   0,   0,   3,   1,   1,   1,   1,   1,   2],
       [  1,   1,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   7,   0,   0,   3,   2,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0],
       [  2,   0,   3,   0,   0,  24,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,   6,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   3,   3,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0,   1,  21,   2,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   3,   9,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  25]])

### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Split the data into 10 shuffled folds (fixed seed for repeatability).
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

# CatBoost model scored on every fold of the standardized data.
# verbose=0 silences the per-iteration training log of the ten fits.
model = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.2, depth = 6, random_state = 5, eval_metric="Accuracy", verbose=0)
result = cross_val_score(model, predictors_stand, target, cv = kfold)

# Summarise the fold scores by their mean and standard deviation
# (label fixed: it previously printed "Standard Deviation::" with a doubled colon).
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

In [None]:
# Split the data into 10 shuffled folds (fixed seed for repeatability).
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=5,
)

In [None]:
# CatBoost model scored on every fold of the standardized data.
# NOTE(review): this cell repeats the cross-validation already performed in the
# first cell of this section with identical settings; one of the two can go.
# verbose=0 silences the per-iteration training log of the ten fits.
model = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.2, depth = 6, random_state = 5, eval_metric="Accuracy", verbose=0)
result = cross_val_score(model, predictors_stand, target, cv = kfold)

# Summarise the fold scores by their mean and standard deviation
# (label fixed: it previously printed "Standard Deviation::" with a doubled colon).
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.6206468	total: 260ms	remaining: 51.7s
1:	learn: 0.6641791	total: 470ms	remaining: 46.5s
2:	learn: 0.6977612	total: 695ms	remaining: 45.7s
3:	learn: 0.7139303	total: 875ms	remaining: 42.9s
4:	learn: 0.7201493	total: 1.06s	remaining: 41.5s
5:	learn: 0.7313433	total: 1.29s	remaining: 41.9s
6:	learn: 0.7524876	total: 1.53s	remaining: 42.2s
7:	learn: 0.7524876	total: 1.69s	remaining: 40.5s
8:	learn: 0.7562189	total: 1.88s	remaining: 39.8s
9:	learn: 0.7611940	total: 2.09s	remaining: 39.8s
10:	learn: 0.7649254	total: 2.35s	remaining: 40.4s
11:	learn: 0.7674129	total: 2.59s	remaining: 40.5s
12:	learn: 0.7748756	total: 2.85s	remaining: 41.1s
13:	learn: 0.7798507	total: 3.06s	remaining: 40.7s
14:	learn: 0.7885572	total: 3.28s	remaining: 40.5s
15:	learn: 0.7947761	total: 3.5s	remaining: 40.3s
16:	learn: 0.7972637	total: 3.64s	remaining: 39.2s
17:	learn: 0.8097015	total: 3.86s	remaining: 39s
18:	learn: 0.8159204	total: 3.98s	remaining: 38s
19:	learn: 0.8196517	total: 4.14s	remaining: 3