# **SUPERVISED LEARNING: CLASSIFICATION**

# **DATA PREPROCESSING**

In [None]:
# Mount Google Drive into the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the merged dataset CSV inside the mounted Drive folder.
DATASET_CSV = '/content/drive/MyDrive/Code-smell-severity-classification-main/merged dataset_FE_LM_GC_DC.csv'

# Comma-separated file, decoded as ISO-8859-1.
df = pd.read_csv(DATASET_CSV, sep=',', encoding='iso-8859-1')

## **Transforming nominal categorical variables into ordinal categorical variables**

In [None]:
# Encode the two categorical columns as numeric codes.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: the in-place form operates on an
# intermediate Series, is deprecated in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is enabled.
# Values that do not appear in a mapping are left untouched, as before.
MODIFIER_TYPE_CODES = {'abstract': 0.0, 'final': 1.0, 'other': 2.0}
VISIBILITY_TYPE_CODES = {'public': 0.0, 'private': 1.0, 'protected': 2.0, 'package': 3.0}

df['modifier_type'] = df['modifier_type'].replace(MODIFIER_TYPE_CODES)
df['visibility_type'] = df['visibility_type'].replace(VISIBILITY_TYPE_CODES)

## **Predictor and Target Attributes**

In [None]:
# Positional indices (0-based) of the columns that form the reduced feature subset.
# presumably the result of a chi-squared feature selection, going by the variable name — TODO confirm
chi_column_positions = [
    10, 11, 13, 14, 15, 17, 19, 21, 22, 23, 25, 30, 31,
    33, 34, 35, 37, 40, 44, 54, 60, 61, 62, 63, 64,
]

# Full feature matrix: columns at positions 8 through 91.
predictors = df.iloc[:, 8:92].values
# Reduced feature matrix: only the columns listed above.
predictors_chi = df.iloc[:, chi_column_positions].values

In [None]:
# The class label is read from the column at position 5 (0-based).
target_column_position = 5
target = df.iloc[:, target_column_position].values

## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature matrix with its own, independently fitted scaler.
# NOTE(review): both scalers are fitted on every row, before any train/test
# split or cross-validation; consider fitting them inside a pipeline instead.
scaler_full = StandardScaler()
predictors_stand = scaler_full.fit_transform(predictors)

scaler_chi = StandardScaler()
predictors_chi_stand = scaler_chi.fit_transform(predictors_chi)

## **Dimensionality Reduction**

### **Linear Discriminant Analysis (LDA)**

Supervised learning algorithm, as it uses the class as a reference for selection.

Applied in situations with many predictive attributes and also with the target attribute with many classes.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Keep a single discriminant component.
LDA_COMPONENTS = 1
lda = LinearDiscriminantAnalysis(n_components=LDA_COMPONENTS)

In [None]:
# Fit the LDA projection on the unscaled predictors and their labels, then
# project those same rows onto the retained component.
# NOTE(review): the projection is learned from the labels of every row, so any
# later evaluation on `predictors_lda` has already seen its test labels.
lda.fit(predictors, target)
predictors_lda = lda.transform(predictors)

In [None]:
# Display the fraction of variance explained by each retained LDA component.
lda.explained_variance_ratio_

array([1.])

In [None]:
# Sum of the explained variance ratios over the retained components
lda.explained_variance_ratio_.sum()

1.0

# **NAIVE BAYES**

https://scikit-learn.org/stable/modules/naive_bayes.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

# Fit Gaussian Naive Bayes on the training portion.
naive = GaussianNB()
naive.fit(x_train, y_train)

# Score the held-out portion.
predictions_naive = naive.predict(x_test)
naive_accuracy = accuracy_score(y_test, predictions_naive)
print("Accuracy: %.2f%%" % (naive_accuracy * 100.0))

Accuracy: 72.86%


### **Cross-Validation**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# LDA is a supervised transform. Cross-validating on `predictors_lda`, which
# was fitted with the labels of every row, lets each test fold's labels leak
# into the features. Wrapping LDA and the classifier in one pipeline makes
# cross_val_score refit the projection on the training folds only.
model = make_pipeline(LinearDiscriminantAnalysis(n_components=1), GaussianNB())
result = cross_val_score(model, predictors, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 85.46%
Standard Deviation: 2.97%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

# RBF-kernel support vector classifier with C=3.
svm = SVC(C=3, kernel='rbf', random_state=1)
svm.fit(x_train, y_train)

# Score the held-out portion.
predictions_svm = svm.predict(x_test)
svm_accuracy = accuracy_score(y_test, predictions_svm)
print("Accuracy: %.2f%%" % (svm_accuracy * 100.0))

Accuracy: 86.99%


### **Cross-Validation**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# LDA is a supervised transform. Cross-validating on `predictors_lda`, which
# was fitted with the labels of every row, lets each test fold's labels leak
# into the features. Wrapping LDA and the classifier in one pipeline makes
# cross_val_score refit the projection on the training folds only.
model = make_pipeline(
    LinearDiscriminantAnalysis(n_components=1),
    SVC(kernel='rbf', random_state=1, C=3),
)
result = cross_val_score(model, predictors, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 87.47%
Standard Deviation: 3.51%


# **LOGISTIC REGRESSION**


https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTIC REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

# L2-regularized logistic regression solved with L-BFGS.
logistic = LogisticRegression(
    penalty="l2",
    C=1,
    solver="lbfgs",
    max_iter=100,
    tol=0.0001,
    random_state=1,
)
logistic.fit(x_train, y_train)

# Score the held-out portion.
predictions_logistic = logistic.predict(x_test)
logistic_accuracy = accuracy_score(y_test, predictions_logistic)
print("Accuracy: %.2f%%" % (logistic_accuracy * 100.0))

Accuracy: 81.04%


### **Cross-Validation**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# LDA is a supervised transform. Cross-validating on `predictors_lda`, which
# was fitted with the labels of every row, lets each test fold's labels leak
# into the features. Wrapping LDA and the classifier in one pipeline makes
# cross_val_score refit the projection on the training folds only.
model = make_pipeline(
    LinearDiscriminantAnalysis(n_components=1),
    LogisticRegression(random_state=1, max_iter=100, penalty="l2", tol=0.0001, C=1, solver="lbfgs"),
)
result = cross_val_score(model, predictors, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 87.47%
Standard Deviation: 3.21%


# **K-NEAREST NEIGHBORS (KNN)**

---



https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

# 5 nearest neighbours; Minkowski metric with p=1.
knn = KNeighborsClassifier(n_neighbors=5, p=1, metric='minkowski')
knn.fit(x_train, y_train)

# Score the held-out portion.
predictions_knn = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, predictions_knn)
print("Accuracy: %.2f%%" % (knn_accuracy * 100.0))

Accuracy: 88.85%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Same KNN configuration as the hold-out evaluation.
# NOTE(review): `predictors_chi_stand` was standardized using all rows, so the
# scaler has seen each test fold; a StandardScaler + KNN pipeline would avoid it.
model = KNeighborsClassifier(n_neighbors=5, p=1, metric='minkowski')
result = cross_val_score(estimator=model, X=predictors_chi_stand, y=target, cv=kfold)

mean_accuracy = result.mean()
accuracy_std = result.std()
print("Mean Accuracy: %.2f%%" % (mean_accuracy * 100.0))
print("Standard Deviation: %.2f%%" % (accuracy_std * 100.0))

Mean Accuracy: 89.94%
Standard Deviation: 3.41%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

**The disadvantages of decision trees include:**

* Decision-tree learners can create over-complex trees that do not generalize the data well. This is called *overfitting*. Mechanisms such as pruning, setting the minimum number of samples required at a leaf node or setting the maximum depth of the tree are necessary to avoid this problem.

* Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. *This problem is mitigated by using decision trees within an ensemble*.

* Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations (see the figure in the scikit-learn documentation linked above). Therefore, they are not good at extrapolation.

* The problem of learning an optimal decision tree is known to be NP-complete under several aspects of optimality and even for simple concepts. Consequently, practical decision-tree learning algorithms are based on heuristic algorithms such as the greedy algorithm where locally optimal decisions are made at each node. Such algorithms cannot guarantee to return the globally optimal decision tree. *This can be mitigated by training multiple trees in an ensemble learner, where the features and samples are randomly sampled with replacement*.

* There are concepts that are hard to learn because decision trees do not express them easily, such as XOR, parity or multiplexer problems.

* Decision tree learners *create biased trees if some classes dominate*. It is therefore recommended *to balance the dataset prior to fitting with the decision tree*.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are searched.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Candidate values for each searched hyper-parameter.
param_grid = {
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameters copied from the grid-search output shown above.
tree_params = {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2}
tree = DecisionTreeClassifier(criterion='entropy', random_state=0, **tree_params)
tree.fit(x_train, y_train)

# Score the held-out portion.
predictions_tree = tree.predict(x_test)
tree_accuracy = accuracy_score(y_test, predictions_tree)
print("Accuracy: %.2f%%" % (tree_accuracy * 100.0))

Accuracy: 88.85%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Same decision-tree configuration as the hold-out evaluation.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=3,
    min_samples_split=2,
    random_state=0,
)
result = cross_val_score(estimator=model, X=predictors_chi_stand, y=target, cv=kfold)

mean_accuracy = result.mean()
accuracy_std = result.std()
print("Mean Accuracy: %.2f%%" % (mean_accuracy * 100.0))
print("Standard Deviation: %.2f%%" % (accuracy_std * 100.0))

Mean Accuracy: 89.05%
Standard Deviation: 2.54%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are searched.
model = RandomForestClassifier(criterion='entropy', random_state=0)

# Candidate values for each searched hyper-parameter.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 7, 'min_samples_split': 4, 'n_estimators': 50}


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hyper-parameters copied from the grid-search output shown above.
forest_params = {'max_depth': 7, 'min_samples_split': 4, 'n_estimators': 50}
# NOTE(review): the name `random` would clash with the standard-library module
# of the same name if that is ever imported here; kept because other cells may
# refer to it.
random = RandomForestClassifier(criterion='entropy', random_state=0, **forest_params)
random.fit(x_train, y_train)

# Score the held-out portion.
predictions_random = random.predict(x_test)
forest_accuracy = accuracy_score(y_test, predictions_random)
print("Accuracy: %.2f%%" % (forest_accuracy * 100.0))

Accuracy: 91.82%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Same random-forest configuration as the hold-out evaluation.
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=50,
    max_depth=7,
    min_samples_split=4,
    random_state=0,
)
result = cross_val_score(estimator=model, X=predictors_chi_stand, y=target, cv=kfold)

mean_accuracy = result.mean()
accuracy_std = result.std()
print("Mean Accuracy: %.2f%%" % (mean_accuracy * 100.0))
print("Standard Deviation: %.2f%%" % (accuracy_std * 100.0))

Mean Accuracy: 92.85%
Standard Deviation: 1.80%


# **XGBOOST**

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Hold out 30% of the standardized feature subset for testing (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

https://xgboost.readthedocs.io/en/stable/

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are searched.
model = XGBClassifier(objective='binary:logistic', random_state=3)

# Candidate values for each searched hyper-parameter.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [1, 2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
}

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 100}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hyper-parameters copied from the grid-search output shown above.
xgb_params = {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 100}
xg = XGBClassifier(objective='binary:logistic', random_state=3, **xgb_params)
xg.fit(x_train, y_train)

# Score the held-out portion.
predictions_xg = xg.predict(x_test)
xgb_accuracy = accuracy_score(y_test, predictions_xg)
print("Accuracy: %.2f%%" % (xgb_accuracy * 100.0))

Accuracy: 92.94%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Same XGBoost configuration as the hold-out evaluation.
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.5,
    random_state=3,
)
result = cross_val_score(estimator=model, X=predictors_chi_stand, y=target, cv=kfold)

mean_accuracy = result.mean()
accuracy_std = result.std()
print("Mean Accuracy: %.2f%%" % (mean_accuracy * 100.0))
print("Standard Deviation: %.2f%%" % (accuracy_std * 100.0))

Mean Accuracy: 94.19%
Standard Deviation: 1.39%


# **CATBOOST**

https://catboost.ai/en/docs/

In [None]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# Split the raw predictors first, then learn the LDA projection from the
# training rows only. Splitting `predictors_lda` directly would evaluate on
# features whose projection was fitted with the test rows' labels, which
# leaks those labels into the test set.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

lda_train_only = LinearDiscriminantAnalysis(n_components=1)
x_train = lda_train_only.fit_transform(x_train_raw, y_train)
x_test = lda_train_only.transform(x_test_raw)

In [None]:
from sklearn.model_selection import GridSearchCV

# verbose=0 silences CatBoost's per-iteration log lines. The search refits the
# model for every parameter combination and fold, and the unsilenced output
# overflowed the notebook's 5000-line display limit.
model = CatBoostClassifier(task_type='CPU', random_state=5, verbose=0)

# Candidate values for each searched hyper-parameter.
param_grid = dict(
    iterations=[50, 100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    depth=[4, 5, 6, 7, 8, 9, 10],
    )

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 0.6730042	total: 9.52ms	remaining: 942ms
2:	learn: 0.6631502	total: 14.1ms	remaining: 929ms
3:	learn: 0.6539822	total: 18.9ms	remaining: 927ms
4:	learn: 0.6444783	total: 23.5ms	remaining: 916ms
5:	learn: 0.6351341	total: 28.1ms	remaining: 910ms
6:	learn: 0.6266913	total: 32.8ms	remaining: 904ms
7:	learn: 0.6181202	total: 37.5ms	remaining: 901ms
8:	learn: 0.6103373	total: 46.5ms	remaining: 987ms
9:	learn: 0.6021520	total: 52.8ms	remaining: 1s
10:	learn: 0.5940502	total: 57.5ms	remaining: 988ms
11:	learn: 0.5868518	total: 62.2ms	remaining: 975ms
12:	learn: 0.5790175	total: 66.5ms	remaining: 956ms
13:	learn: 0.5712160	total: 71ms	remaining: 944ms
14:	learn: 0.5640589	total: 75.5ms	remaining: 931ms
15:	learn: 0.5569003	total: 80.1ms	remaining: 921ms
16:	learn: 0.5498281	total: 84.7ms	remaining: 912ms
17:	learn: 0.5438335	total: 86.5ms	remaining: 874ms
18:	learn: 0.5370640	total: 91.5ms	remaining: 872ms
19:	learn: 0.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

catboost = CatBoostClassifier(
    task_type='CPU',
    depth=5,
    iterations=200,
    learning_rate=0.1,
    random_state=5,
    eval_metric="Accuracy",
)

# The test split is passed as eval_set only to monitor and plot the metric.
# use_best_model=False keeps all trained iterations. When an eval_set is given
# CatBoost otherwise defaults to keeping the iteration that scored best on it,
# which would select the model on the same data used for the accuracy below.
catboost.fit(x_train, y_train, plot=True, eval_set=(x_test, y_test), use_best_model=False)

# Score the held-out portion.
predictions_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_cat) * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8752000	test: 0.8773234	best: 0.8773234 (0)	total: 1.27ms	remaining: 253ms
1:	learn: 0.8752000	test: 0.8773234	best: 0.8773234 (0)	total: 3.73ms	remaining: 369ms
2:	learn: 0.8752000	test: 0.8773234	best: 0.8773234 (0)	total: 5.53ms	remaining: 363ms
3:	learn: 0.8752000	test: 0.8773234	best: 0.8773234 (0)	total: 7.34ms	remaining: 360ms
4:	learn: 0.8768000	test: 0.8773234	best: 0.8773234 (0)	total: 8.33ms	remaining: 325ms
5:	learn: 0.8768000	test: 0.8773234	best: 0.8773234 (0)	total: 10.3ms	remaining: 334ms
6:	learn: 0.8864000	test: 0.8736059	best: 0.8773234 (0)	total: 12.2ms	remaining: 337ms
7:	learn: 0.8864000	test: 0.8736059	best: 0.8773234 (0)	total: 13.2ms	remaining: 318ms
8:	learn: 0.8864000	test: 0.8736059	best: 0.8773234 (0)	total: 14.7ms	remaining: 312ms
9:	learn: 0.8864000	test: 0.8736059	best: 0.8773234 (0)	total: 15.8ms	remaining: 301ms
10:	learn: 0.8864000	test: 0.8736059	best: 0.8773234 (0)	total: 16.9ms	remaining: 291ms
11:	learn: 0.8864000	test: 0.8736059	best:

### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation with shuffling and a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# verbose=0 silences CatBoost's per-iteration log lines, which otherwise print
# 150 lines for each of the 10 folds and bury the two summary lines below.
# NOTE(review): depth/iterations here (8/150) and the feature matrix differ
# from the hold-out CatBoost cell (5/200 on LDA features) — confirm which
# configuration is intended.
model = CatBoostClassifier(
    task_type='CPU',
    depth=8,
    iterations=150,
    learning_rate=0.1,
    random_state=5,
    eval_metric="Accuracy",
    verbose=0,
)
result = cross_val_score(model, predictors_chi_stand, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.8930348	total: 11.6ms	remaining: 1.73s
1:	learn: 0.9154229	total: 21.3ms	remaining: 1.58s
2:	learn: 0.9166667	total: 32.3ms	remaining: 1.58s
3:	learn: 0.9253731	total: 41.7ms	remaining: 1.52s
4:	learn: 0.9353234	total: 51.2ms	remaining: 1.49s
5:	learn: 0.9465174	total: 60.6ms	remaining: 1.45s
6:	learn: 0.9378109	total: 69.9ms	remaining: 1.43s
7:	learn: 0.9490050	total: 79.1ms	remaining: 1.4s
8:	learn: 0.9502488	total: 88.4ms	remaining: 1.38s
9:	learn: 0.9552239	total: 97.6ms	remaining: 1.37s
10:	learn: 0.9527363	total: 99.3ms	remaining: 1.25s
11:	learn: 0.9539801	total: 111ms	remaining: 1.27s
12:	learn: 0.9527363	total: 120ms	remaining: 1.27s
13:	learn: 0.9527363	total: 129ms	remaining: 1.25s
14:	learn: 0.9589552	total: 139ms	remaining: 1.25s
15:	learn: 0.9577114	total: 148ms	remaining: 1.24s
16:	learn: 0.9564677	total: 157ms	remaining: 1.23s
17:	learn: 0.9589552	total: 167ms	remaining: 1.22s
18:	learn: 0.9639303	total: 176ms	remaining: 1.21s
19:	learn: 0.9651741	total: 185