# **APRENDIZAGEM SUPERVISIONADA: CLASSIFICAÇÃO**

# **PRÉ-PROCESSAMENTO**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Code smell severity/merged dataset_FE_LM_GC_DC.csv',
                    sep=',', encoding='iso-8859-1')

## **Transforming nominal categorical variables into ordinal categorical variabless**

In [5]:
df['modifier_type'].replace('abstract', 0.0, inplace=True)
df['modifier_type'].replace('final', 1.0, inplace=True)
df['modifier_type'].replace('other', 2.0, inplace=True)
df['visibility_type'].replace('public', 0.0, inplace=True)
df['visibility_type'].replace('private', 1.0, inplace=True)
df['visibility_type'].replace('protected', 2.0, inplace=True)
df['visibility_type'].replace('package', 3.0, inplace=True)

## **Predictor and Target Attributes**

In [6]:
predictors = df.iloc[:, 8:92].values
predictors_chi = df.iloc[:, [10,11,13,14,15,17,19,21,22,23,25,30,31,33,34,35,37,40,44,54,60,61,62,63,64]].values

In [7]:
target = df.iloc[:, 5].values

## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
from sklearn.preprocessing import StandardScaler
predictors_stand = StandardScaler().fit_transform(predictors)
predictors_chi_stand = StandardScaler().fit_transform(predictors_chi)

## **Dimensionality Reduction**

### **Linear Discriminant Analysis (LDA)**

Supervised learning algorithm, as it uses the class as a reference for selection.

Applied in situations with many predictive attributes and also with the target attribute with many classes.

In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = 1)

In [11]:
predictors_lda = lda.fit_transform(predictors, target)

In [12]:
lda.explained_variance_ratio_

array([1.])

In [13]:
# Soma das variáveis explicativas
lda.explained_variance_ratio_.sum()

1.0

# **NAIVE BAYES**

:https://scikit-learn.org/stable/modules/naive_bayes.html

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_lda, target, test_size = 0.3, random_state = 0)

from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train, y_train)
predictors_naive = naive.predict(x_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_naive = naive.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_naive) * 100.0))

Accuracy: 85.50%


### **Cross-Validation**

In [15]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = GaussianNB()
result = cross_val_score(model, predictors_lda, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 85.46%
Standard Deviation: 2.97%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_lda, target, test_size = 0.3, random_state = 0)

from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=1, C=3)
svm.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_svm = svm.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_svm) * 100.0))

Accuracy: 87.73%


### **Cross-Validation**

In [17]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = SVC(kernel='rbf', random_state=1, C = 3)
result = cross_val_score(model, predictors_lda, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 87.47%
Standard Deviation: 3.51%


# **LOGISTIC REGRESSION**


https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***REGRESSÃO LOGÍSTICA MULTINOMIAL***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [24]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_lda, target, test_size = 0.3, random_state = 0)

from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(random_state=1, max_iter=500, penalty="l2", tol=0.0001, C=1,solver="lbfgs")
logistic.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_logistic = logistic.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_logistic) * 100.0))

Accuracy: 87.36%


### **Cross-Validation**

In [26]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = LogisticRegression(random_state=1, max_iter=100, penalty="l2", tol=0.0001, C=1,solver="lbfgs")
result = cross_val_score(model, predictors_lda, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 87.47%
Standard Deviation: 3.21%


# **K-NEAREST NEIGHBORS(KNN)**

---



https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
knn.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_knn = knn.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_knn) * 100.0))

Accuracy: 88.85%


### **Cross-Validation**

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p = 1)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 89.94%
Standard Deviation: 3.41%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

**The disadvantages of decision trees include:**

* Decision-tree learners can create over-complex trees that do not generalize the data well. This is called *overfitting*. Mechanisms such as pruning, setting the minimum number of samples required at a leaf node or setting the maximum depth of the tree are necessary to avoid this problem.

* Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. *This problem is mitigated by using decision trees within an ensemble*.

* Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations as seen in the above figure. Therefore, they are not good at extrapolation.

* The problem of learning an optimal decision tree is known to be NP-complete under several aspects of optimality and even for simple concepts. Consequently, practical decision-tree learning algorithms are based on heuristic algorithms such as the greedy algorithm where locally optimal decisions are made at each node. Such algorithms cannot guarantee to return the globally optimal decision tree. *This can be mitigated by training multiple trees in an ensemble learner, where the features and samples are randomly sampled with replacement*.

* There are concepts that are hard to learn because decision trees do not express them easily, such as XOR, parity or multiplexer problems.

* Decision tree learners *create biased trees if some classes dominate*. It is therefore recommended *to balance the dataset prior to fitting with the decision tree*

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [32]:
from sklearn.model_selection import GridSearchCV
model = DecisionTreeClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    min_samples_split=[2,3,4,5],
    min_samples_leaf=[1,2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2}


In [33]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_leaf= 3, min_samples_split = 2)
tree.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_tree = tree.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_tree) * 100.0))

Accuracy: 88.85%


### **Cross-Validation**

In [34]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_leaf= 3, min_samples_split = 2)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 89.05%
Standard Deviation: 2.54%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    n_estimators=[50,100,150,200,250],
    min_samples_split=[2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 7, 'min_samples_split': 4, 'n_estimators': 50}


In [35]:
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=7, min_samples_split=4, n_estimators=50)
random.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_random) * 100.0))

Accuracy: 91.82%


### **Cross-Validation**

In [36]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=7, min_samples_split=4, n_estimators=50)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 92.85%
Standard Deviation: 1.80%


# **XGBOOST**

In [38]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

https://xgboost.readthedocs.io/en/stable/

In [None]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier(objective='multi:softprob', num_class=13, random_state=3)

param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1,2,3,4,5],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5]
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

KeyboardInterrupt: ignored

In [39]:
xg = XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, objective='binary:logistic', random_state=3)
xg.fit(x_train,y_train)


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_xg) * 100.0))

Accuracy: 93.31%


### **Cross-Validation**

In [40]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, objective='binary:logistic', random_state=3)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 93.51%
Standard Deviation: 2.10%


# **CATBOOST**

https://catboost.ai/en/docs/

In [41]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [43]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = CatBoostClassifier(task_type='CPU', random_state = 5)

param_grid = dict(
    iterations=[50, 100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    depth=[4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
101:	learn: 0.2701705	total: 2.43s	remaining: 2.33s
102:	learn: 0.2685368	total: 2.45s	remaining: 2.31s
103:	learn: 0.2668094	total: 2.47s	remaining: 2.28s
104:	learn: 0.2654392	total: 2.49s	remaining: 2.26s
105:	learn: 0.2637636	total: 2.52s	remaining: 2.23s
106:	learn: 0.2623306	total: 2.55s	remaining: 2.21s
107:	learn: 0.2609441	total: 2.57s	remaining: 2.19s
108:	learn: 0.2592059	total: 2.59s	remaining: 2.16s
109:	learn: 0.2576600	total: 2.61s	remaining: 2.14s
110:	learn: 0.2563295	total: 2.63s	remaining: 2.11s
111:	learn: 0.2550055	total: 2.66s	remaining: 2.09s
112:	learn: 0.2532463	total: 2.68s	remaining: 2.06s
113:	learn: 0.2518106	total: 2.7s	remaining: 2.04s
114:	learn: 0.2503525	total: 2.72s	remaining: 2.01s
115:	learn: 0.2486740	total: 2.74s	remaining: 1.99s
116:	learn: 0.2476020	total: 2.79s	remaining: 1.98s
117:	learn: 0.2467826	total: 2.81s	remaining: 1.96s
118:	learn: 0.2452545	total: 2.84s	remaining: 1.93s


In [44]:
catboost = CatBoostClassifier(task_type='CPU', depth = 8, iterations=150, learning_rate=0.1, random_state = 5, eval_metric="Accuracy")
catboost.fit( x_train, y_train, plot=True, eval_set=(x_test, y_test))

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictors_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_cat) * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9088000	test: 0.8438662	best: 0.8438662 (0)	total: 78.6ms	remaining: 11.7s
1:	learn: 0.9200000	test: 0.8847584	best: 0.8847584 (1)	total: 107ms	remaining: 7.94s
2:	learn: 0.9216000	test: 0.8661710	best: 0.8847584 (1)	total: 140ms	remaining: 6.85s
3:	learn: 0.9440000	test: 0.8959108	best: 0.8959108 (3)	total: 163ms	remaining: 5.93s
4:	learn: 0.9408000	test: 0.8884758	best: 0.8959108 (3)	total: 188ms	remaining: 5.45s
5:	learn: 0.9456000	test: 0.8847584	best: 0.8959108 (3)	total: 224ms	remaining: 5.37s
6:	learn: 0.9472000	test: 0.9070632	best: 0.9070632 (6)	total: 251ms	remaining: 5.12s
7:	learn: 0.9552000	test: 0.9033457	best: 0.9070632 (6)	total: 286ms	remaining: 5.08s
8:	learn: 0.9584000	test: 0.9033457	best: 0.9070632 (6)	total: 344ms	remaining: 5.38s
9:	learn: 0.9584000	test: 0.9033457	best: 0.9070632 (6)	total: 389ms	remaining: 5.45s
10:	learn: 0.9584000	test: 0.9070632	best: 0.9070632 (6)	total: 424ms	remaining: 5.35s
11:	learn: 0.9616000	test: 0.9070632	best: 0.9070632

### **Cross-Validation**

In [46]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

model = CatBoostClassifier(task_type='CPU', depth = 8, iterations=150, learning_rate=0.1, random_state = 5, eval_metric="Accuracy")
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.8930348	total: 27.7ms	remaining: 4.13s
1:	learn: 0.9154229	total: 42.6ms	remaining: 3.15s
2:	learn: 0.9166667	total: 58.4ms	remaining: 2.86s
3:	learn: 0.9253731	total: 82.6ms	remaining: 3.02s
4:	learn: 0.9353234	total: 102ms	remaining: 2.97s
5:	learn: 0.9465174	total: 126ms	remaining: 3.02s
6:	learn: 0.9378109	total: 151ms	remaining: 3.09s
7:	learn: 0.9490050	total: 174ms	remaining: 3.08s
8:	learn: 0.9502488	total: 195ms	remaining: 3.05s
9:	learn: 0.9552239	total: 221ms	remaining: 3.09s
10:	learn: 0.9527363	total: 226ms	remaining: 2.85s
11:	learn: 0.9539801	total: 247ms	remaining: 2.84s
12:	learn: 0.9527363	total: 267ms	remaining: 2.81s
13:	learn: 0.9527363	total: 288ms	remaining: 2.8s
14:	learn: 0.9589552	total: 302ms	remaining: 2.72s
15:	learn: 0.9577114	total: 334ms	remaining: 2.8s
16:	learn: 0.9564677	total: 365ms	remaining: 2.85s
17:	learn: 0.9589552	total: 378ms	remaining: 2.77s
18:	learn: 0.9639303	total: 390ms	remaining: 2.69s
19:	learn: 0.9651741	total: 405ms	remai