# **COMPARISON OF LDA AND CHI-SQUARE FEATURE SELECTION TECHNIQUES**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Load the merged dataset from the mounted Drive folder. The file is
# comma-separated and decoded as ISO-8859-1.
df = pd.read_csv(
    '/content/drive/MyDrive/Code-smell-severity-classification-main/merged dataset_FE_LM_GC_DC.csv',
    encoding='iso-8859-1',
    sep=',',
)

# Class count used to size the LDA projection (number_class - 1 components).
# NOTE(review): presumably the number of distinct labels in the target
# column — confirm against the data.
number_class = 13

# **DATA PREPROCESSING**


## **Transforming nominal categorical variables into ordinal categorical variables**

In [None]:
# Encode the two nominal columns as numeric codes.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df[...]: that chained in-place form
# raises a FutureWarning in pandas 2.x and no longer modifies df once
# Copy-on-Write is active (pandas 3.0), which would silently leave the
# strings in place. Values that are not listed in a mapping are kept as they
# are, and re-running the cell is a no-op.
modifier_codes = {'abstract': 0.0, 'final': 1.0, 'other': 2.0}
visibility_codes = {'public': 0.0, 'private': 1.0, 'protected': 2.0, 'package': 3.0}

# infer_objects() turns an all-numeric object column into a float column
# without raising if an unmapped string is still present.
df['modifier_type'] = df['modifier_type'].replace(modifier_codes).infer_objects()
df['visibility_type'] = df['visibility_type'].replace(visibility_codes).infer_objects()

## **Predictor and Target Attributes**

In [None]:
# Full predictor matrix: columns at positions 8..91 of df.
predictors = df.iloc[:, 8:92].to_numpy()

# Reduced predictor matrix built from a fixed list of column positions.
# NOTE(review): presumably the columns retained by a chi-square selection
# step performed outside this notebook — confirm where the list comes from.
predictors_chi = df.iloc[
    :,
    [
        10, 11, 13, 14, 15, 17, 19, 21, 22, 23, 25, 30, 31,
        33, 34, 35, 37, 40, 44, 54, 60, 61, 62, 63, 64,
    ],
].to_numpy()

In [None]:
# Target labels: the column at position 7 of df, extracted as an array.
target = df.iloc[:, 7].values


## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature matrix with its own scaler (fit, then transform
# the same rows).
# NOTE(review): both scalers are fitted on every row of the matrix, i.e.
# before any train/test split, so rows that are later held out contribute to
# the scaling statistics — confirm this is acceptable for the reported scores.
predictors_stand = StandardScaler().fit(predictors).transform(predictors)
predictors_chi_stand = StandardScaler().fit(predictors_chi).transform(predictors_chi)

## **Dimensionality Reduction**

### **Linear Discriminant Analysis (LDA)**

LDA is a supervised technique: it uses the class labels as the reference for choosing the projection.

Applied in situations with many predictive attributes and also with the target attribute with many classes.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the unscaled predictor matrix onto number_class - 1 discriminant
# components.
# NOTE(review): the projection is fitted with the labels of every row, so any
# later hold-out or cross-validation run on predictors_lda evaluates on rows
# whose labels already shaped the features — confirm this is intended.
lda = LinearDiscriminantAnalysis(n_components=number_class - 1)
predictors_lda = lda.fit(predictors, target).transform(predictors)

# **NAIVE BAYES**

https://scikit-learn.org/stable/modules/naive_bayes.html

## **sklearn.naive_bayes.BernoulliNB**
Naive Bayes classifier for multivariate Bernoulli models.

Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# 70/30 hold-out split of the reduced (unscaled) predictor matrix, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi, target, random_state=0, test_size=0.3
)

# fit() returns the estimator, so construction and training are chained.
naive = BernoulliNB(force_alpha=True).fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_naive = naive.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_test, predictions_naive)))

Accuracy: 52.04%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the Bernoulli Naive Bayes model.
# NOTE(review): this cell scores predictors_lda, while the hold-out cell of
# this section trains on predictors_chi, so the two accuracies are not for
# the same feature set — confirm which matrix is intended.
model = BernoulliNB(force_alpha=True)
kfold = KFold(n_splits=10, random_state=5, shuffle=True)

result = cross_val_score(model, predictors_lda, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (100.0 * result.mean()))
print("Standard Deviation: %.2f%%" % (100.0 * result.std()))

Mean Accuracy: 66.68%
Standard Deviation: 3.87%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# 70/30 hold-out split of the LDA-projected predictors, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda, target, random_state=0, test_size=0.3
)

# RBF-kernel SVM with C=1; fit() returns the estimator itself.
svm = SVC(C=1, kernel='rbf', random_state=1).fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_svm = svm.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_test, predictions_svm)))

Accuracy: 73.61%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the same SVM configuration on the
# LDA-projected predictors.
model = SVC(C=1, kernel='rbf', random_state=1)
kfold = KFold(n_splits=10, random_state=5, shuffle=True)

result = cross_val_score(model, predictors_lda, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (100.0 * result.mean()))
print("Standard Deviation: %.2f%%" % (100.0 * result.std()))

Mean Accuracy: 77.07%
Standard Deviation: 2.04%


# **LOGISTIC REGRESSION**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTIC REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the standardized reduced predictors, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

# L2-regularized logistic regression trained with the 'saga' solver.
# The explicit multi_class="multinomial" argument is dropped: the parameter is
# deprecated since scikit-learn 1.5, and with a non-liblinear solver on a
# multiclass target the multinomial (cross-entropy) formulation is already
# what the estimator uses, so the fitted model is unchanged.
logistic = LogisticRegression(random_state=1, max_iter=1300, penalty="l2", tol=0.0001, C=1, solver="saga")
logistic.fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_logistic = logistic.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_logistic) * 100.0))

Accuracy: 66.54%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

# The explicit multi_class="multinomial" argument is dropped: the parameter is
# deprecated since scikit-learn 1.5, and with the 'saga' solver on a
# multiclass target the multinomial formulation is already what is used.
# NOTE(review): this cell scores predictors_lda, while the hold-out cell of
# this section trains on predictors_chi_stand, so the two accuracies are not
# for the same feature set — confirm which matrix is intended.
model = LogisticRegression(random_state=1, max_iter=1300, penalty="l2", tol=0.0001, C=1, solver="saga")
result = cross_val_score(model, predictors_lda, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 75.40%
Standard Deviation: 3.11%


# **K-NEAREST NEIGHBORS (KNN)**

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 70/30 hold-out split of the standardized reduced predictors, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand, target, random_state=0, test_size=0.3
)

# 5-nearest-neighbours classifier using the Minkowski metric with p=1;
# fit() returns the estimator itself.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5, p=1).fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_knn = knn.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_test, predictions_knn)))

Accuracy: 73.98%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the same KNN configuration on the
# standardized reduced predictors.
model = KNeighborsClassifier(metric='minkowski', n_neighbors=5, p=1)
kfold = KFold(n_splits=10, random_state=5, shuffle=True)

result = cross_val_score(model, predictors_chi_stand, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (100.0 * result.mean()))
print("Standard Deviation: %.2f%%" % (100.0 * result.std()))

Mean Accuracy: 74.62%
Standard Deviation: 3.16%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

**The disadvantages of decision trees include:**

* Decision-tree learners can create over-complex trees that do not generalize the data well. This is called *overfitting*. Mechanisms such as pruning, setting the minimum number of samples required at a leaf node or setting the maximum depth of the tree are necessary to avoid this problem.

* Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. *This problem is mitigated by using decision trees within an ensemble*.

* Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations. Therefore, they are not good at extrapolation.

* The problem of learning an optimal decision tree is known to be NP-complete under several aspects of optimality and even for simple concepts. Consequently, practical decision-tree learning algorithms are based on heuristic algorithms such as the greedy algorithm where locally optimal decisions are made at each node. Such algorithms cannot guarantee to return the globally optimal decision tree. *This can be mitigated by training multiple trees in an ensemble learner, where the features and samples are randomly sampled with replacement*.

* There are concepts that are hard to learn because decision trees do not express them easily, such as XOR, parity or multiplexer problems.

* Decision tree learners *create biased trees if some classes dominate*. It is therefore recommended *to balance the dataset prior to fitting with the decision tree*

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 70/30 hold-out split of the full, unscaled predictor matrix, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, random_state=0, test_size=0.3
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over tree-size hyperparameters on the training split,
# ranked by macro-averaged F1 with GridSearchCV's default cross-validation.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)

param_grid = {
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree with hard-coded size limits (the values match
# the best_params_ printed by the grid-search cell); fit() returns the
# estimator itself.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=0,
).fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_tree = tree.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_test, predictions_tree)))

Accuracy: 72.86%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the same decision-tree configuration
# on the full, unscaled predictor matrix.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=0,
)
kfold = KFold(n_splits=10, random_state=5, shuffle=True)

result = cross_val_score(model, predictors, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (100.0 * result.mean()))
print("Standard Deviation: %.2f%%" % (100.0 * result.std()))

Mean Accuracy: 76.86%
Standard Deviation: 4.60%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the full, unscaled predictor matrix, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, random_state=0, test_size=0.3
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and tree-size limits on the training
# split, ranked by macro-averaged F1 with GridSearchCV's default
# cross-validation.
model = RandomForestClassifier(criterion='entropy', random_state=0)

param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Entropy-based random forest with hard-coded hyperparameters (the values
# match the best_params_ printed by the grid-search cell); fit() returns the
# estimator itself.
# NOTE(review): the variable name `random` would shadow the standard-library
# module of the same name if that module were imported in this notebook.
random = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    n_estimators=100,
    random_state=0,
).fit(x_train, y_train)

# Score the fitted model on the held-out rows.
predictions_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_test, predictions_random)))

Accuracy: 79.55%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the same random-forest configuration
# on the full, unscaled predictor matrix.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    n_estimators=100,
    random_state=0,
)
kfold = KFold(n_splits=10, random_state=5, shuffle=True)

result = cross_val_score(model, predictors, target, cv=kfold)

print("Mean Accuracy: %.2f%%" % (100.0 * result.mean()))
print("Standard Deviation: %.2f%%" % (100.0 * result.std()))

Mean Accuracy: 79.88%
Standard Deviation: 2.83%


# **XGBOOST**

# **The first approach involving Standardization, Chi-square with XGBoost**

https://xgboost.readthedocs.io/en/stable/

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# 70/30 hold-out split of the standardized reduced predictors, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand, target, random_state=0, test_size=0.3
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base XGBoost classifier for the search. The class count comes from the
# notebook-level number_class constant instead of a second hard-coded 13, so
# the LDA cell and the XGBoost cells cannot drift apart.
model = XGBClassifier(objective='multi:softprob', num_class=number_class, random_state=3)

# Candidate values for ensemble size, tree depth and learning rate.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1,2,3,4,5],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5]
    )

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 100}


In [None]:
# XGBoost model with hard-coded hyperparameters (the values match the
# best_params_ printed by the grid-search cell). The class count comes from
# the notebook-level number_class constant instead of a second hard-coded 13.
xg = XGBClassifier(learning_rate=0.5, max_depth=5, n_estimators=100, objective='multi:softprob', num_class=number_class, random_state=3)
xg.fit(x_train,y_train)


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Score the fitted model on the held-out rows.
predictions_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_xg) * 100.0))

Accuracy: 81.04%


In [None]:
# Per-class precision/recall/F1 on the hold-out split. zero_division=0 reports
# 0.0 for classes with no predicted or no true samples — the same numbers the
# default produces — without emitting UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions_xg, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.94      0.90      0.92       129
         1.0       0.75      0.50      0.60         6
         2.0       0.58      0.58      0.58        12
         3.0       0.50      1.00      0.67         1
         4.0       0.00      0.00      0.00         2
         5.0       0.81      0.76      0.79        29
         6.0       0.50      1.00      0.67         7
         7.0       0.50      0.50      0.50         8
         8.0       0.80      0.80      0.80        25
         9.0       0.64      0.75      0.69        12
        10.0       0.00      0.00      0.00         0
        11.0       0.60      0.30      0.40        10
        12.0       0.79      0.93      0.85        28

    accuracy                           0.81       269
   macro avg       0.57      0.62      0.57       269
weighted avg       0.82      0.81      0.81       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

# Same XGBoost configuration as the hold-out model. The class count comes
# from the notebook-level number_class constant instead of a second
# hard-coded 13.
model = XGBClassifier(learning_rate=0.5, max_depth=5, n_estimators=100, objective='multi:softprob', num_class=number_class, random_state=3)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 80.88%
Standard Deviation: 2.76%


# **CATBOOST**

https://catboost.ai/en/docs/

In [None]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the standardized reduced predictors, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand, target, random_state=0, test_size=0.3
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base CatBoost classifier for the search. verbose=0 turns off CatBoost's
# per-iteration training log: the search refits the model for every parameter
# combination and fold, which otherwise floods the cell with thousands of
# lines and buries the best_params_ printed at the end.
model = CatBoostClassifier(task_type='CPU', random_state = 5, verbose=0)

# Candidate values for number of boosting iterations, learning rate and depth.
param_grid = dict(
    iterations=[50, 100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    depth=[4,5,6,7,8,9,10],
    )

# Exhaustive search on the training split, ranked by macro-averaged F1.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 2.5170414	total: 681ms	remaining: 1m 7s
2:	learn: 2.4942607	total: 1.03s	remaining: 1m 7s
3:	learn: 2.4679457	total: 1.36s	remaining: 1m 6s
4:	learn: 2.4451975	total: 1.69s	remaining: 1m 5s
5:	learn: 2.4215510	total: 2.03s	remaining: 1m 5s
6:	learn: 2.3960125	total: 2.36s	remaining: 1m 5s
7:	learn: 2.3789982	total: 2.69s	remaining: 1m 4s
8:	learn: 2.3552784	total: 3.07s	remaining: 1m 5s
9:	learn: 2.3328448	total: 3.4s	remaining: 1m 4s
10:	learn: 2.3130030	total: 3.73s	remaining: 1m 4s
11:	learn: 2.2959246	total: 4.15s	remaining: 1m 5s
12:	learn: 2.2754782	total: 4.75s	remaining: 1m 8s
13:	learn: 2.2538969	total: 5.31s	remaining: 1m 10s
14:	learn: 2.2346195	total: 5.84s	remaining: 1m 12s
15:	learn: 2.2167304	total: 6.37s	remaining: 1m 13s
16:	learn: 2.1973631	total: 6.93s	remaining: 1m 14s
17:	learn: 2.1820085	total: 7.48s	remaining: 1m 15s
18:	learn: 2.1664145	total: 7.82s	remaining: 1m 14s
19:	learn: 2.1510455	

In [None]:
# CatBoost model with hard-coded hyperparameters, tracking accuracy per
# iteration.
catboost = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.5, depth = 4, random_state = 5, eval_metric="Accuracy")
# The hold-out split is passed as eval_set only to draw the learning curves.
# use_best_model=False keeps all trained trees: when an eval_set is given,
# CatBoost otherwise truncates the model at the iteration that scores best on
# that set, which would select the model size using the very rows the accuracy
# below is reported on and make that number optimistic.
catboost.fit( x_train, y_train, plot=True, eval_set=(x_test, y_test), use_best_model=False)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Score the fitted model on the held-out rows.
predictions_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_cat) * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6560000	test: 0.6542751	best: 0.6542751 (0)	total: 7.67ms	remaining: 1.52s
1:	learn: 0.6880000	test: 0.7063197	best: 0.7063197 (1)	total: 15.5ms	remaining: 1.54s
2:	learn: 0.6864000	test: 0.7063197	best: 0.7063197 (1)	total: 22.4ms	remaining: 1.47s
3:	learn: 0.7104000	test: 0.7286245	best: 0.7286245 (3)	total: 29.6ms	remaining: 1.45s
4:	learn: 0.7120000	test: 0.7323420	best: 0.7323420 (4)	total: 36.4ms	remaining: 1.42s
5:	learn: 0.7152000	test: 0.7286245	best: 0.7323420 (4)	total: 43.7ms	remaining: 1.41s
6:	learn: 0.7264000	test: 0.7472119	best: 0.7472119 (6)	total: 51.6ms	remaining: 1.42s
7:	learn: 0.7472000	test: 0.7472119	best: 0.7472119 (6)	total: 58.7ms	remaining: 1.41s
8:	learn: 0.7776000	test: 0.7434944	best: 0.7472119 (6)	total: 65.7ms	remaining: 1.39s
9:	learn: 0.7824000	test: 0.7472119	best: 0.7472119 (6)	total: 73.1ms	remaining: 1.39s
10:	learn: 0.7744000	test: 0.7546468	best: 0.7546468 (10)	total: 79.6ms	remaining: 1.37s
11:	learn: 0.7728000	test: 0.7546468	best

In [None]:
# Per-class precision/recall/F1 on the hold-out split. zero_division=0 reports
# 0.0 for classes with no predicted or no true samples — the same numbers the
# default produces — without emitting UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions_cat, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92       129
         1.0       1.00      0.17      0.29         6
         2.0       0.55      0.50      0.52        12
         3.0       0.50      1.00      0.67         1
         4.0       0.00      0.00      0.00         2
         5.0       0.79      0.90      0.84        29
         6.0       0.64      1.00      0.78         7
         7.0       1.00      0.50      0.67         8
         8.0       0.77      0.80      0.78        25
         9.0       0.82      0.75      0.78        12
        10.0       0.00      0.00      0.00         0
        11.0       0.25      0.20      0.22        10
        12.0       0.76      0.89      0.82        28

    accuracy                           0.81       269
   macro avg       0.61      0.59      0.56       269
weighted avg       0.82      0.81      0.80       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits = 10, shuffle=True, random_state = 5)

# verbose=0 turns off CatBoost's per-iteration training log, which would
# otherwise print 200 lines for each of the 10 folds ahead of the two summary
# lines below.
# NOTE(review): learning_rate is 0.2 here but 0.5 in the hold-out model
# trained earlier in this section, so the two accuracies are not for the same
# configuration — confirm which value the grid search selected.
model = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.2, depth = 4, random_state = 5, eval_metric="Accuracy", verbose=0)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.6430348	total: 15.7ms	remaining: 3.13s
1:	learn: 0.6393035	total: 34.1ms	remaining: 3.38s
2:	learn: 0.6405473	total: 56.1ms	remaining: 3.68s
3:	learn: 0.6442786	total: 78ms	remaining: 3.82s
4:	learn: 0.6417910	total: 98.6ms	remaining: 3.85s
5:	learn: 0.6455224	total: 116ms	remaining: 3.76s
6:	learn: 0.6654229	total: 130ms	remaining: 3.6s
7:	learn: 0.6778607	total: 147ms	remaining: 3.53s
8:	learn: 0.6803483	total: 161ms	remaining: 3.42s
9:	learn: 0.7425373	total: 178ms	remaining: 3.38s
10:	learn: 0.7487562	total: 193ms	remaining: 3.32s
11:	learn: 0.7524876	total: 214ms	remaining: 3.35s
12:	learn: 0.7549751	total: 229ms	remaining: 3.29s
13:	learn: 0.7562189	total: 247ms	remaining: 3.28s
14:	learn: 0.7574627	total: 269ms	remaining: 3.32s
15:	learn: 0.7636816	total: 288ms	remaining: 3.31s
16:	learn: 0.7649254	total: 301ms	remaining: 3.24s
17:	learn: 0.7736318	total: 330ms	remaining: 3.33s
18:	learn: 0.7761194	total: 351ms	remaining: 3.35s
19:	learn: 0.7823383	total: 380ms	remai