# **COMPARISON OF LDA AND CHI-SQUARE FEATURE SELECTION TECHNIQUES**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Code-smell-severity-classification-main/merged dataset_FE_LM_GC_DC.csv',
                    sep=',', encoding='iso-8859-1')

number_class = 13

# **DATA PREPROCESSING**


## **Transforming nominal categorical variables into ordinal categorical variabless**

In [4]:
df['modifier_type'].replace('abstract', 0.0, inplace=True)
df['modifier_type'].replace('final', 1.0, inplace=True)
df['modifier_type'].replace('other', 2.0, inplace=True)
df['visibility_type'].replace('public', 0.0, inplace=True)
df['visibility_type'].replace('private', 1.0, inplace=True)
df['visibility_type'].replace('protected', 2.0, inplace=True)
df['visibility_type'].replace('package', 3.0, inplace=True)

## **Predictor and Target Attributes**

In [5]:
predictors = df.iloc[:, 8:92].values
predictors_chi = df.iloc[:, [10,11,13,14,15,17,19,21,22,23,25,30,31,33,34,35,37,40,44,54,60,61,62,63,64]].values

In [6]:
target = df.iloc[:, 7].values


## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [7]:
from sklearn.preprocessing import StandardScaler
predictors_stand = StandardScaler().fit_transform(predictors)
predictors_chi_stand = StandardScaler().fit_transform(predictors_chi)

## **Dimensionality Reduction**

### **Linear Discriminant Analysis (LDA)**

Supervised learning algorithm, as it uses the class as a reference for selection.

Applied in situations with many predictive attributes and also with the target attribute with many classes.

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = number_class - 1)
predictors_lda = lda.fit_transform(predictors, target)

# **NAIVE BAYES**

:https://scikit-learn.org/stable/modules/naive_bayes.html

## **sklearn.naive_bayes.BernoulliNB**
Naive Bayes classifier for multivariate Bernoulli models.

Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_lda, target, test_size = 0.3, random_state = 0)

from sklearn.naive_bayes import BernoulliNB
naive = BernoulliNB(force_alpha=True)
naive.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_naive = naive.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_naive) * 100.0))

Accuracy: 68.03%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = BernoulliNB(force_alpha=True)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 64.54%
Standard Deviation: 5.43%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=1, C=1)
svm.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_svm = svm.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_svm) * 100.0))

Accuracy: 66.91%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = SVC(kernel='rbf', random_state=1, C = 1)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 67.45%
Standard Deviation: 3.01%


# **LOGISTIC REGRESSION**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTICS REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(random_state=1, max_iter=1300, penalty="l2", tol=0.0001, multi_class="multinomial", C=1,solver="saga")
logistic.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_logistic = logistic.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_logistic) * 100.0))

Accuracy: 66.54%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = LogisticRegression(random_state=1, max_iter=10000, penalty="l2", tol=0.0001, multi_class="multinomial", C=1,solver="saga")
result = cross_val_score(model, predictors_chi, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 57.39%
Standard Deviation: 4.23%


# **K-NEAREST NEIGHBORS(KNN)**

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_lda, target, test_size = 0.3, random_state = 0)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
knn.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_knn = knn.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_knn) * 100.0))

Accuracy: 73.61%


### **Validação Cruzada**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p = 1)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 73.83%
Standard Deviation: 3.41%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

**The disadvantages of decision trees include:**

* Decision-tree learners can create over-complex trees that do not generalize the data well. This is called *overfitting*. Mechanisms such as pruning, setting the minimum number of samples required at a leaf node or setting the maximum depth of the tree are necessary to avoid this problem.

* Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. *This problem is mitigated by using decision trees within an ensemble*.

* Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations as seen in the above figure. Therefore, they are not good at extrapolation.

* The problem of learning an optimal decision tree is known to be NP-complete under several aspects of optimality and even for simple concepts. Consequently, practical decision-tree learning algorithms are based on heuristic algorithms such as the greedy algorithm where locally optimal decisions are made at each node. Such algorithms cannot guarantee to return the globally optimal decision tree. *This can be mitigated by training multiple trees in an ensemble learner, where the features and samples are randomly sampled with replacement*.

* There are concepts that are hard to learn because decision trees do not express them easily, such as XOR, parity or multiplexer problems.

* Decision tree learners *create biased trees if some classes dominate*. It is therefore recommended *to balance the dataset prior to fitting with the decision tree*

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = DecisionTreeClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    min_samples_split=[2,3,4,5],
    min_samples_leaf=[1,2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=8, min_samples_leaf= 2, min_samples_split = 2)
tree.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_tree = tree.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_tree) * 100.0))

Accuracy: 72.12%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=8, min_samples_leaf= 2, min_samples_split = 2)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 73.94%
Standard Deviation: 2.76%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(criterion='entropy', random_state = 0)

param_grid = dict(
    n_estimators=[50,100,150,200,250],
    min_samples_split=[2,3,4,5],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro', cv=5)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 150}


In [None]:
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_split=4, n_estimators=150)
random.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_random) * 100.0))

Accuracy: 78.07%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = RandomForestClassifier(criterion='entropy', random_state = 0, max_depth=10, min_samples_split=4, n_estimators=150)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 81.21%
Standard Deviation: 1.50%


# **XGBOOST**

# **The first approach involving Standardization, Chi-square with XGBoost**

https://xgboost.readthedocs.io/en/stable/

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [None]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier(objective='multi:softprob', num_class=13, random_state=3)

param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1,2,3,4,5],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5]
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 100}


In [None]:
xg = XGBClassifier(learning_rate=0.5, max_depth=5, n_estimators=100, objective='multi:softprob', num_class=13, random_state=3)
xg.fit(x_train,y_train)


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_xg) * 100.0))

Accuracy: 81.04%


In [None]:
print(classification_report(y_test, predictions_xg))

              precision    recall  f1-score   support

         0.0       0.94      0.90      0.92       129
         1.0       0.75      0.50      0.60         6
         2.0       0.58      0.58      0.58        12
         3.0       0.50      1.00      0.67         1
         4.0       0.00      0.00      0.00         2
         5.0       0.81      0.76      0.79        29
         6.0       0.50      1.00      0.67         7
         7.0       0.50      0.50      0.50         8
         8.0       0.80      0.80      0.80        25
         9.0       0.64      0.75      0.69        12
        10.0       0.00      0.00      0.00         0
        11.0       0.60      0.30      0.40        10
        12.0       0.79      0.93      0.85        28

    accuracy                           0.81       269
   macro avg       0.57      0.62      0.57       269
weighted avg       0.82      0.81      0.81       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = XGBClassifier(learning_rate=0.5, max_depth=5, n_estimators=100, objective='multi:softprob', num_class=13, random_state=3)
result = cross_val_score(model, predictors_chi_stand, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 80.43%
Standard Deviation: 4.06%


# **CATBOOST**

https://catboost.ai/en/docs/

In [9]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [20]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors_chi_stand, target, test_size = 0.3, random_state = 0)

In [21]:
from sklearn.model_selection import GridSearchCV
model = CatBoostClassifier(task_type='CPU', random_state = 5)

param_grid = dict(
    iterations=[50, 100, 150, 200],
    learning_rate=[0.1, 0.2, 0.5],
    depth=[4,5,6,7,8,9,10],
    )

grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 2.1101403	total: 753ms	remaining: 55.7s
2:	learn: 1.9986685	total: 1.1s	remaining: 53.7s
3:	learn: 1.8588183	total: 1.44s	remaining: 52.4s
4:	learn: 1.7701479	total: 1.79s	remaining: 52s
5:	learn: 1.6753302	total: 2.29s	remaining: 54.9s
6:	learn: 1.5854642	total: 2.98s	remaining: 1m
7:	learn: 1.5233451	total: 3.6s	remaining: 1m 3s
8:	learn: 1.4665265	total: 4.18s	remaining: 1m 5s
9:	learn: 1.4010562	total: 4.8s	remaining: 1m 7s
10:	learn: 1.3523034	total: 5.38s	remaining: 1m 8s
11:	learn: 1.3098048	total: 5.99s	remaining: 1m 8s
12:	learn: 1.2679585	total: 6.31s	remaining: 1m 6s
13:	learn: 1.2265257	total: 6.73s	remaining: 1m 5s
14:	learn: 1.1902841	total: 7.18s	remaining: 1m 4s
15:	learn: 1.1513143	total: 7.62s	remaining: 1m 3s
16:	learn: 1.1226351	total: 8.07s	remaining: 1m 3s
17:	learn: 1.0895948	total: 8.49s	remaining: 1m 2s
18:	learn: 1.0558875	total: 8.91s	remaining: 1m 1s
19:	learn: 1.0264620	total: 9.35s	

In [22]:
catboost = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.5, depth = 4, random_state = 5, eval_metric="Accuracy")
catboost.fit( x_train, y_train, plot=True, eval_set=(x_test, y_test))

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_cat) * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6560000	test: 0.6542751	best: 0.6542751 (0)	total: 22.8ms	remaining: 4.55s
1:	learn: 0.6880000	test: 0.7063197	best: 0.7063197 (1)	total: 49.3ms	remaining: 4.88s
2:	learn: 0.6864000	test: 0.7063197	best: 0.7063197 (1)	total: 88.9ms	remaining: 5.84s
3:	learn: 0.7104000	test: 0.7286245	best: 0.7286245 (3)	total: 112ms	remaining: 5.5s
4:	learn: 0.7120000	test: 0.7323420	best: 0.7323420 (4)	total: 132ms	remaining: 5.15s
5:	learn: 0.7152000	test: 0.7286245	best: 0.7323420 (4)	total: 146ms	remaining: 4.7s
6:	learn: 0.7264000	test: 0.7472119	best: 0.7472119 (6)	total: 176ms	remaining: 4.84s
7:	learn: 0.7472000	test: 0.7472119	best: 0.7472119 (6)	total: 208ms	remaining: 4.99s
8:	learn: 0.7776000	test: 0.7434944	best: 0.7472119 (6)	total: 231ms	remaining: 4.89s
9:	learn: 0.7824000	test: 0.7472119	best: 0.7472119 (6)	total: 244ms	remaining: 4.64s
10:	learn: 0.7744000	test: 0.7546468	best: 0.7546468 (10)	total: 258ms	remaining: 4.44s
11:	learn: 0.7728000	test: 0.7546468	best: 0.754646

In [None]:
print(classification_report(y_test, predictions_cat))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92       129
         1.0       1.00      0.17      0.29         6
         2.0       0.55      0.50      0.52        12
         3.0       0.50      1.00      0.67         1
         4.0       0.00      0.00      0.00         2
         5.0       0.79      0.90      0.84        29
         6.0       0.64      1.00      0.78         7
         7.0       1.00      0.50      0.67         8
         8.0       0.77      0.80      0.78        25
         9.0       0.82      0.75      0.78        12
        10.0       0.00      0.00      0.00         0
        11.0       0.25      0.20      0.22        10
        12.0       0.76      0.89      0.82        28

    accuracy                           0.81       269
   macro avg       0.61      0.59      0.56       269
weighted avg       0.82      0.81      0.80       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Cross-Validation**

In [23]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 5, shuffle=True, random_state = 5)

model = CatBoostClassifier(task_type='CPU', iterations=200, learning_rate=0.5, depth = 4, random_state = 5, eval_metric="Accuracy")
result = cross_val_score(model, predictors_chi, target, cv = kfold)

print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.6531469	total: 23.8ms	remaining: 4.74s
1:	learn: 0.6993007	total: 46.5ms	remaining: 4.6s
2:	learn: 0.7006993	total: 65.9ms	remaining: 4.33s
3:	learn: 0.7062937	total: 85.2ms	remaining: 4.17s
4:	learn: 0.7258741	total: 111ms	remaining: 4.34s
5:	learn: 0.7272727	total: 129ms	remaining: 4.17s
6:	learn: 0.7370629	total: 144ms	remaining: 3.98s
7:	learn: 0.7468531	total: 159ms	remaining: 3.81s
8:	learn: 0.7622378	total: 174ms	remaining: 3.69s
9:	learn: 0.7608392	total: 192ms	remaining: 3.64s
10:	learn: 0.7650350	total: 209ms	remaining: 3.58s
11:	learn: 0.7832168	total: 221ms	remaining: 3.46s
12:	learn: 0.7860140	total: 230ms	remaining: 3.31s
13:	learn: 0.7986014	total: 246ms	remaining: 3.26s
14:	learn: 0.8055944	total: 260ms	remaining: 3.2s
15:	learn: 0.8125874	total: 279ms	remaining: 3.21s
16:	learn: 0.8223776	total: 290ms	remaining: 3.12s
17:	learn: 0.8279720	total: 298ms	remaining: 3.01s
18:	learn: 0.8363636	total: 307ms	remaining: 2.93s
19:	learn: 0.8531469	total: 316ms	remai