# **COMPARISON OF LDA AND CHI-SQUARE FEATURE SELECTION TECHNIQUES**

In [None]:
# Mount Google Drive inside the Colab VM so files under /content/drive
# (including the dataset CSV read below) become accessible.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

# Read the merged dataset CSV from the mounted Drive folder.
df = pd.read_csv(
    '/content/drive/MyDrive/Code smell severity/merged dataset_FE_LM_GC_DC.csv',
    sep=',',
    encoding='iso-8859-1',
)

# Number of target classes; the LDA step uses it to size its projection.
number_class = 13

# **DATA PREPROCESSING**


## **Transforming nominal categorical variables into ordinal categorical variables**

In [None]:
# Numeric codes for the two nominal columns.
category_codes = {
    'modifier_type': {'abstract': 0.0, 'final': 1.0, 'other': 2.0},
    'visibility_type': {'public': 0.0, 'private': 1.0, 'protected': 2.0, 'package': 3.0},
}

# Assign the result back instead of calling `.replace(..., inplace=True)` on
# `df[column]`: that chained in-place form mutates a temporary Series, which
# raises a FutureWarning in pandas 2.2 and has no effect under Copy-on-Write
# (pandas 3). `infer_objects()` turns a fully replaced object column into a
# numeric dtype; any label not listed above is left unchanged, as before.
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes).infer_objects()

## **Predictor and Target Attributes**

In [None]:
# Full feature matrix: every column at positions 8 through 91.
predictors = df.iloc[:, 8:92].to_numpy()

# Reduced feature matrix: the hand-picked column positions used for the
# chi-square variant of the experiments.
predictors_chi = df.iloc[
    :,
    [10, 11, 13, 14, 15, 17, 19, 21, 22, 23, 25, 30, 31,
     33, 34, 35, 37, 40, 44, 54, 60, 61, 62, 63, 64],
].to_numpy()

In [None]:
# Class labels: the column at position 7.
target = df.iloc[:, 7].to_numpy()


## **Data Scaling**

Standardization (uses the mean and standard deviation as a reference).

Normalization (uses maximum and minimum values as a reference).

In [None]:
from sklearn.preprocessing import StandardScaler

# Each feature matrix gets its own scaler instance.
# NOTE(review): both scalers are fitted on all rows, before any train/test split.
predictors_stand = StandardScaler().fit(predictors).transform(predictors)
predictors_chi_stand = StandardScaler().fit(predictors_chi).transform(predictors_chi)

## **Dimensionality Reduction**

### **Linear Discriminant Analysis (LDA)**

Supervised learning algorithm, as it uses the class as a reference for selection.

Applied in situations with many predictive attributes and also with the target attribute with many classes.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the features onto (number_class - 1) discriminant components.
# NOTE(review): the projection is fitted on the unscaled `predictors` and on
# every row, using `target`, before the train/test splits and CV folds below
# are created, so held-out rows influence the features they are scored on.
lda = LinearDiscriminantAnalysis(n_components=number_class - 1)
predictors_lda = lda.fit(predictors, target).transform(predictors)

# **NAIVE BAYES**

https://scikit-learn.org/stable/modules/naive_bayes.html

## **sklearn.naive_bayes.BernoulliNB**
Naive Bayes classifier for multivariate Bernoulli models.

Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Hold out 30% of the LDA-projected rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda, target, test_size=0.3, random_state=0
)

# Fit Bernoulli Naive Bayes on the training part.
naive = BernoulliNB(force_alpha=True)
naive.fit(x_train, y_train)

# Score the held-out part.
predictors_naive = naive.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_true=y_test, y_pred=predictors_naive)))

Accuracy: 69.52%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation over the LDA-projected data.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
model = BernoulliNB(force_alpha=True)
result = cross_val_score(estimator=model, X=predictors_lda, y=target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (np.mean(result) * 100.0))
print("Standard Deviation: %.2f%%" % (np.std(result) * 100.0))

[0.62222222 0.63333333 0.72222222 0.67777778 0.65168539 0.61797753
 0.66292135 0.66292135 0.64044944 0.69662921]
Mean Accuracy: 65.88%
Standard Deviation: 3.14%


# **SUPPORT VECTOR MACHINES (SVM)**

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Same 70/30 split of the LDA-projected data as the other models.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda, target, test_size=0.3, random_state=0
)

# RBF-kernel support vector classifier.
svm = SVC(kernel='rbf', random_state=1, C=1)
svm.fit(x_train, y_train)

# Score the held-out part.
predictors_svm = svm.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_true=y_test, y_pred=predictors_svm)))

Accuracy: 77.70%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the RBF SVM on the LDA-projected data.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
model = SVC(kernel='rbf', random_state=1, C=1)
result = cross_val_score(estimator=model, X=predictors_lda, y=target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (np.mean(result) * 100.0))
print("Standard Deviation: %.2f%%" % (np.std(result) * 100.0))

Mean Accuracy: 76.52%
Standard Deviation: 4.04%


# **LOGISTIC REGRESSION**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

***MULTINOMIAL LOGISTIC REGRESSION***

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)

The ‘newton-cg’, ‘sag’, and ‘lbfgs’ solvers support only L2 regularization with primal formulation, or no regularization. The ‘liblinear’ solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the ‘saga’ solver.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Same 70/30 split of the LDA-projected data as the other models.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda, target, test_size=0.3, random_state=0
)

# L2-regularised logistic regression fitted with the saga solver.
# `multi_class` is intentionally not passed: with a non-liblinear solver
# scikit-learn already fits a multinomial model for multiclass targets, and
# the explicit argument is deprecated since scikit-learn 1.5.
logistic = LogisticRegression(
    random_state=1,
    max_iter=1500,
    penalty="l2",
    tol=0.0001,
    C=1,
    solver="saga",
)
logistic.fit(x_train, y_train)

# Score the held-out part.
predictors_logistic = logistic.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_logistic) * 100.0))

Accuracy: 76.58%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation over the LDA-projected data.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# `multi_class` is intentionally not passed: with the saga solver
# scikit-learn already fits a multinomial model for multiclass targets, and
# the explicit argument is deprecated since scikit-learn 1.5.
model = LogisticRegression(
    random_state=1,
    max_iter=1500,
    penalty="l2",
    tol=0.0001,
    C=1,
    solver="saga",
)
result = cross_val_score(model, predictors_lda, target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 77.08%
Standard Deviation: 3.66%


# **K-NEAREST NEIGHBORS(KNN)**

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Same 70/30 split of the LDA-projected data as the other models.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda, target, test_size=0.3, random_state=0
)

# 5 nearest neighbours under the Minkowski metric with p=1.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
knn.fit(x_train, y_train)

# Score the held-out part.
predictors_knn = knn.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_true=y_test, y_pred=predictors_knn)))

Accuracy: 73.98%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the KNN model on the LDA-projected data.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
result = cross_val_score(estimator=model, X=predictors_lda, y=target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (np.mean(result) * 100.0))
print("Standard Deviation: %.2f%%" % (np.std(result) * 100.0))

Mean Accuracy: 74.17%
Standard Deviation: 4.53%


# **DECISION TREE**

https://scikit-learn.org/stable/modules/tree.html

**The disadvantages of decision trees include:**

* Decision-tree learners can create over-complex trees that do not generalize the data well. This is called *overfitting*. Mechanisms such as pruning, setting the minimum number of samples required at a leaf node or setting the maximum depth of the tree are necessary to avoid this problem.

* Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. *This problem is mitigated by using decision trees within an ensemble*.

* Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations. Therefore, they are not good at extrapolation.

* The problem of learning an optimal decision tree is known to be NP-complete under several aspects of optimality and even for simple concepts. Consequently, practical decision-tree learning algorithms are based on heuristic algorithms such as the greedy algorithm where locally optimal decisions are made at each node. Such algorithms cannot guarantee to return the globally optimal decision tree. *This can be mitigated by training multiple trees in an ensemble learner, where the features and samples are randomly sampled with replacement*.

* There are concepts that are hard to learn because decision trees do not express them easily, such as XOR, parity or multiplexer problems.

* Decision tree learners *create biased trees if some classes dominate*. It is therefore recommended *to balance the dataset prior to fitting with the decision tree*.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 70/30 split of the LDA-projected data, shared by the tree cells below.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_lda,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are being tuned.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Candidate values explored by the exhaustive search.
param_grid = {
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Pick the combination with the best macro-averaged F1 on the training split.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

# Guard against out-of-order execution: `grid_search` is reused by later
# sections for other estimators.
assert isinstance(grid_search.estimator, DecisionTreeClassifier), (
    "grid_search does not hold the decision-tree search; re-run the grid-search cell above"
)

# Build the tree from the hyperparameters the grid search actually selected.
# The previously hand-typed values (max_depth=8, min_samples_leaf=5) did not
# match the search result printed above.
tree = DecisionTreeClassifier(criterion='entropy', random_state=0, **grid_search.best_params_)
tree.fit(x_train, y_train)

# Score the held-out part.
predictors_tree = tree.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_tree) * 100.0))

Accuracy: 73.61%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Guard against out-of-order execution: `grid_search` is reused by later
# sections for other estimators.
assert isinstance(grid_search.estimator, DecisionTreeClassifier), (
    "grid_search does not hold the decision-tree search; re-run the grid-search cell above"
)

# 10-fold shuffled cross-validation over the LDA-projected data.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# Use the hyperparameters selected by the grid search rather than hand-typed
# values, which had drifted from the printed search result.
model = DecisionTreeClassifier(criterion='entropy', random_state=0, **grid_search.best_params_)
result = cross_val_score(model, predictors_lda, target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 72.27%
Standard Deviation: 4.80%


# **RANDOM FOREST**

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 70/30 split of the standardized chi-square feature subset.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are being tuned.
model = RandomForestClassifier(criterion='entropy', random_state=0)

# Candidate values explored by the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': list(range(1, 11)),
}

# Pick the combination with the best macro-averaged F1 on the training split.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 250}


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Forest configured with the values printed by the grid search above.
# NOTE(review): the variable name `random` would shadow the stdlib module of
# the same name if that module is imported elsewhere in the notebook.
random = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=10,
    min_samples_split=2,
    n_estimators=250,
)
random.fit(x_train, y_train)

# Score the held-out part.
predictors_random = random.predict(x_test)
print("Accuracy: %.2f%%" % (100.0 * accuracy_score(y_true=y_test, y_pred=predictors_random)))

Accuracy: 79.18%


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation over the standardized chi-square subset.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
model = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=10,
    min_samples_split=2,
    n_estimators=250,
)
result = cross_val_score(estimator=model, X=predictors_chi_stand, y=target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (np.mean(result) * 100.0))
print("Standard Deviation: %.2f%%" % (np.std(result) * 100.0))

Mean Accuracy: 80.66%
Standard Deviation: 4.05%


# **XGBOOST**

# **The first approach involving Standardization, Chi-square with XGBoost**

https://xgboost.readthedocs.io/en/stable/

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# 70/30 split of the standardized chi-square feature subset.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_chi_stand,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the class count comes from the notebook-level
# `number_class` constant instead of a second hard-coded 13.
model = XGBClassifier(objective='multi:softprob', num_class=number_class, random_state=3)

# Candidate values explored by the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1, 2, 3, 4, 5],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
)

# Pick the combination with the best macro-averaged F1 on the training split.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# XGBoost configured with the values printed by the grid search above; the
# class count comes from the notebook-level `number_class` constant instead
# of a second hard-coded 13.
xg = XGBClassifier(
    learning_rate=0.2,
    max_depth=3,
    n_estimators=200,
    objective='multi:softprob',
    num_class=number_class,
    random_state=3,
)
xg.fit(x_train, y_train)

# Score the held-out part.
predictors_xg = xg.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_xg) * 100.0))

Accuracy: 79.55%


In [None]:
# Per-class precision/recall/F1 on the held-out part. Some classes have no
# true or no predicted samples there; `zero_division=0` reports 0.00 for the
# undefined metrics (as before) without emitting an UndefinedMetricWarning
# for each of them.
print(classification_report(y_test, predictors_xg, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92       129
         1.0       1.00      0.67      0.80         6
         2.0       0.73      0.61      0.67        18
         3.0       0.75      0.75      0.75         4
         4.0       0.00      0.00      0.00         2
         5.0       0.67      0.70      0.68        23
         6.0       0.36      1.00      0.53         4
         7.0       0.50      0.50      0.50         8
         8.0       0.79      0.76      0.78        25
         9.0       0.69      0.75      0.72        12
        10.0       0.00      0.00      0.00         0
        11.0       0.40      0.20      0.27        10
        12.0       0.76      0.89      0.82        28

    accuracy                           0.80       269
   macro avg       0.58      0.59      0.57       269
weighted avg       0.81      0.80      0.80       269



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation over the standardized chi-square subset.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# The class count comes from the notebook-level `number_class` constant
# instead of a second hard-coded 13.
model = XGBClassifier(
    learning_rate=0.2,
    max_depth=3,
    n_estimators=200,
    objective='multi:softprob',
    num_class=number_class,
    random_state=3,
)
result = cross_val_score(model, predictors_chi_stand, target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

Mean Accuracy: 79.32%
Standard Deviation: 5.15%


# **CATBOOST**

https://catboost.ai/en/docs/

In [None]:
#Instalação
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# 70/30 split of the full, unscaled feature matrix.
x_train, x_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# `verbose=0` silences CatBoost's per-iteration log. The grid search fits the
# model once per parameter combination and CV fold, which otherwise produces
# thousands of log lines and buries the result printed at the end.
model = CatBoostClassifier(task_type='CPU', random_state=5, verbose=0)

# Candidate values explored by the exhaustive search.
param_grid = dict(
    iterations=[50, 100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    depth=[4, 5, 6, 7, 8, 9, 10],
)

# Pick the combination with the best macro-averaged F1 on the training split.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 2.5090890	total: 505ms	remaining: 50s
2:	learn: 2.4904599	total: 764ms	remaining: 50.1s
3:	learn: 2.4623343	total: 1.03s	remaining: 50.3s
4:	learn: 2.4385012	total: 1.28s	remaining: 49.8s
5:	learn: 2.4147362	total: 1.53s	remaining: 49.5s
6:	learn: 2.3887587	total: 1.79s	remaining: 49.3s
7:	learn: 2.3720713	total: 2.05s	remaining: 49.2s
8:	learn: 2.3512635	total: 2.3s	remaining: 48.9s
9:	learn: 2.3273914	total: 2.56s	remaining: 48.5s
10:	learn: 2.3061787	total: 2.81s	remaining: 48.3s
11:	learn: 2.2886599	total: 3.07s	remaining: 48.1s
12:	learn: 2.2652156	total: 3.32s	remaining: 47.8s
13:	learn: 2.2496140	total: 3.57s	remaining: 47.5s
14:	learn: 2.2329897	total: 3.84s	remaining: 47.3s
15:	learn: 2.2155102	total: 4.1s	remaining: 47.2s
16:	learn: 2.2002132	total: 4.36s	remaining: 46.9s
17:	learn: 2.1836712	total: 4.61s	remaining: 46.6s
18:	learn: 2.1646406	total: 4.89s	remaining: 46.6s
19:	learn: 2.1476960	total: 5.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# NOTE(review): the variable name `catboost` would shadow the `catboost`
# package if it is imported as a module elsewhere in the notebook.
catboost = CatBoostClassifier(
    task_type='CPU',
    iterations=200,
    learning_rate=0.2,
    depth=5,
    random_state=5,
    eval_metric="Accuracy",
)

# The test split is passed as `eval_set` only to draw the learning curves.
# CatBoost enables `use_best_model` by default whenever an eval_set is given,
# which would truncate the model at the iteration that scores best on the
# very data evaluated below and inflate the reported accuracy, so it is
# disabled explicitly. `verbose=50` logs every 50th iteration instead of all.
catboost.fit(
    x_train,
    y_train,
    plot=True,
    eval_set=(x_test, y_test),
    use_best_model=False,
    verbose=50,
)

# Score the held-out part.
predictors_cat = catboost.predict(x_test)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictors_cat) * 100.0))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6000000	test: 0.6133829	best: 0.6133829 (0)	total: 54.8ms	remaining: 10.9s
1:	learn: 0.6128000	test: 0.6133829	best: 0.6133829 (0)	total: 88.7ms	remaining: 8.78s
2:	learn: 0.6352000	test: 0.6245353	best: 0.6245353 (2)	total: 123ms	remaining: 8.1s
3:	learn: 0.6480000	test: 0.6245353	best: 0.6245353 (2)	total: 156ms	remaining: 7.65s
4:	learn: 0.6752000	test: 0.6691450	best: 0.6691450 (4)	total: 190ms	remaining: 7.41s
5:	learn: 0.6832000	test: 0.6542751	best: 0.6691450 (4)	total: 226ms	remaining: 7.29s
6:	learn: 0.6960000	test: 0.6617100	best: 0.6691450 (4)	total: 265ms	remaining: 7.3s
7:	learn: 0.7264000	test: 0.6877323	best: 0.6877323 (7)	total: 300ms	remaining: 7.2s
8:	learn: 0.7360000	test: 0.7100372	best: 0.7100372 (8)	total: 334ms	remaining: 7.09s
9:	learn: 0.7392000	test: 0.6988848	best: 0.7100372 (8)	total: 367ms	remaining: 6.98s
10:	learn: 0.7520000	test: 0.7174721	best: 0.7174721 (10)	total: 402ms	remaining: 6.9s
11:	learn: 0.7664000	test: 0.7174721	best: 0.7174721 (

### **Cross-Validation**

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation over the full, unscaled feature matrix.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)

# `verbose=0` silences the per-iteration log of each of the ten fold fits so
# the summary printed below is not buried under training output.
model = CatBoostClassifier(
    task_type='CPU',
    iterations=200,
    learning_rate=0.2,
    depth=5,
    random_state=5,
    eval_metric="Accuracy",
    verbose=0,
)
result = cross_val_score(model, predictors, target, cv=kfold)

# Summarise the per-fold accuracies.
print("Mean Accuracy: %.2f%%" % (result.mean() * 100.0))
print("Standard Deviation: %.2f%%" % (result.std() * 100.0))

0:	learn: 0.6380597	total: 46.6ms	remaining: 9.28s
1:	learn: 0.6492537	total: 79.3ms	remaining: 7.85s
2:	learn: 0.7201493	total: 116ms	remaining: 7.6s
3:	learn: 0.7164179	total: 160ms	remaining: 7.82s
4:	learn: 0.7201493	total: 192ms	remaining: 7.48s
5:	learn: 0.7313433	total: 225ms	remaining: 7.26s
6:	learn: 0.7375622	total: 260ms	remaining: 7.17s
7:	learn: 0.7338308	total: 297ms	remaining: 7.13s
8:	learn: 0.7412935	total: 328ms	remaining: 6.96s
9:	learn: 0.7500000	total: 361ms	remaining: 6.86s
10:	learn: 0.7599502	total: 394ms	remaining: 6.78s
11:	learn: 0.7699005	total: 426ms	remaining: 6.67s
12:	learn: 0.7736318	total: 458ms	remaining: 6.58s
13:	learn: 0.7835821	total: 493ms	remaining: 6.55s
14:	learn: 0.7848259	total: 538ms	remaining: 6.63s
15:	learn: 0.7898010	total: 575ms	remaining: 6.62s
16:	learn: 0.8009950	total: 607ms	remaining: 6.54s
17:	learn: 0.8022388	total: 641ms	remaining: 6.48s
18:	learn: 0.8097015	total: 673ms	remaining: 6.41s
19:	learn: 0.8121891	total: 712ms	remain