In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

np.set_printoptions(threshold=np.inf)

In [18]:
# Importing the dataset
dataset = pd.read_csv('input/bugzilla.csv')
X = dataset.iloc[:, 2 :-1].values # independent variables
y = dataset.iloc[:, -1].values # dependent variable

In [13]:
# Selecting features #1
# Removing features with low variance
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.85 * (1 - .85)))
X = sel.fit_transform(X)

In [59]:
# Selecting features #2
# Removing high-correlated columns

# Convert X into DataFrame
df = pd.DataFrame(X)

# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.8
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
print(to_drop)

# Drop features 
df.drop(to_drop, axis=1, inplace=True)

# Convert dataframe back to X
X = df.to_numpy()

[1, 12, 13]


In [204]:
# Selecting features #3
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
print(X.shape)
X_new = SelectKBest(chi2, k=10).fit_transform(X, y)
print(X_new.shape)

(4620, 11)
(4620, 10)


In [19]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [20]:
# Feature Scaling
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
X_train = pt.fit_transform(X_train)
X_test = pt.transform(X_test)

In [21]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, max_iter=10000)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=0)

In [22]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [23]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy: {:2f} %".format(accuracy_score(y_test, y_pred)*100))
print("Precision: {:2f} %".format(precision_score(y_test, y_pred)*100))
print("Recall: {:2f} %".format(recall_score(y_test, y_pred)*100))
print("F1 score: {:2f} %".format(f1_score(y_test, y_pred)*100))

# shows how many correct and incorrect predictions we made (0/1)

[[503  89]
 [161 171]]
Accuracy: 72.943723 %
Precision: 65.769231 %
Recall: 51.506024 %
F1 score: 57.770270 %


In [25]:
# Computing the accuracy with cross-validate
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn import svm

kf = KFold(n_splits=10, random_state=0, shuffle=True)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc']
accuracies = cross_validate(estimator = classifier, X = X, y = y, cv = 10, scoring=scoring, return_train_score=True)
#print(sorted(accuracies.keys()))
print("Train_accuracy: {:2f} %".format(accuracies['train_accuracy'].mean()*100))
print("Train_recall_macro: {:2f} %".format(accuracies['train_recall_macro'].mean()*100))
print("Train_precision_macro: {:2f} %".format(accuracies['train_precision_macro'].mean()*100))
print("Train_f1_macro: {:2f} %".format(accuracies['train_f1_macro'].mean()*100))
print("Train_roc_auc: {:2f} %".format(accuracies['train_roc_auc'].mean()*100))


Train_accuracy: 71.635402 %
Train_recall_macro: 64.655812 %
Train_precision_macro: 71.412749 %
Train_f1_macro: 65.032426 %
Train_roc_auc: 75.393416 %


In [59]:
# Computing the accuracy with KFold
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
kf = KFold(n_splits=10, random_state=0, shuffle=True)

classifier = LogisticRegression(random_state = 0, max_iter=10000)

acc_score = []
rec_score = []
prec_score = []
f1 = []
roc_auc = []

for train_index, test_index in kf.split(X):
    X_train , X_test = X[train_index], X[test_index]
    y_train , y_test = y[train_index] , y[test_index]
    
    classifier.fit(X_train, y_train)
    
    #y_pred = classifier.predict(X_test)
    
    THRESHOLD = 0.5
    
    y_pred_new = np.where(classifier.predict_proba(X_test)[:,1] >= THRESHOLD, 1, 0)
    
    # Calculate metrics
    acc_score.append(accuracy_score(y_test, y_pred_new)*100)
    prec_score.append(precision_score(y_test, y_pred_new)*100)
    rec_score.append(recall_score(y_test, y_pred_new)*100)
    f1.append(f1_score(y_test, y_pred_new)*100)
    roc_auc.append(roc_auc_score(y_test, y_pred_new)*100)


avg_accuracy = (sum(acc_score)/10)
print('Accuracy: {:2f} %'.format(avg_accuracy))

avg_recall = (sum(rec_score)/10)
print('Recall: {:2f} %'.format(avg_recall))

avg_precision = (sum(prec_score)/10)
print('Precision: {:2f} %'.format(avg_precision))

avg_f1 = (sum(f1)/10)
print('F1: {:2f} %'.format(avg_f1))

avg_roc_auc = (sum(roc_auc)/10)
print('Roc_auc: {:2f} %'.format(avg_roc_auc))

Accuracy: 71.580087 %
Recall: 38.133386 %
Precision: 71.144124 %
F1: 49.562018 %
Roc_auc: 64.565574 %


In [15]:
# Computing the accuracy with cross_val_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

kf = KFold(n_splits=10, random_state=0, shuffle=True)

accuracy = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='accuracy')
print('Accuracy: {:2f} %'.format(accuracy.mean()*100))

recall = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='recall')
print("Recall: {:2f} %".format(recall.mean()*100))

precision = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='precision')
print("Precision: {:2f} %".format(precision.mean()*100))

f1 = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='f1')
print("F1: {:2f} %".format(f1.mean()*100))

roc_auc = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='roc_auc')
print("Roc_auc: {:2f} %".format(roc_auc.mean()*100))

Accuracy: 70.949490 %
Recall: 38.906498 %
Precision: 70.739195 %
F1: 49.437622 %
Roc_auc: 75.005684 %
