In [137]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from chart_studio import plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import os

In [138]:
# print(os.listdir("../Software_Defect"))
data = pd.read_csv('firefox.csv')
data.drop('file', inplace=True, axis=1)

In [139]:
data['crashes']=data['crashes'].apply(lambda x:  bool(x))
defect_true_false = data.groupby('crashes').size()
print('False: ',defect_true_false[0])
print('True: ',defect_true_false[1])

False:  24482
True:  4268


In [140]:
trace = go.Histogram(
    x = data.crashes,
    opacity = 0.75,
    name = "crashes",
    marker = dict(color = 'green'))

hist_data = [trace]
hist_layout = go.Layout(barmode='overlay',
                   title = 'crashes',
                   xaxis = dict(title = 'True - False'),
                   yaxis = dict(title = 'Frequency'),
)
fig = go.Figure(data = hist_data, layout = hist_layout)
iplot(fig)

In [141]:
data.isnull().sum()

CountClassBase                      0
CountClassCoupled                   0
CountClassDerived                   0
CountDeclInstanceVariablePrivate    0
CountDeclMethod                     0
CountInput                          0
CountLine                           0
CountOutput                         0
Cyclomatic                          0
MaxInheritanceTree                  0
MaxNesting                          0
severity                            0
crashes                             0
dtype: int64

In [142]:
def evaluation_control(data):    
   # evaluation = (data.CountLine < 94) & (data.CountOutput < 396 ) & (data.CountClassCoupled < 29) & (data.CountDeclMethod < 325)
    evaluation=data.crashes==False& (data.CountLine < 94) & (data.CountOutput < 396 ) & (data.CountClassCoupled < 29) & (data.CountDeclMethod < 325)

    data['complexityEvaluation'] = pd.DataFrame(evaluation)
    data['complexityEvaluation'] = ['Succesful' if evaluation == True else 'Redesign' for evaluation in data.complexityEvaluation]

In [143]:
evaluation_control(data)
data

Unnamed: 0,CountClassBase,CountClassCoupled,CountClassDerived,CountDeclInstanceVariablePrivate,CountDeclMethod,CountInput,CountLine,CountOutput,Cyclomatic,MaxInheritanceTree,MaxNesting,severity,crashes,complexityEvaluation
0,16,0,324,0,42,0,0,86,33,0,0,0,True,Redesign
1,3,4,255,0,6,0,27,10,9,35,3,0,False,Succesful
2,71,0,3258,0,498,0,1,421,618,0,0,0,False,Succesful
3,0,2,142,0,1,0,17,3,9,16,1,0,False,Succesful
4,9,0,304,0,18,0,1,51,15,0,0,0,False,Succesful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28745,0,0,44,0,0,0,0,0,0,0,0,0,False,Succesful
28746,67,0,2615,0,246,0,0,320,147,0,0,0,False,Succesful
28747,2,5,372,0,6,13,105,8,5,30,8,0,False,Succesful
28748,44,0,924,0,124,0,0,138,76,0,0,0,True,Redesign


In [144]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28750 entries, 0 to 28749
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   CountClassBase                    28750 non-null  int64 
 1   CountClassCoupled                 28750 non-null  int64 
 2   CountClassDerived                 28750 non-null  int64 
 3   CountDeclInstanceVariablePrivate  28750 non-null  int64 
 4   CountDeclMethod                   28750 non-null  int64 
 5   CountInput                        28750 non-null  int64 
 6   CountLine                         28750 non-null  int64 
 7   CountOutput                       28750 non-null  int64 
 8   Cyclomatic                        28750 non-null  int64 
 9   MaxInheritanceTree                28750 non-null  int64 
 10  MaxNesting                        28750 non-null  int64 
 11  severity                          28750 non-null  int64 
 12  crashes           

In [145]:
data.groupby("complexityEvaluation").size()

complexityEvaluation
Redesign      4268
Succesful    24482
dtype: int64

In [146]:
# Histogram
trace = go.Histogram(
    x = data.complexityEvaluation,
    opacity = 0.75,
    name = 'Complexity Evaluation',
    marker = dict(color = 'darkorange')
)
hist_data = [trace]
hist_layout = go.Layout(barmode='overlay',
                   title = 'Complexity Evaluation',
                   xaxis = dict(title = 'Succesful - Redesign'),
                   yaxis = dict(title = 'Frequency')
)
fig = go.Figure(data = hist_data, layout = hist_layout)
iplot(fig)

In [147]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28750 entries, 0 to 28749
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   CountClassBase                    28750 non-null  int64 
 1   CountClassCoupled                 28750 non-null  int64 
 2   CountClassDerived                 28750 non-null  int64 
 3   CountDeclInstanceVariablePrivate  28750 non-null  int64 
 4   CountDeclMethod                   28750 non-null  int64 
 5   CountInput                        28750 non-null  int64 
 6   CountLine                         28750 non-null  int64 
 7   CountOutput                       28750 non-null  int64 
 8   Cyclomatic                        28750 non-null  int64 
 9   MaxInheritanceTree                28750 non-null  int64 
 10  MaxNesting                        28750 non-null  int64 
 11  severity                          28750 non-null  int64 
 12  crashes           

In [148]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import model_selection

X = data.iloc[:, 0:11].values  #Select related attribute values for selection leave one out to see which metrics are important here
Y = data.complexityEvaluation.values   #Select classification attribute values
X

array([[  16,    0,  324, ...,   33,    0,    0],
       [   3,    4,  255, ...,    9,   35,    3],
       [  71,    0, 3258, ...,  618,    0,    0],
       ...,
       [   2,    5,  372, ...,    5,   30,    8],
       [  44,    0,  924, ...,   76,    0,    0],
       [   0,    4,  104, ...,    5,   15,    5]])

In [149]:
Y

array(['Redesign', 'Succesful', 'Succesful', ..., 'Succesful', 'Redesign',
       'Succesful'], dtype=object)

In [150]:
#Parsing selection and verification datasets
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size = 0.1, random_state = seed)

In [151]:
from sklearn.linear_model import LogisticRegression

In [152]:
model = LogisticRegression()

In [153]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#Summary of the predictions made by the classifier
print("Logistic Regression")
#Accuracy score
from sklearn.metrics import accuracy_score
print("ACC: ",accuracy_score(y_pred,y_test))

Logistic Regression
ACC:  0.859304347826087



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

