In [1]:
from IPython.display import HTML, display

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

from docplex.mp.model import Model


sns.set_style("darkgrid")

import pandas as pd
import numpy as np
import ast
import random
import math 
import time
import sys 

np.random.seed(123)

data = pd.read_csv("Breastdata.csv")

In [2]:
#Removing the Id and the Unnamed columns
data = data.iloc[:,1:-1]

#Next, we encode the Categorical Variable
label_encoder = LabelEncoder()
data.iloc[:,0] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')

#Generating the correlation matrix
corr = data.corr()

print("Shape:",corr.shape[0])

Shape: 31


In [3]:
regressor_OLS = sm.OLS(data.iloc[:,0].values, data.iloc[:,1:].values).fit()
#print(regressor_OLS.summary())
maxVar = max(regressor_OLS.pvalues).astype(float)
#print(regressor_OLS.pvalues)

pValues = regressor_OLS.pvalues
pValues = np.append(0, pValues)
#pValues.append(regressor_OLS.pvalues)
#print(pValues)

In [4]:
def solveSPP(nFactors, Correlation, pValues):
    
    #nFactors = corr.shape[0] 
    #Correlation = corr
    N = [i for i in range(nFactors)] #set of clients N = {1..n}
    # SETS #
    ij_Range = [(i, j) for i in range(nFactors) for j in range(nFactors)]
    # Create A model
    mdl = Model('SPP')

    # Create one index binary variable
    #x = mdl.integer_var_list(nClusters, 0, 1, "x")
    x = mdl.binary_var_list(N, name="x")
    z = mdl.binary_var_matrix(N, N, name=lambda ns: "Z %s_%s" % (ns[0],ns[1]))


    # Create a single linear variable 
    MaxCorr = mdl.continuous_var(0, name="MaxCorr")


    # Add constraints stating that each customer is selected at least once
    mdl.minimize(mdl.sum(pValues[j]*x[j] for j in N))
    #mdl.minimize(mdl.sum(z[i,j]*Correlation.iloc[i,j] for i in N for j in N))
    mdl.add_constraint(x[0]==1)
    mdl.add_constraint(mdl.sum(x[j] for j in N)==22)
    mdl.add_constraint(MaxCorr<=0.9)
    mdl.add_constraints(x[i]+x[j]-z[i,j]<=1 for i in N for j in N)
    mdl.add_constraints(-MaxCorr+z[i,j]*Correlation.iloc[i,j]<=0 for i in N for j in range(i+1,nFactors))
    mdl.parameters.timelimit=300 #time limit
    solution = mdl.solve(log_output= True)

    print(solution.solve_status) #if it says feasible, it is not optimal
    mdl.solve_details
    print(mdl.objective_value)
    #print(solution)
    NotselectedColumns = [j for j in N if x[j].solution_value<0.01]
    return NotselectedColumns

NotselectedColumns = solveSPP(corr.shape[0], corr, pValues)

print(len(NotselectedColumns))
print(NotselectedColumns)

columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(len(NotselectedColumns)):
    columns[NotselectedColumns[i]] = False
selected_columns = data.columns[columns]
data = data[selected_columns]

print(data.shape)



CPXPARAM_TimeLimit                               300
CPXPARAM_Read_DataCheck                          1
CPXPARAM_RandomSeed                              201703173
Tried aggregator 2 times.
MIP Presolve eliminated 1400 rows and 945 columns.
MIP Presolve modified 2 coefficients.
Aggregator did 26 substitutions.
Reduced MIP has 3 rows, 22 columns, and 26 nonzeros.
Reduced MIP has 22 binaries, 0 generals, 0 SOSs, and 0 indicators.
Presolve time = 0.01 sec. (1.29 ticks)
Found incumbent of value 7.888889 after 0.03 sec. (1.41 ticks)
Probing fixed 22 vars, tightened 0 bounds.
Probing time = 0.02 sec. (0.00 ticks)
Tried aggregator 1 time.
MIP Presolve eliminated 3 rows and 22 columns.
All rows and columns eliminated.
Presolve time = 0.00 sec. (0.01 ticks)

Root node processing (before b&c):
  Real time             =    0.05 sec. (1.47 ticks)
Parallel b&c, 12 threads:
  Real time             =    0.00 sec. (0.00 ticks)
  Sync time (average)   =    0.00 sec.
  Wait time (average)   =    0.00 sec

In [5]:
selected_columns = selected_columns[1:].values
def backwardElimination(x, Y, sl, columns):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
        #print(regressor_OLS.summary())
    print(regressor_OLS.summary())
    return x, columns

SL = 0.05
data_modeled, selected_columns = backwardElimination(data.iloc[:,1:].values, data.iloc[:,0].values, SL, selected_columns)




                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.837
Model:                            OLS   Adj. R-squared (uncentered):              0.833
Method:                 Least Squares   F-statistic:                              237.8
Date:                Fri, 11 Oct 2019   Prob (F-statistic):                   3.94e-210
Time:                        12:02:55   Log-Likelihood:                         -10.921
No. Observations:                 569   AIC:                                      45.84
Df Residuals:                     557   BIC:                                      97.97
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [6]:
#Moving the result to a new Dataframe
result = pd.DataFrame()
result['diagnosis'] = data.iloc[:,0]


#Creating a Dataframe with the columns selected using the p-value and correlation
data = pd.DataFrame(data = data_modeled, columns = selected_columns)



In [7]:
x_train, x_test, y_train, y_test = train_test_split(data.values, result.values, test_size = 0.2, random_state = 42)

#Support Vector Classifier
svc=SVC() # The default kernel used by SVC is the gaussian kernel
svc.fit(x_train, y_train)


prediction = svc.predict(x_test)

cm = confusion_matrix(y_test, prediction)
print(accuracy_score(y_test, prediction))
print(cm)
print (roc_auc_score(y_test, prediction))


0.9298245614035088
[[67  4]
 [ 4 39]]
0.9253193580085161


In [8]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
prediction=logreg.predict(x_test)


cm = confusion_matrix(y_test, prediction)
print(accuracy_score(y_test, prediction))
print(cm)
print (roc_auc_score(y_test, prediction))

0.9649122807017544
[[71  0]
 [ 4 39]]
0.9534883720930232
