In [1]:
import pandas as pd
import numpy as np
from scipy.io.arff import loadarff 

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


In [2]:
raw_data = loadarff('data/column_2C_weka.arff')
df = pd.DataFrame(raw_data[0])

In [3]:
df

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,b'Abnormal'
1,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,b'Abnormal'
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,b'Abnormal'
3,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,b'Abnormal'
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,b'Abnormal'
...,...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,b'Normal'
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,b'Normal'
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,b'Normal'
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,b'Normal'


In [4]:
df['class'] = df['class'].astype('string')
df = df.replace("b'Abnormal'", '1')
df = df.replace("b'Normal'", '0')
df['class'] = df['class'].astype('float64')

In [5]:
df

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.254400,1.0
1,39.056951,10.060991,25.015378,28.995960,114.405425,4.564259,1.0
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,1.0
3,69.297008,24.652878,44.311238,44.644130,101.868495,11.211523,1.0
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,1.0
...,...,...,...,...,...,...,...
305,47.903565,13.616688,36.000000,34.286877,117.449062,-4.245395,0.0
306,53.936748,20.721496,29.220534,33.215251,114.365845,-0.421010,0.0
307,61.446597,22.694968,46.170347,38.751628,125.670725,-2.707880,0.0
308,45.252792,8.693157,41.583126,36.559635,118.545842,0.214750,0.0


In [6]:
df.isnull().sum()

pelvic_incidence            0
pelvic_tilt                 0
lumbar_lordosis_angle       0
sacral_slope                0
pelvic_radius               0
degree_spondylolisthesis    0
class                       0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
count,310.0,310.0,310.0,310.0,310.0,310.0,310.0
mean,60.496653,17.542822,51.93093,42.953831,117.920655,26.296694,0.677419
std,17.23652,10.00833,18.554064,13.423102,13.317377,37.559027,0.46822
min,26.147921,-6.554948,14.0,13.366931,70.082575,-11.058179,0.0
25%,46.430294,10.667069,37.0,33.347122,110.709196,1.603727,0.0
50%,58.691038,16.357689,49.562398,42.404912,118.268178,11.767934,1.0
75%,72.877696,22.120395,63.0,52.695888,125.467674,41.287352,1.0
max,129.834041,49.431864,125.742385,121.429566,163.071041,418.543082,1.0


In [8]:
# split into explanatory and response variables 
X = df.drop('class', axis = 1)
Y = df['class']


In [9]:
model = LogisticRegression()
model.fit(X,Y)

print("Coefficients: ",model.coef_)
print("Intercept: ", model.intercept_)

# compute predicted values from training set
Y_pred = model.predict(X)

cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print("Accuracy calculated from the training set = %.3f" % (accuracy))

print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Coefficients:  [[-0.0070301   0.08258075 -0.01870319 -0.08961076 -0.1067678   0.16811009]]
Intercept:  [15.15499493]
Confusion matrix:
 [[ 78  22]
 [ 22 188]]
Accuracy calculated from the training set = 0.858
              precision    recall  f1-score   support

          no       0.78      0.78      0.78       100
         yes       0.90      0.90      0.90       210

    accuracy                           0.86       310
   macro avg       0.84      0.84      0.84       310
weighted avg       0.86      0.86      0.86       310



In [10]:
# cross-validate
# number of folds
k = 10
scores = cross_val_score(estimator=reg,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

NameError: name 'reg' is not defined