In [None]:
import pandas as pd  
import numpy as np  
from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import classification_report, confusion_matrix  
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the Lyme dataset into `df`.
# The file location can be overridden with the LYME_DATA_PATH environment variable so the
# notebook is runnable on machines other than the original author's; when the variable is
# not set, the original hard-coded location is used unchanged.
# (An earlier version of this notebook read a tab-separated export instead:
#  pd.read_csv('...\\machineLearning\\lyme_data_gcfp.txt', sep='\t'))
import os
from pathlib import Path

DATA_PATH = Path(os.environ.get('LYME_DATA_PATH', 'C:\\Users\\fd299212\\lyme_data_20220520.csv'))
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Build the binary target column 'bin_diag' in a single pass:
# rows whose 'Diag' value is exactly "Neg" are labelled "Neg"; every other 'Diag' value
# (the various positive states) is labelled "Pos".
is_not_negative = df['Diag'] != "Neg"
df['bin_diag'] = np.where(is_not_negative, "Pos", "Neg")



In [None]:
# Preview the first rows again; the previous cell added the 'bin_diag' column to df
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info()
# (alternative check kept for reference: df['Diag'].value_counts())
df.info()

In [None]:
# Show the column labels of df so the feature names used below can be checked against them
# (alternative kept for reference: df.dtypes)
df.columns

In [None]:
from sklearn.model_selection import train_test_split

# Feature columns fed to the model.
# (earlier variants used df.drop(['Diag','ID'], axis=1), or 'DbpB' in place of 'P66')
feature_cols = ['VlsE', 'DbpA', 'P58', 'OspC', 'ErpL', 'P66']
X_raw = df.filter(feature_cols, axis=1).astype(float)
y = df['bin_diag']

# Split the original dataset into training and testing subsets BEFORE any scaling.
# stratify=y ensures that the sampled sets attempt to represent each class's proportions as
# they were in the full set (the 'y' does not mean 'yes'; it is the vector of class labels).
# random_state provides a specific seed for the pseudorandom generator to allow reproducible
# analysis of the model; remove this parameter to allow random selection each run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.25, stratify=y, random_state=42)

# Data standardization gives the data zero mean and unit variance. It is considered good
# practice, especially for distance-based algorithms such as KNN; there is some disagreement
# about it for logistic regression, so it may require testing for specific dataset results.
# The scaler is fitted on the TRAINING rows only: fitting it on the full dataset (as was done
# previously) lets the test rows influence the mean/variance and leaks test information into
# training, which makes the held-out accuracy optimistic.
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# X is kept for backward compatibility: the full feature matrix as a scaled array,
# now expressed on the training-set scale.
X = scaler.transform(X_raw)

# Preview the first scaled training rows (last expression, so the notebook displays it)
X_train[0:5]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# The grid is split into several dictionaries so that only solver/penalty combinations that
# are compatible with each other are fitted. Incompatible combinations do not stop the search:
# the fit fails, is scored as NaN, and only a warning is raised (which this notebook
# suppresses), so they silently waste time and can never be selected.
#
# max_iter is set very high due to non-convergence errors that had been occurring. This can be
# revisited as the data set continues to grow.
C_VALUES = [.1, 1, 2.5, 5]
param_grid = [
    # Unpenalized fits: C has no effect without a penalty, so these are searched once
    # instead of once per C value.
    # NOTE(review): newer scikit-learn releases expect penalty=None instead of the string
    # 'none' — confirm against the installed version.
    {'penalty': ['none'], 'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'], 'max_iter': [8000]},
    {'C': C_VALUES, 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', 'sag'], 'max_iter': [8000]},
    {'C': C_VALUES, 'penalty': ['l1', 'l2'], 'solver': ['saga'], 'max_iter': [8000]},
    # elasticnet requires l1_ratio; without it every elasticnet fit raises and is discarded.
    # 0 and 1 are omitted because they are equivalent to the plain l2 / l1 rows above.
    {'C': C_VALUES, 'penalty': ['elasticnet'], 'solver': ['saga'],
     'l1_ratio': [0.25, 0.5, 0.75], 'max_iter': [8000]},
    {'C': C_VALUES, 'penalty': ['l1', 'l2'], 'solver': ['liblinear'], 'max_iter': [2000]},
]

# random_state makes the stochastic solvers (sag, saga, liblinear) reproducible, matching the
# fixed seed already used for the train/test split.
logreg = LogisticRegression(random_state=42)
# this code implements the grid search; refit=True retrains the best configuration on all of
# the data passed to fit (the whole training set in this case)
grid = GridSearchCV(logreg, param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

In [None]:
# Show the penalty setting of the estimator the grid search selected
print(grid.best_estimator_.penalty)

In [None]:
#from sklearn.model_selection import cross_val_score
# Print every parameter (searched and default) of the estimator the grid search selected
print(grid.best_estimator_.get_params())



In [None]:
# The grid search was created with refit=True, so its best estimator has already been
# retrained on everything passed to grid.fit (the whole training set in this case).
# No manual model.set_params(max_iter=5000) / model.fit(X_train, y_train) is needed here.
model = grid.best_estimator_

# Class predictions for the held-out samples, shown next to the true labels
yhat = model.predict(X_test)
print('Predicted Classes:', yhat, sep='\n')
print('Actual Classes:', y_test, sep='\n')

# report the model performance as mean accuracy on the held-out samples
score = model.score(X_test, y_test)
print(f'Accuracy: {score:.3f} ')

In [None]:
# predict class probabilities with the fitted logistic regression model
# (the target 'bin_diag' has two classes, "Neg" and "Pos", so this is a binary problem)
# NOTE(review): make_classification is not used in this cell — confirm whether any other cell needs it
from sklearn.datasets import make_classification
# one row per test sample; presumably one column per class in model.classes_ order — TODO confirm
yprobs = model.predict_proba(X_test)
# summarize the predicted probabilities
print('Predicted Probabilities:')
print(yprobs)

In [None]:
#following is test code to output the probabilities in an easy to 
#read format using the dataframe display and format options
import pandas as pd
def plot_probabilities(prob_array, col_labels, sample_indices):
    """Display predicted class probabilities as a formatted table.

    Parameters
    ----------
    prob_array : 2-D array-like
        Probabilities, one row per sample and one column per class.
    col_labels : sequence of str
        Column headings; must contain exactly one label per column of ``prob_array``.
    sample_indices : sequence
        Identifier of each row, stored in the 'original sample index' column.

    Notes
    -----
    Prints 'Incorrect label list length' and displays nothing when the number of labels does
    not match the number of probability columns.
    Sets the pandas display options 'display.float_format' and 'display.precision' (3 decimal
    places); these remain in effect for later cells.
    """
    prob_array = np.asarray(prob_array)
    # Validate the arguments that were actually passed in. The check previously read the
    # notebook-level variables `yprobs` and `classes`, so it ignored the caller's inputs and
    # failed with NameError whenever those globals did not exist.
    if prob_array.shape[1] == len(col_labels):
        prob_df = pd.DataFrame(prob_array, columns=col_labels)
        prob_df['original sample index'] = sample_indices
        pd.set_option('display.float_format', lambda x: '%.3f' % x)
        pd.set_option('display.precision', 3)
        display(prob_df)
    else:
        print('Incorrect label list length')
        
# Column headings for the probability table; one label is needed per column of yprobs.
# NOTE(review): assumes the first column of yprobs is the negative class — confirm against model.classes_
classes = ['Negative','Positive']   
# Row labels of the test samples in df, so each probability row can be traced back to its source row
rows = y_test.index
print("Probabilities:")
plot_probabilities(yprobs , classes, rows)

In [None]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix
import itertools

#code from https://scikit-learn.org/0.18/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are indexed as true labels and columns as predicted labels.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True, each row is divided by its row total so cells show the fraction of that
        true class. A row whose total is zero (a class with no true samples) is shown as all
        zeros rather than NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cell shading.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard the division: an empty row would otherwise yield NaN cells and a NaN colour
        # threshold, which makes every text/colour comparison below meaningless.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions are shown with two decimals, raw counts as integers
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Compute confusion matrix
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix

# Pin the row/column order of the matrix explicitly. Without `labels`, the order is derived
# from whichever label values happen to appear in y_test / yhat, while the tick labels passed
# to the plot below are hand-typed; pinning the order keeps the two aligned and keeps the
# matrix 2x2 even if one class is missing from this particular split.
# "Neg" / "Pos" are the two values written to df['bin_diag'].
label_order = ['Neg', 'Pos']
cnf_matrix = confusion_matrix(y_test, yhat, labels=label_order)
np.set_printoptions(precision=2)

print (classification_report(y_test, yhat))

# Plot non-normalized confusion matrix (tick labels follow label_order: negative first)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Negative','Pos'],normalize= False,  title='Confusion matrix')