In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sb
import pickle
import matplotlib.pyplot as plt
from pylab import rcParams
# Default size (width, height in inches) for every figure drawn in this notebook.
rcParams['figure.figsize'] = (10, 10)

In [None]:
import statsmodels.api as sm

In [None]:
# Read the raw longitudinal table; the path is relative to the working directory.
long_df = pd.read_csv(filepath_or_buffer="oasis_longitudinal.csv")

In [None]:
# Drop the identifier columns and the Hand column; none of them is used as a feature.
long_df = long_df.drop(columns=["Subject ID", "MRI ID", "Hand"])

# Fill missing SES / MMSE values with the rounded mean of the respective column.
long_df["SES"] = long_df["SES"].fillna(round(long_df["SES"].mean()))
long_df["MMSE"] = long_df["MMSE"].fillna(round(long_df["MMSE"].mean()))

# Fold 'Converted' into 'Demented' so the target has exactly two classes.
long_df['Group'] = long_df['Group'].replace('Converted', 'Demented')

# Create dummy variables.
# `columns` must be a list: a set has no defined order and recent pandas
# versions refuse a set as a column indexer. `dtype=int` keeps the indicators
# as 0/1 integers regardless of the pandas default dummy dtype.
long_df = pd.get_dummies(data=long_df, columns=['Group', 'M/F'], dtype=int)

# Keep one indicator per pair and give it the final column name:
#   Gender: Male = 1, Female = 0
#   Group:  Demented = 1, Non-demented = 0
long_df = (
    long_df
    .drop(columns=['M/F_F', 'Group_Nondemented'])
    .rename(columns={'M/F_M': 'Gender', 'Group_Demented': 'Group'})
)

In [None]:
# Split the rows by the Gender indicator (1 first, then 0) and drop the
# indicator from each subset, where it is constant.
male_df, female_df = (
    long_df.loc[long_df['Gender'] == gender_flag].drop(columns="Gender")
    for gender_flag in (1, 0)
)

In [None]:
# Names of the model input features, in the order used for the feature matrices.
columns = [
    'Age',
    'EDUC',
    'SES',
    'MMSE',
    'eTIV',
    'nWBV',
    'ASF',
]

In [None]:
# Feature matrices (the seven columns listed in `columns`) and 0/1 targets
# for the male subset, the female subset and the full dataset.
X_train_male = male_df[columns].to_numpy()
Y_train_male = male_df['Group'].to_numpy()

X_train_female = female_df[columns].to_numpy()
Y_train_female = female_df['Group'].to_numpy()

X_train = long_df[columns].to_numpy()
Y_train = long_df['Group'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
def scale(X):
    """Min-max scale the columns of ``X``.

    A fresh ``MinMaxScaler`` is fitted on ``X`` itself on every call, so the
    per-column minimum and maximum come from the data being transformed.

    Parameters
    ----------
    X : array-like
        Feature matrix to rescale.

    Returns
    -------
    The scaled version of ``X`` as returned by the scaler.
    """
    return MinMaxScaler().fit_transform(X)

In [None]:
# Fit ONE scaler on the full dataset and reuse it for the gender subsets.
# LogModel is trained on X_train and is later asked to predict on
# X_train_male / X_train_female; if each subset were scaled with its own
# minimum and maximum, the same raw measurement would map to a different
# model input depending on which array it sits in.
feature_scaler = MinMaxScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_train_male = feature_scaler.transform(X_train_male)
X_train_female = feature_scaler.transform(X_train_female)

In [None]:
# Results table built up by the evaluation cells: the first row holds the
# column headers, every later row is one (model, data) evaluation.
performance = [[
    'Model',
    'Trained on Type of data',
    'Predicting data',
    'Accuracy in %',
    'Sensitivity',
    'Specificity',
]]
# Empty list; nothing in the visible cells fills it.
factors = list()

In [None]:
def plt_confusion_matrix(Y, Y_predicted, title='Confusion Matrix'):
    """Draw the confusion matrix of ``Y`` against ``Y_predicted`` as a heatmap.

    The matrix is labelled 'Non-demented' / 'Demented' on both axes, with the
    true label on the y axis and the predicted label on the x axis. Cell
    counts are annotated and no colour bar is drawn. The plot goes onto the
    current matplotlib axes; nothing is returned.

    Parameters
    ----------
    Y : array-like
        True class labels.
    Y_predicted : array-like
        Predicted class labels.
    title : str
        Title placed above the heatmap.
    """
    class_names = ['Non-demented', 'Demented']
    counts = pd.DataFrame(
        confusion_matrix(Y, Y_predicted),
        index=class_names,
        columns=class_names,
    )
    ax = sb.heatmap(counts, annot=True, fmt='g', cbar=False)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    ax.set_title(title)

## Logistic Regression for entire data

In [None]:
# Fit the global model on every subject and score it on that same data
# (no held-out split is used, so these are training-set metrics).
LogModel = LogisticRegression(solver='lbfgs')
LogModel.fit(X_train, Y_train)
Y_predicted = LogModel.predict(X_train)
acc = accuracy_score(Y_train, Y_predicted)

plt_confusion_matrix(Y_train, Y_predicted, title='LogModel on entire data')
plt.savefig('LogModel on entire data.png')

# Rows of the confusion matrix are the true class, columns the predicted class.
cf = confusion_matrix(Y_train, Y_predicted)
(tn, fp), (fn, tp) = cf
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
performance.append([
    'Logistic Regression',
    'Entire dataset',
    'Entire dataset',
    acc * 100,
    sensitivity,
    specificity,
])

In [None]:
# Pair each feature name with the coefficient the global model learned for it.
# Building the frame from a dict keeps 'importance' numeric; stacking names
# and coefficients into one NumPy array would coerce the numbers to strings.
feature_importance = pd.DataFrame({
    'feature': columns,
    'importance': LogModel.coef_.ravel(),
})
feature_importance

In [None]:
# Score the global model on the male subset only.
Y_predicted_male = LogModel.predict(X_train_male)
acc = accuracy_score(Y_train_male, Y_predicted_male)

plt_confusion_matrix(Y_train_male, Y_predicted_male, title='LogModel on Male data')
plt.savefig('LogModel on Male data.png')

# Flattened 2x2 matrix reads row by row: tn, fp, fn, tp.
cf = confusion_matrix(Y_train_male, Y_predicted_male)
tn, fp, fn, tp = cf.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
performance.append([
    'Logistic Regression',
    'Entire dataset',
    'Male dataset',
    acc * 100,
    sensitivity,
    specificity,
])

In [None]:
# Score the global model on the female subset only.
Y_predicted_female = LogModel.predict(X_train_female)
acc = accuracy_score(Y_train_female, Y_predicted_female)

plt_confusion_matrix(Y_train_female, Y_predicted_female, title='LogModel on Female data')
plt.savefig('LogModel on Female data.png')

# Flattened 2x2 matrix reads row by row: tn, fp, fn, tp.
cf = confusion_matrix(Y_train_female, Y_predicted_female)
tn, fp, fn, tp = cf.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
performance.append([
    'Logistic Regression',
    'Entire dataset',
    'Female dataset',
    acc * 100,
    sensitivity,
    specificity,
])

## Important Features for LogModel

In [None]:
# Recursive feature elimination on the full dataset, keeping 4 features.
# `support_` and `ranking_` are positional, so the feature names are printed
# first in the same order as the columns of X_train.
print('Age', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF')
# n_features_to_select has to be given by keyword: current scikit-learn
# releases accept only the estimator positionally.
rfe = RFE(LogModel, n_features_to_select=4)
fit = rfe.fit(X_train, Y_train)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

In [None]:
# Pair each feature name with the coefficient the global model learned for it.
# Building the frame from a dict keeps 'importance' numeric; stacking names
# and coefficients into one NumPy array would coerce the numbers to strings.
feature_importance = pd.DataFrame({
    'feature': columns,
    'importance': LogModel.coef_.ravel(),
})
feature_importance

## Logistic Regression for male data

In [None]:
# Fit a model on the male subset only and score it on that same subset.
LogModel_m = LogisticRegression(solver='lbfgs')
LogModel_m.fit(X_train_male, Y_train_male)
Y_predicted_male = LogModel_m.predict(X_train_male)
acc = accuracy_score(Y_train_male, Y_predicted_male)

plt_confusion_matrix(Y_train_male, Y_predicted_male, title='Male LogModel on Male data')
plt.savefig('Male LogModel on Male data.png')

# Rows of the confusion matrix are the true class, columns the predicted class.
cf = confusion_matrix(Y_train_male, Y_predicted_male)
(tn, fp), (fn, tp) = cf
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
performance.append([
    'Logistic Regression(Male)',
    'Male dataset',
    'Male dataset',
    acc * 100,
    sensitivity,
    specificity,
])

## Important features for Male LogModel

In [None]:
# Recursive feature elimination on the male subset, keeping 4 features.
# `support_` and `ranking_` are positional, so the feature names are printed
# first in the same order as the columns of X_train_male.
print('Age', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF')
# n_features_to_select has to be given by keyword: current scikit-learn
# releases accept only the estimator positionally.
rfe = RFE(LogModel_m, n_features_to_select=4)
fit = rfe.fit(X_train_male, Y_train_male)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

In [None]:
# Pair each feature name with the coefficient the male-only model learned for it.
# Building the frame from a dict keeps 'importance' numeric; stacking names
# and coefficients into one NumPy array would coerce the numbers to strings.
feature_importance = pd.DataFrame({
    'feature': columns,
    'importance': LogModel_m.coef_.ravel(),
})
feature_importance

## Comparing ROC curves on male data by LogModel and LogModel_m

In [None]:
# Score of the second class column ([:, 1]) from the global model on the male data.
y_pred_prob = LogModel.predict_proba(X_train_male)[:, 1]
fpr, tpr, _ = roc_curve(Y_train_male, y_pred_prob)
AUC = auc(fpr, tpr)

# Same quantity from the male-only model.
y_pred_prob_male = LogModel_m.predict_proba(X_train_male)[:, 1]
fpr_m, tpr_m, _ = roc_curve(Y_train_male, y_pred_prob_male)
AUC_m = auc(fpr_m, tpr_m)

# Overlay both ROC curves on one set of axes and save the figure.
plt.plot(fpr, tpr, label='Global Model(AUC= % 0.2f)' % AUC)
plt.plot(fpr_m, tpr_m, label='Male Model(AUC= % 0.2f)' % AUC_m)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve (Male data)')
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig('ROC curve(Male data).png')

## Logistic Regression for female data

In [None]:
# Fit a model on the female subset only and score it on that same subset.
LogModel_f = LogisticRegression(solver='lbfgs')
LogModel_f.fit(X_train_female, Y_train_female)
Y_predicted_female = LogModel_f.predict(X_train_female)
acc = accuracy_score(Y_train_female, Y_predicted_female)

plt_confusion_matrix(Y_train_female, Y_predicted_female, title='Female LogModel on Female data')
plt.savefig('Female LogModel on Female data.png')

# Rows of the confusion matrix are the true class, columns the predicted class.
cf = confusion_matrix(Y_train_female, Y_predicted_female)
(tn, fp), (fn, tp) = cf
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
performance.append([
    'Logistic Regression(Female)',
    'Female dataset',
    'Female dataset',
    acc * 100,
    sensitivity,
    specificity,
])

## Important features for Female LogModel

In [None]:
# Recursive feature elimination on the female subset, keeping 4 features.
# `support_` and `ranking_` are positional, so the feature names are printed
# first in the same order as the columns of X_train_female.
print('Age', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF')
# Use the female model as the estimator template (this section is about
# LogModel_f), and pass n_features_to_select by keyword: current scikit-learn
# releases accept only the estimator positionally.
rfe = RFE(LogModel_f, n_features_to_select=4)
fit = rfe.fit(X_train_female, Y_train_female)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

In [None]:
# Pair each feature name with the coefficient the female-only model learned for it.
# Building the frame from a dict keeps 'importance' numeric; stacking names
# and coefficients into one NumPy array would coerce the numbers to strings.
feature_importance = pd.DataFrame({
    'feature': columns,
    'importance': LogModel_f.coef_.ravel(),
})
feature_importance

## Comparing ROC curves on female data by LogModel and LogModel_f

In [None]:
# Score of the second class column ([:, 1]) from the global model on the female data.
y_pred_prob = LogModel.predict_proba(X_train_female)[:, 1]
fpr, tpr, _ = roc_curve(Y_train_female, y_pred_prob)
AUC = auc(fpr, tpr)

# Same quantity from the female-only model.
y_pred_prob_f = LogModel_f.predict_proba(X_train_female)[:, 1]
fpr_f, tpr_f, _ = roc_curve(Y_train_female, y_pred_prob_f)
AUC_f = auc(fpr_f, tpr_f)

# Overlay both ROC curves on one set of axes and save the figure.
plt.plot(fpr, tpr, label='Global Model(AUC= % 0.2f)' % AUC)
plt.plot(fpr_f, tpr_f, label='Female Model(AUC= % 0.2f)' % AUC_f)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve (Female data)')
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig('ROC curve(Female data).png')

In [None]:
# Show the collected metrics as a table: the first row of `performance` holds
# the column headers, the remaining rows hold one evaluation each.
pd.DataFrame(performance[1:], columns=performance[0])

In [None]:
import csv

# Write the results table (header row first) to Performance.csv, one CSV
# record per entry with every field quoted. The writer is created once, and
# no separator rows are written between records: passing "\n" to writerow
# would emit an extra one-field record containing a newline after every row.
with open('Performance.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(performance)