In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sb
import pickle
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 8,10

In [2]:
long_df = pd.read_csv("oasis_longitudinal.csv")

In [3]:
long_df = long_df.drop("Subject ID",axis = 1)
long_df = long_df.drop("MRI ID",axis =1)
#dropping Hand column
long_df=long_df.drop("Hand",axis=1)
long_df.SES = long_df.SES.fillna(round(long_df.SES.mean()))
long_df.MMSE = long_df.MMSE.fillna(round(long_df.MMSE.mean()))
long_df['Group'] = long_df['Group'].replace('Converted','Demented') 
#creating dummy variables
long_df = pd.get_dummies(data= long_df,columns = {'Group','M/F'})
long_df = long_df.rename(columns={'M/F_F':'Female','M/F_M':'Male','Group_Demented':'Demented','Group_Nondemented':'Non-Demented'})
#Male=1 and Female=0
long_df=long_df.drop("Female",axis=1)
long_df = long_df.rename(columns={"Male":"Gender"})
#Demented=1 and Non-demented=0
long_df = long_df.drop("Non-Demented",axis=1)
long_df = long_df.rename(columns={"Demented":"Group"})

In [4]:
lower_df=long_df[long_df['Age']<long_df['Age'].median()]
upper_df=long_df[long_df['Age']>=long_df['Age'].median()]

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler

In [6]:
columns=['Gender', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF']
performance=[]
features=[]

## Logistic Regression for Lower Median Age

In [7]:
def split_data():
    y = lower_df['Group']
    x = lower_df[['Gender', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF']]
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.33,random_state = 0)
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test,Y_train,Y_test

In [8]:
X_train,X_test,Y_train,Y_test = split_data()

  return self.partial_fit(X, y)


In [9]:
LogModel_l= LogisticRegression(solver='lbfgs').fit(X_train,Y_train)#vanilla logistic regression
Y_predicted = LogModel_l.predict(X_test)
acc = accuracy_score(Y_test,Y_predicted)
recall = recall_score(Y_test,Y_predicted,pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test,Y_predicted,pos_label=1)
AUC = auc(fpr,tpr)
performance.append(['Logistic Regression',acc,recall,AUC,fpr,tpr,thresholds])
performance

[['Logistic Regression',
  0.6721311475409836,
  0.6875,
  0.6713362068965517,
  array([0.        , 0.34482759, 1.        ]),
  array([0.    , 0.6875, 1.    ]),
  array([2, 1, 0])]]

In [10]:
LogModel_l.coef_.T

array([[ 0.84485058],
       [-1.34161391],
       [-0.08345326],
       [-2.21927517],
       [-0.10039206],
       [-1.94077453],
       [ 0.12799545]])

In [11]:
np.array([columns]).T

array([['Gender'],
       ['EDUC'],
       ['SES'],
       ['MMSE'],
       ['eTIV'],
       ['nWBV'],
       ['ASF']], dtype='<U6')

In [12]:
feature_importance=pd.DataFrame(np.hstack((np.array([columns]).T, LogModel_l.coef_.T)), 
                                columns=['feature', 'importance'])
feature_importance['importance']=pd.to_numeric(feature_importance['importance'])
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,Gender,0.844851
6,ASF,0.127995
2,SES,-0.083453
4,eTIV,-0.100392
1,EDUC,-1.341614
5,nWBV,-1.940775
3,MMSE,-2.219275


## Logistic Regression for upper Median Age

In [13]:
def split_data():
    y = upper_df['Group']
    x = upper_df[['Gender', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF']]
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.33,random_state = 0)
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test,Y_train,Y_test

In [14]:
X_train,X_test,Y_train,Y_test = split_data()

  return self.partial_fit(X, y)


In [15]:
LogModel_u= LogisticRegression(solver='lbfgs').fit(X_train,Y_train)#vanilla logistic regression
Y_predicted = LogModel_u.predict(X_test)
acc = accuracy_score(Y_test,Y_predicted)
recall = recall_score(Y_test,Y_predicted,pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test,Y_predicted,pos_label=1)
AUC = auc(fpr,tpr)
performance.append(['Logistic Regression',acc,recall,AUC,fpr,tpr,thresholds])
performance

[['Logistic Regression',
  0.6721311475409836,
  0.6875,
  0.6713362068965517,
  array([0.        , 0.34482759, 1.        ]),
  array([0.    , 0.6875, 1.    ]),
  array([2, 1, 0])],
 ['Logistic Regression',
  0.71875,
  0.6206896551724138,
  0.7103448275862069,
  array([0. , 0.2, 1. ]),
  array([0.        , 0.62068966, 1.        ]),
  array([2, 1, 0])]]

In [16]:
LogModel_u.coef_.T

array([[ 1.41535882],
       [-1.0135976 ],
       [-0.23900738],
       [-2.96827451],
       [-1.26364737],
       [-1.5163828 ],
       [ 1.01810868]])

In [17]:
np.array([columns]).T

array([['Gender'],
       ['EDUC'],
       ['SES'],
       ['MMSE'],
       ['eTIV'],
       ['nWBV'],
       ['ASF']], dtype='<U6')

In [18]:
feature_importance=pd.DataFrame(np.hstack((np.array([columns]).T, LogModel_u.coef_.T)), 
                                columns=['feature', 'importance'])
feature_importance['importance']=pd.to_numeric(feature_importance['importance'])
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,Gender,1.415359
6,ASF,1.018109
2,SES,-0.239007
1,EDUC,-1.013598
4,eTIV,-1.263647
5,nWBV,-1.516383
3,MMSE,-2.968275


## Logistic Regression for Entire Data Set

In [19]:
def split_data():
    y = long_df['Group']
    x = long_df[['Gender', 'EDUC', 'SES', 'MMSE', 'eTIV',
       'nWBV', 'ASF']]
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.33,random_state = 0)
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test,Y_train,Y_test

In [20]:
X_train,X_test,Y_train,Y_test = split_data()

  return self.partial_fit(X, y)


In [21]:
LogModel= LogisticRegression(solver='lbfgs').fit(X_train,Y_train)#vanilla logistic regression
Y_predicted = LogModel.predict(X_test)
acc = accuracy_score(Y_test,Y_predicted)
recall = recall_score(Y_test,Y_predicted,pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test,Y_predicted,pos_label=1)
AUC = auc(fpr,tpr)
performance.append(['Logistic Regression',acc,recall,AUC,fpr,tpr,thresholds])
performance

[['Logistic Regression',
  0.6721311475409836,
  0.6875,
  0.6713362068965517,
  array([0.        , 0.34482759, 1.        ]),
  array([0.    , 0.6875, 1.    ]),
  array([2, 1, 0])],
 ['Logistic Regression',
  0.71875,
  0.6206896551724138,
  0.7103448275862069,
  array([0. , 0.2, 1. ]),
  array([0.        , 0.62068966, 1.        ]),
  array([2, 1, 0])],
 ['Logistic Regression',
  0.7419354838709677,
  0.7241379310344828,
  0.7408568443051201,
  array([0.        , 0.24242424, 1.        ]),
  array([0.        , 0.72413793, 1.        ]),
  array([2, 1, 0])]]

In [22]:
LogModel.coef_.T

array([[ 1.13231914],
       [-1.18723751],
       [-0.30888215],
       [-4.19271756],
       [-0.77937985],
       [-2.17057053],
       [ 0.48214063]])

In [23]:
np.array([columns]).T

array([['Gender'],
       ['EDUC'],
       ['SES'],
       ['MMSE'],
       ['eTIV'],
       ['nWBV'],
       ['ASF']], dtype='<U6')

In [24]:
feature_importance=pd.DataFrame(np.hstack((np.array([columns]).T, LogModel.coef_.T)), 
                                columns=['feature', 'importance'])
feature_importance['importance']=pd.to_numeric(feature_importance['importance'])
feature_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,Gender,1.132319
6,ASF,0.482141
2,SES,-0.308882
4,eTIV,-0.77938
1,EDUC,-1.187238
5,nWBV,-2.170571
3,MMSE,-4.192718


In [25]:
performance

[['Logistic Regression',
  0.6721311475409836,
  0.6875,
  0.6713362068965517,
  array([0.        , 0.34482759, 1.        ]),
  array([0.    , 0.6875, 1.    ]),
  array([2, 1, 0])],
 ['Logistic Regression',
  0.71875,
  0.6206896551724138,
  0.7103448275862069,
  array([0. , 0.2, 1. ]),
  array([0.        , 0.62068966, 1.        ]),
  array([2, 1, 0])],
 ['Logistic Regression',
  0.7419354838709677,
  0.7241379310344828,
  0.7408568443051201,
  array([0.        , 0.24242424, 1.        ]),
  array([0.        , 0.72413793, 1.        ]),
  array([2, 1, 0])]]