# No More Alzheimer's Disease

## Data Downloading

In [134]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from timeit import default_timer as timer

In [135]:
def read_and_merge(labelsDir, featuresDir, fieldsToDrop, savePath=None):
    '''
     Read the feature and label CSV files, merge them and split the rows by diagnosis.

     The feature rows are left-joined with the DX column of the label file on the
     RID and VISCODE attributes. Every row that still contains a null value is
     discarded, which also removes feature rows that found no matching label.

     Args :
        labelsDir (String) : Path to the CSV file holding the labels. It must contain the RID, VISCODE and DX columns.
        featuresDir (String) : Path to the CSV file holding the feature data.
        fieldsToDrop (String[]) : List of columns to drop from the merged data.
        savePath (String) [OPTIONAL] : When given, the merged data is written to this path as CSV.
     Returns :
        SCD, MCI, AD (DataFrame) : Returns a DataFrame of each class.'''

    # Read the two files that are necessary
    features = pd.read_csv(featuresDir)
    labels = pd.read_csv(labelsDir)

    # Merge the two files and remove any rows that have null values
    merged_data = pd.merge(
        features,
        labels[['RID', 'VISCODE', 'DX']],
        on=['RID', 'VISCODE'],
        how='left',
    ).dropna()
    # Remove the columns the caller does not want to keep
    merged_data = merged_data.drop(fieldsToDrop, axis=1)
    # Rename the diagnosis values to the class names used by the classifiers
    merged_data = merged_data.replace("Dementia", "AD").replace("CN", "SCD")

    MCI = merged_data.loc[merged_data["DX"] == "MCI"]
    SCD = merged_data.loc[merged_data["DX"] == "SCD"]
    AD = merged_data.loc[merged_data["DX"] == "AD"]

    # Identity check: None is the "do not save" marker
    if savePath is not None:
        merged_data.to_csv(savePath)

    return SCD, MCI, AD

    

# Construct the Classifier

In [136]:
def getXy(df):
    '''
    Split a DataFrame into its feature values and its label values.

    Args : 
        df (DataFrame) : The data to be separated. The columns at positions 1 and 2
                         are used as features and the column at position 3 as label.
    Returns :
        X : The values of the two feature columns.
        y : The values of the label column.'''

    feature_positions = [1, 2]
    label_position = 3

    # Independent variables
    feature_frame = df.iloc[:, feature_positions]
    # Dependent variable
    label_series = df.iloc[:, label_position]

    return feature_frame.values, label_series.values

In [137]:
def split_test_data(SCD, MCI, AD, TestingFactor=0.25, random_state=None):
    '''
        Separate the training and testing data. Each class is split on its own
        and the held-out parts of all three classes are combined into one test set.

        Args : 
            SCD, MCI, AD (DataFrame) : Data to be split, already split into classes.
            TestingFactor (float) [OPTIONAL] : Passed to train_test_split as test_size for every class. Defaults to 0.25.
            random_state [OPTIONAL] : Passed to train_test_split for every class so a split can be repeated.
                                      The default None keeps the split random on every call.
        Returns :
            SCD, MCI, AD, TestData (DataFrame) : The training part of each class, followed by the combined test data.
    
    '''

    training_parts = []
    held_out_parts = []

    # Split every class separately, in the order SCD, MCI, AD
    for class_data in (SCD, MCI, AD):
        train_part, test_part = train_test_split(
            class_data, test_size=TestingFactor, random_state=random_state
        )
        training_parts.append(train_part)
        held_out_parts.append(test_part)

    # Concatenate the held-out parts into a single test set
    TestData = pd.concat(held_out_parts)
    SCD, MCI, AD = training_parts

    # return required info
    return SCD, MCI, AD, TestData

In [138]:
def construct_svm(df1, df2):
    '''
        Train an RBF-kernel SVM on the rows of the two datasets provided.

        Args :
            df1, df2 (DataFrame) : The two DataFrames the SVM is trained on. They are
                                   concatenated and split into features and labels with getXy.
        Returns :
            classifier (SVC) : The SVM fitted on the standardised features.
            sc (StandardScaler) : The scaler that was fitted on the training features.
    '''

    # Stack both datasets into one training set
    training_frame = pd.concat([df1, df2])
    # Separate features and labels
    features, targets = getXy(training_frame)

    # Standardise the features and keep the fitted scaler for later use
    feature_scaler = StandardScaler()
    scaled_features = feature_scaler.fit_transform(features)

    # Fit the classifier on the scaled features
    svm_model = SVC(kernel='rbf', random_state=0)
    svm_model.fit(scaled_features, targets)

    return svm_model, feature_scaler

In [139]:
def test(classifier, scaler, X):
    '''
        Test the dataset with each individual SVM

        Args :
            classifier (SVM) : The Support Vector Machine used for this test
            scaler (StandardScaler) : This allows the test data to be scaled to the same proportions as the test data
            X (DataFrame) : The feature data WITHOUT labels
        Returns :
            y_pred (list) : the predicted y-value for each item
    '''
    # now perform the classification
    X = scaler.fit_transform(X)
    y_pred = classifier.predict(X)
    # return the result of this transcation
    return y_pred

In [140]:
def bdt(file_path):
    '''
        Perform the Binary Decision Tree SVM classification method.

        A first SVM separates SCD from the combined MCI/AD class. A second SVM
        separates MCI from AD and provides the label for every test item that
        the first level did not classify as SCD.

        Args :
            file_path (String) : Path to the feature data
        Returns :
            time (float) : Time taken to read the data, train and classify.
            accuracy (float) : The accuracy of the model on the test data.
            confusion_matrix : Confusion matrix of the true labels against the predictions.'''

    start = timer()
    # read the dataframe and clean
    SCD, MCI, AD = read_and_merge(
        'Data/ADNIMERGE_15Jun2023.csv',
        file_path,
        ['update_stamp', 'VISCODE'],
        'Data/TempFiles/mergeddata.csv',
    )

    # get the test data required, leave training data
    SCD, MCI, AD, TestData = split_test_data(SCD, MCI, AD)

    # separate the labels and the data features
    X_test, y_test = getXy(TestData)

    # Combine MCI and AD and map both to the same class for the first level
    mci_or_ad_train = pd.concat([MCI, AD])
    mci_or_ad_train = mci_or_ad_train.replace("MCI", "MCIoAD").replace("AD", "MCIoAD")

    # SVM for the first level of the BDT : SCD against MCI or AD
    first_level_svm, first_level_scaler = construct_svm(SCD, mci_or_ad_train)
    # SVM for the second level of the BDT : MCI against AD
    second_level_svm, second_level_scaler = construct_svm(MCI, AD)

    # Perform the test for the first level of BDT
    first_level_pred = test(first_level_svm, first_level_scaler, X_test)

    # Now run entire set through the MCI, AD classifier.
    # However, only the non-SCD items in the previous test will be used
    second_level_pred = test(second_level_svm, second_level_scaler, X_test)

    # Collate the results: keep SCD from the first level, otherwise take the
    # result of the second level
    results = [
        'SCD' if first == 'SCD' else second
        for first, second in zip(first_level_pred, second_level_pred)
    ]

    print(second_level_pred)

    # Print out the time taken and results
    end = timer()
    elapsed = end - start
    print("Time Taken : " + str(elapsed))

    # construct a confusion matrix
    cm = confusion_matrix(y_test, results)
    print(cm)
    accuracy = accuracy_score(y_test, results)
    print("Accuracy : " + str(accuracy))

    # Hand the measurements back to the caller, as the docstring promises
    return elapsed, accuracy, cm

In [141]:
# Run the two-level SVM classification on the UPENN plasma feature file
bdt("Data/Plasma/UPENNPLASMA_27Nov2023.csv")

['MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD'
 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIoAD' 'MCIo

  labels = pd.read_csv(labelsDir)
