# Using OASIS-2 dataset for prediction

## The OASIS-2 dataset is a longitudinal dataset, containing data of patients over a period of time for a maximum of 5 visits
## We train a regression model on a patient's previous visits and try to predict whether the patient will be rated as demented at the next visit.

In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used further down
# in the notebook — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the OASIS-2 longitudinal table (one row per patient visit) and preview it
csv_path = 'data/oasis_longitudinal.csv'
df_long = pd.read_csv(csv_path)
df_long

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.010
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,OAS2_0185,OAS2_0185_MR2,Demented,2,842,M,R,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,OAS2_0185,OAS2_0185_MR3,Demented,3,2297,M,R,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,OAS2_0186,OAS2_0186_MR1,Nondemented,1,0,F,R,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,OAS2_0186,OAS2_0186_MR2,Nondemented,2,763,F,R,63,13,2.0,30.0,0.0,1327,0.796,1.323


### Data cleaning to create a suitable dataframe for time-series analysis, using a regression model

In [4]:
# Sanity check: number of rows with a missing visit number
df_long['Visit'].isnull().sum()

0

In [5]:
# Missing-value count for every column of the raw table
df_long.isnull().sum()

Subject ID     0
MRI ID         0
Group          0
Visit          0
MR Delay       0
M/F            0
Hand           0
Age            0
EDUC           0
SES           19
MMSE           2
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

In [6]:
# Remove every visit that lacks an 'SES' or an 'MMSE' value (the only columns with gaps)
df_long.dropna(subset=['SES', 'MMSE'], how='any', inplace=True)

In [7]:
# Distinct values recorded in the 'Hand' column
pd.unique(df_long['Hand'])

array(['R'], dtype=object)

In [8]:
# 'Hand' holds a single value ('R') and 'MRI ID' only labels the scan, so neither is kept
unused_columns = ['MRI ID', 'Hand']
df_long.drop(columns=unused_columns, inplace=True)

In [9]:
# Rename 'M/F' to 'Gender' and encode it numerically: male -> 1, female -> 0
gender_codes = {'M': 1, 'F': 0}
df_long = df_long.rename(columns={'M/F': 'Gender'})
df_long['Gender'] = df_long['Gender'].map(gender_codes)

In [10]:
# Overwrite 'Group' with a binary label: 1 when the visit's 'CDR' is above 0, otherwise 0

df_long['Group'] = df_long['CDR'].gt(0).astype(int)

In [11]:
# Preview the table after dropping columns and recoding 'Gender' and 'Group'
df_long

Unnamed: 0,Subject ID,Group,Visit,MR Delay,Gender,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,0,1,0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,0,2,457,1,88,14,2.0,30.0,0.0,2004,0.681,0.876
5,OAS2_0004,0,1,0,0,88,18,3.0,28.0,0.0,1215,0.710,1.444
6,OAS2_0004,0,2,538,0,90,18,3.0,27.0,0.0,1200,0.718,1.462
7,OAS2_0005,0,1,0,1,80,12,4.0,28.0,0.0,1689,0.712,1.039
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,OAS2_0185,1,2,842,1,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,OAS2_0185,1,3,2297,1,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,OAS2_0186,0,1,0,0,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,OAS2_0186,0,2,763,0,63,13,2.0,30.0,0.0,1327,0.796,1.323


In [12]:
# Class balance of the binary 'Group' label (counted per visit, not per patient)
df_long['Group'].value_counts()

Group
0    206
1    148
Name: count, dtype: int64

In [13]:
# 'Group' is the prediction target and was derived directly from 'CDR' (Group = CDR > 0),
# so 'CDR' is removed rather than kept as a predictor

del df_long['CDR']

In [14]:
# Reorder the columns so that the target 'Group' becomes the last one,
# which makes it easy to separate from the predictors later on
target_column = 'Group'
ordered_columns = [name for name in df_long.columns if name != target_column]
ordered_columns.append(target_column)
df_long = df_long.reindex(columns=ordered_columns)

In [15]:
# Preview the table with 'Group' moved to the last column
df_long

Unnamed: 0,Subject ID,Visit,MR Delay,Gender,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Group
0,OAS2_0001,1,0,1,87,14,2.0,27.0,1987,0.696,0.883,0
1,OAS2_0001,2,457,1,88,14,2.0,30.0,2004,0.681,0.876,0
5,OAS2_0004,1,0,0,88,18,3.0,28.0,1215,0.710,1.444,0
6,OAS2_0004,2,538,0,90,18,3.0,27.0,1200,0.718,1.462,0
7,OAS2_0005,1,0,1,80,12,4.0,28.0,1689,0.712,1.039,0
...,...,...,...,...,...,...,...,...,...,...,...,...
368,OAS2_0185,2,842,1,82,16,1.0,28.0,1693,0.694,1.037,1
369,OAS2_0185,3,2297,1,86,16,1.0,26.0,1688,0.675,1.040,1
370,OAS2_0186,1,0,0,61,13,2.0,30.0,1319,0.801,1.331,0
371,OAS2_0186,2,763,0,63,13,2.0,30.0,1327,0.796,1.323,0


In [16]:
# Renumber the rows 0..n-1 after the row drops; `df` and `df_long` refer to the same frame
df = df_long = df_long.reset_index(drop=True)

In [17]:
# Column dtypes of the cleaned table
df.dtypes

Subject ID     object
Visit           int64
MR Delay        int64
Gender          int64
Age             int64
EDUC            int64
SES           float64
MMSE          float64
eTIV            int64
nWBV          float64
ASF           float64
Group           int32
dtype: object

In [18]:
# Confirm that no missing values remain after cleaning
df.isnull().sum()

Subject ID    0
Visit         0
MR Delay      0
Gender        0
Age           0
EDUC          0
SES           0
MMSE          0
eTIV          0
nWBV          0
ASF           0
Group         0
dtype: int64

### Operating on cleaned data to convert to a form suitable for time-series analysis

In [19]:
# Every patient is identified by 'Subject ID'; count the distinct IDs left after cleaning
total_unique_patients = df_long.groupby('Subject ID').ngroups

# Report the patient count
print("Total number of unique patients:", total_unique_patients)

Total number of unique patients: 142


In [20]:
# Distinct visits per patient, then the number of patients for each visit count

visit_counts = df_long.groupby('Subject ID')['Visit'].nunique()
patients_with_counts = visit_counts.value_counts()

# One report line per possible visit count (1 to 5); a count with no patients prints 0
report_lines = (
    (1, "Number of patients with 1 visit:"),
    (2, "Number of patients with 2 visits:"),
    (3, "Number of patients with 3 visits:"),
    (4, "Number of patients with 4 visits:"),
    (5, "Number of patients with 5 visits:"),
)
for n_visits, label in report_lines:
    print(label, patients_with_counts.get(n_visits, 0))

Number of patients with 1 visit: 0
Number of patients with 2 visits: 89
Number of patients with 3 visits: 40
Number of patients with 4 visits: 9
Number of patients with 5 visits: 4


In [21]:
# Keep only the patients whose recorded visits are exactly visits 1, 2 and 3.
# Counting distinct visits alone would also accept a patient seen at, say, visits 1, 3 and 4;
# such a patient has no visit-2 row, which would misalign the per-visit slices built later.
required_visits = {1, 2, 3}

df = df_long.groupby('Subject ID').filter(lambda visits: set(visits['Visit']) == required_visits)
df

Unnamed: 0,Subject ID,Visit,MR Delay,Gender,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Group
4,OAS2_0005,1,0,1,80,12,4.0,28.0,1689,0.712,1.039,0
5,OAS2_0005,2,1010,1,83,12,4.0,29.0,1701,0.711,1.032,1
6,OAS2_0005,3,1603,1,85,12,4.0,30.0,1699,0.705,1.033,0
13,OAS2_0012,1,0,0,78,16,2.0,29.0,1333,0.748,1.316,0
14,OAS2_0012,2,730,0,80,16,2.0,29.0,1323,0.738,1.326,0
...,...,...,...,...,...,...,...,...,...,...,...,...
349,OAS2_0185,2,842,1,82,16,1.0,28.0,1693,0.694,1.037,1
350,OAS2_0185,3,2297,1,86,16,1.0,26.0,1688,0.675,1.040,1
351,OAS2_0186,1,0,0,61,13,2.0,30.0,1319,0.801,1.331,0
352,OAS2_0186,2,763,0,63,13,2.0,30.0,1327,0.796,1.323,0


In [22]:
# Renumber the rows of the filtered frame 0..n-1 (the old index is discarded)
df = df.reset_index(drop=True)

In [23]:
# Preview the three-visit subset after re-indexing
df

Unnamed: 0,Subject ID,Visit,MR Delay,Gender,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Group
0,OAS2_0005,1,0,1,80,12,4.0,28.0,1689,0.712,1.039,0
1,OAS2_0005,2,1010,1,83,12,4.0,29.0,1701,0.711,1.032,1
2,OAS2_0005,3,1603,1,85,12,4.0,30.0,1699,0.705,1.033,0
3,OAS2_0012,1,0,0,78,16,2.0,29.0,1333,0.748,1.316,0
4,OAS2_0012,2,730,0,80,16,2.0,29.0,1323,0.738,1.326,0
...,...,...,...,...,...,...,...,...,...,...,...,...
115,OAS2_0185,2,842,1,82,16,1.0,28.0,1693,0.694,1.037,1
116,OAS2_0185,3,2297,1,86,16,1.0,26.0,1688,0.675,1.040,1
117,OAS2_0186,1,0,0,61,13,2.0,30.0,1319,0.801,1.331,0
118,OAS2_0186,2,763,0,63,13,2.0,30.0,1327,0.796,1.323,0


### We perform prediction via different regression models and compare their 'correctness'
### We use 'ensemble' based techniques for regression like Random Forest Regressor, XGBoost Regressor and Gradient Boosting Regressor

In [24]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [25]:
# Feature Selection: the per-visit measurements used as predictors
# ('Subject ID', 'Visit' and the target 'Group' are left out)
features = ['MR Delay', 'Gender', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']

# Time Series Transformation: reshape from one row per visit to one row per patient,
# with one (feature, visit) column for every measurement.
# NOTE(review): in the cells visible here time_series_df is only displayed; the model
# inputs are built directly from df.
time_series_df = df.pivot(index='Subject ID', columns='Visit', values=features)

In [26]:
# Show the wide table: one row per patient, one column per (feature, visit) pair
time_series_df

Unnamed: 0_level_0,MR Delay,MR Delay,MR Delay,Gender,Gender,Gender,Age,Age,Age,EDUC,...,MMSE,eTIV,eTIV,eTIV,nWBV,nWBV,nWBV,ASF,ASF,ASF
Visit,1,2,3,1,2,3,1,2,3,1,...,3,1,2,3,1,2,3,1,2,3
Subject ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
OAS2_0005,0.0,1010.0,1603.0,1.0,1.0,1.0,80.0,83.0,85.0,12.0,...,30.0,1689.0,1701.0,1699.0,0.712,0.711,0.705,1.039,1.032,1.033
OAS2_0012,0.0,730.0,1598.0,0.0,0.0,0.0,78.0,80.0,83.0,16.0,...,29.0,1333.0,1323.0,1323.0,0.748,0.738,0.718,1.316,1.326,1.327
OAS2_0013,0.0,643.0,1456.0,0.0,0.0,0.0,81.0,82.0,85.0,12.0,...,29.0,1230.0,1212.0,1225.0,0.715,0.72,0.71,1.427,1.448,1.433
OAS2_0018,0.0,489.0,1933.0,0.0,0.0,0.0,87.0,88.0,92.0,14.0,...,27.0,1406.0,1398.0,1423.0,0.715,0.713,0.696,1.248,1.255,1.234
OAS2_0020,0.0,756.0,1563.0,1.0,1.0,1.0,80.0,82.0,84.0,20.0,...,26.0,1587.0,1606.0,1597.0,0.693,0.677,0.666,1.106,1.093,1.099
OAS2_0031,0.0,446.0,1588.0,0.0,0.0,0.0,86.0,88.0,91.0,12.0,...,28.0,1430.0,1445.0,1463.0,0.718,0.719,0.696,1.227,1.215,1.199
OAS2_0040,0.0,567.0,1204.0,1.0,1.0,1.0,84.0,86.0,88.0,6.0,...,23.0,1310.0,1320.0,1348.0,0.727,0.724,0.713,1.339,1.329,1.302
OAS2_0041,0.0,756.0,1331.0,0.0,0.0,0.0,71.0,73.0,75.0,16.0,...,28.0,1289.0,1295.0,1314.0,0.771,0.768,0.76,1.362,1.356,1.335
OAS2_0044,0.0,352.0,866.0,1.0,1.0,1.0,68.0,69.0,71.0,14.0,...,22.0,1333.0,1331.0,1332.0,0.685,0.678,0.679,1.317,1.318,1.317
OAS2_0049,0.0,395.0,687.0,0.0,0.0,0.0,69.0,70.0,71.0,16.0,...,30.0,1491.0,1505.0,1503.0,0.794,0.791,0.788,1.177,1.166,1.168


In [27]:
# We take the medical data of patients for the first two visits as our features.
# Each slice is indexed by 'Subject ID' so the visits are matched per patient instead of by
# row position, and the '_v1' / '_v2' suffixes give every predictor a unique column name
# (a positional concat labels the columns 0..8 twice, so X[0] would select two columns).
first_visit = df[df['Visit'] == 1].set_index('Subject ID')[features].add_suffix('_v1')
second_visit = df[df['Visit'] == 2].set_index('Subject ID')[features].add_suffix('_v2')

# And take the outcome of the 3rd visit as the result
third_visit_group = df[df['Visit'] == 3].set_index('Subject ID')['Group']

# The inner join keeps only patients that have a row for all three visits, so X and y
# always have the same length and the same patient order
design = pd.concat([first_visit, second_visit, third_visit_group], axis=1, join='inner')
X = design.drop(columns='Group')
y = design['Group'].values

# Per-visit feature arrays, kept under their original names and aligned with X
X_1st = X[first_visit.columns].values
X_2nd = X[second_visit.columns].values

# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

### For normal regression models like Logistic Regression, Support Vector Regression and Decision Tree Regression, and Ensemble techniques like AdaBoost, Random Forest, XGBoost and Gradient Boost

In [40]:
# Candidate hyperparameter values for the randomized search, one dictionary per model.
# Value lists used by several models are written once and copied into each dictionary.
c_values = [0.1, 1, 10]
ensemble_sizes = np.arange(50, 251, 50)  # 50, 100, 150, 200, 250
tree_depths = [3, 4, 5]
learning_rates = [0.01, 0.1, 0.2, 0.3]

# Logistic regression: regularisation strength only
param_dist_lr = dict(C=list(c_values))

# Support vector regression: regularisation strength and kernel
param_dist_svr = dict(C=list(c_values), kernel=['linear', 'rbf', 'poly'])

# Decision tree: depth and the split / leaf size limits
param_dist_dt = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# AdaBoost: ensemble size and learning rate
param_dist_ada = dict(
    n_estimators=ensemble_sizes.copy(),
    learning_rate=list(learning_rates),
)

# Gradient boosting: ensemble size, tree depth and learning rate
param_dist_gb = dict(
    n_estimators=ensemble_sizes.copy(),
    max_depth=list(tree_depths),
    learning_rate=list(learning_rates),
)

# Random forest: ensemble size and tree depth
param_dist_rf = dict(
    n_estimators=ensemble_sizes.copy(),
    max_depth=list(tree_depths),
)

In [41]:
# Registry of candidate models: display name, untrained estimator, and its search space.
# Note that LogisticRegression is a classifier, whereas every other entry is a regressor.
model_specs = [
    ('Logistic Regression', LogisticRegression(), param_dist_lr),
    ('SVM Regression', SVR(), param_dist_svr),
    ('Decision Tree Regression', DecisionTreeRegressor(), param_dist_dt),
    ('AdaBoost', AdaBoostRegressor(), param_dist_ada),
#     ('XGBoost', XGBRegressor(), param_dist_gb),  # currently disabled
    ('Gradient Boosting', GradientBoostingRegressor(), param_dist_gb),
    ('Random Forest', RandomForestRegressor(), param_dist_rf),
]

# name -> (estimator, parameter distributions), in the order listed above
models = {label: (estimator, search_space) for label, estimator, search_space in model_specs}

In [42]:
best_models = {}

# One seed for the candidate sampling and for every estimator that accepts one,
# so that a Restart & Run All reproduces the same winners.
SEARCH_SEED = 42

# Train models and find best parameters
for model_name, (model, param_dist) in models.items():
    # Estimators that have no random_state parameter are left untouched
    if 'random_state' in model.get_params():
        model.set_params(random_state=SEARCH_SEED)

    # Every search space here is a finite list of values, so it may contain fewer than
    # 100 combinations; never ask the search for more candidates than exist.
    n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=min(100, n_candidates),
        cv=5,
        random_state=SEARCH_SEED,
    )
    random_search.fit(X_train, y_train)

    # best_estimator_ has already been refitted on all of X_train (refit defaults to True)
    best_params = random_search.best_params_
    best_models[model_name] = (random_search.best_estimator_, best_params)
    print(f"Best Parameters for {model_name}: {best_params}")

Best Parameters for Logistic Regression: {'C': 0.1}
Best Parameters for SVM Regression: {'kernel': 'rbf', 'C': 1}
Best Parameters for Decision Tree Regression: {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 4}
Best Parameters for AdaBoost: {'n_estimators': 50, 'learning_rate': 0.1}
Best Parameters for Gradient Boosting: {'n_estimators': 150, 'max_depth': 3, 'learning_rate': 0.01}
Best Parameters for Random Forest: {'n_estimators': 200, 'max_depth': 4}


In [43]:
# Predict using the best models.
# The search was run with its default refit=True, so each best_estimator_ is already trained
# on all of X_train with the winning parameters. Fitting it again here would be redundant
# and, for estimators with internal randomness, would evaluate a different model from the
# one the search selected.
predictions = {}
for model_name, (model, _) in best_models.items():
    y_pred = model.predict(X_test)
    predictions[model_name] = y_pred

### Visualizing the results

In [47]:
# One (model name, mean squared error, R-squared) record per model, scored on the test split
results = [
    (name, mean_squared_error(y_test, predicted), r2_score(y_test, predicted))
    for name, predicted in predictions.items()
]

In [49]:
# Tabulate the test-set metrics, one row per model
result_columns = ['Model', 'MSE', 'R-squared']
results_df = pd.DataFrame(data=results, columns=result_columns)
results_df

Unnamed: 0,Model,MSE,R-squared
0,Logistic Regression,0.5,-1.0
1,SVM Regression,0.406068,-0.624273
2,Decision Tree Regression,0.138889,0.444444
3,AdaBoost,0.063533,0.745868
4,Gradient Boosting,0.086321,0.654716
5,Random Forest,0.100921,0.596316
