In [54]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [23]:
csec = pd.read_csv('oasis_cross-sectional.csv')

In [25]:
csec.head()

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
3,OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
4,OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


In [26]:
csec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      436 non-null    object 
 1   M/F     436 non-null    object 
 2   Hand    436 non-null    object 
 3   Age     436 non-null    int64  
 4   Educ    235 non-null    float64
 5   SES     216 non-null    float64
 6   MMSE    235 non-null    float64
 7   CDR     235 non-null    float64
 8   eTIV    436 non-null    int64  
 9   nWBV    436 non-null    float64
 10  ASF     436 non-null    float64
 11  Delay   20 non-null     float64
dtypes: float64(7), int64(2), object(3)
memory usage: 41.0+ KB


In [30]:
#target CDR
df = csec[~csec.CDR.isna()]

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235 entries, 0 to 415
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      235 non-null    object 
 1   M/F     235 non-null    object 
 2   Hand    235 non-null    object 
 3   Age     235 non-null    int64  
 4   Educ    235 non-null    float64
 5   SES     216 non-null    float64
 6   MMSE    235 non-null    float64
 7   CDR     235 non-null    float64
 8   eTIV    235 non-null    int64  
 9   nWBV    235 non-null    float64
 10  ASF     235 non-null    float64
 11  Delay   0 non-null      float64
dtypes: float64(7), int64(2), object(3)
memory usage: 23.9+ KB


## Data Variables Descriptions

| Variable   | Description                                           |
|------------|-------------------------------------------------------|
| Subject.ID | MRI Identification                          |
## Demographics Info

| Variable | Description                                  |
|----------|----------------------------------------------|
| M.F      | Gender                                       |
| Hand     | Handedness                  |
| Age      | Age                                          |
| EDUC     | Years of education                            |
| SES      | Socioeconomic status (Hollingshead Index)    |

## Clinical Info

| Variable | Description                                              |
|----------|----------------------------------------------------------|
| MMSE     | Mini-Mental State Examination score (0 to 30)            |

## Derived Anatomic Volumes

| Variable | Description                                         |
|----------|-----------------------------------------------------|
| eTIV     | Estimated total intracranial volume (mm3)           |
| nWBV     | Normalized whole-brain volume (percent)             |
| ASF      | Atlas scaling factor (unitless)                    |


Target - CDR (Clinical Dementia Rating)

1. **CDR 0:** No impairment
2. **CDR 0.5:** Very mild impairment
3. **CDR 1:** Mild impairment
4. **CDR 2:** Moderate impairment
5. **CDR 3:** Severe impairment

In [49]:
df.CDR.value_counts()

0.0    135
0.5     70
1.0     28
2.0      2
Name: CDR, dtype: int64

### As observed, there are no patients with a severe level of the disease, and only two exhibit a moderate level. Consequently, we can reclassify the target variable to develop a model that predicts whether a patient has no impairment or exhibits a specific level of impairment.

In [50]:
df['target'] = np.where(df.CDR==0, 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = np.where(df.CDR==0, 0, 1)


In [51]:
df.target.value_counts()

0    135
1    100
Name: target, dtype: int64

In [52]:
df.head()

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,target
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,,0
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,,0
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,,1
8,OAS1_0010_MR1,M,R,74,5.0,2.0,30.0,0.0,1636,0.689,1.073,,0
9,OAS1_0011_MR1,F,R,52,3.0,2.0,30.0,0.0,1321,0.827,1.329,,0


#### not using Delay since all variables are na,
#### not using Hands since all samples are right handed

In [53]:
# Define columns based on data types
numeric_features = ['Age', 'Educ','SES','MMSE','eTIV', 'nWBV','ASF']
categorical_features = ['M/F']
X = df[numeric_features+categorical_features]
y = ['CDR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='if_binary', sparse=False))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]).set_output(transform='pandas')

# Create a pipeline with the preprocessing steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data using the pipeline
X_train_tr = pipeline.fit_transform(X_train)
X_test_rt = pipeline.transform(X_test)


# Display the transformed data
X_test_rt.head()




Unnamed: 0,num__Age,num__Educ,num__SES,num__MMSE,num__eTIV,num__nWBV,num__ASF,cat__M/F_M
0,0.137193,-0.900671,0.475123,0.52485,-0.721742,-0.128273,0.692903,0.0
1,-1.43753,0.627544,-1.390814,0.52485,-1.952788,1.27329,2.435043,0.0
2,0.054313,0.627544,0.475123,-0.017303,-0.034356,-0.860432,-0.073639,0.0
8,0.137193,1.391651,-0.457846,0.795927,1.102956,-1.25789,-1.111181,1.0
9,-1.68617,-0.136564,-0.457846,0.795927,-0.865468,1.62891,0.870988,0.0


# Use cross validation when building model, pipeline object can be passed through cross_val_score or used in any custom cross validation code