# Alzheimer's Prediction Model
Exploratory data analysis and modeling.

In [0]:
import datetime
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, to_utc_timestamp
from sklearn.model_selection import train_test_split
import pandas as pd


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LogisticRegression

# Accuracy evaluator, f1, recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# Classification summary
from sklearn.metrics import classification_report



In [0]:
# Read the project settings from the YAML file one directory up and pull out
# the catalog and schema names that the dataset path is built from.
with open("../project_config.yml", mode="r") as file:
    config = yaml.safe_load(file)

catalog_name, schema_name = config["catalog_name"], config["schema_name"]

In [0]:
# Read the raw CSV through Spark (first row is the header, column types are
# inferred) and convert the result to a pandas DataFrame for local analysis.
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv(
    f"/Volumes/{catalog_name}/{schema_name}/data/alzheimers_prediction_dataset.csv",
    header=True,
    inferSchema=True,
).toPandas()

In [0]:
# Summary statistics for every column: include='all' makes describe() report
# on the categorical (non-numeric) columns as well as the numeric ones.
df.describe(include='all')

In [0]:
# Show 10 random rows. The seed is fixed so the same rows are displayed on
# every "Restart & Run All" (42 matches the seed used for the train/test split).
df.sample(10, random_state=42)

In [0]:
# Normalise the column headers into SQL-friendly identifiers:
# lower-case, underscores instead of spaces, typographic apostrophes removed.
df.columns = [
    name.lower().replace(" ", "_").replace("’", "")
    for name in df.columns
]
df.columns

In [0]:
# Inspect the dtype of each column; the feature lists in the next cell are
# selected by dtype.
df.dtypes

In [0]:
# Name of the label column (as it reads after the column renaming above).
# It has to be assigned before it is used to filter the feature list below;
# previously it was assigned at the end of the cell, which raised NameError
# on a fresh kernel.
target = 'alzheimers_diagnosis'

# Numeric predictors: only int32 and float64 columns are selected.
# NOTE(review): presumably these are the only numeric dtypes the Spark load
# produces for this dataset - confirm if new columns are added.
num_features = df.select_dtypes(include=['int32','float64']).columns.tolist()

# Categorical predictors: string (object) columns plus columns that are already
# of dtype "category", so re-running this cell after the cast below still finds
# them. The label column is excluded so it never ends up among the features.
cat_features = [
    column
    for column in df.select_dtypes(include=['object', 'category']).columns
    if column != target
]

# Cast the categorical predictors to pandas' category dtype (mutates df in place).
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype("category")

In [0]:
# Preview the binary encoding of the label: 'Yes' -> 1, anything else -> 0.
df[target].apply(lambda diagnosis: int(diagnosis == 'Yes'))

In [0]:
# Full predictor list: numeric columns first, then categorical ones.
features = [*num_features, *cat_features]

# Feature matrix and binary label vector ('Yes' -> 1, anything else -> 0).
X = df.loc[:, features]
y = df[target].apply(lambda diagnosis: int(diagnosis == 'Yes'))

# Hold out 33% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [0]:
# Display the training labels (0/1-encoded diagnosis) as a quick sanity check.
y_train

In [0]:
# One-hot encode the categorical columns; every other (numeric) column is
# passed through unchanged. handle_unknown='ignore' encodes categories that
# were not seen during fit as all zeros instead of raising.
# sparse_threshold=0 forces a dense array: with the default threshold the
# transformer may return a SciPy sparse matrix when the one-hot block is
# sparse enough, and pd.DataFrame(..., columns=...) cannot wrap that.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the encoder on the training split only, then apply the same fitted
# transformation to the test split so no test information leaks into training.
transformed_df_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())
transformed_df_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [0]:
# Fit a logistic-regression classifier on the encoded training features.
# The training frame is `transformed_df_train`; the previous reference to
# `transformed_df` pointed at a name that is never defined (NameError).
# max_iter is raised far above the default - presumably so the solver
# converges on the unscaled numeric features.
model = LogisticRegression(max_iter=10000)
model.fit(transformed_df_train, y_train)

In [0]:
# Predict class labels for the encoded held-out test features.
y_pred = model.predict(transformed_df_test)


In [0]:
# Per-class precision / recall / F1 on the held-out test split.
# print() is needed because the report is a pre-formatted multi-line string.
print(classification_report(y_test, y_pred))

## Feature Importances

In [0]:

# Horizontal bar chart of the fitted coefficients, one bar per encoded feature.
# Coefficients are sorted ascending and at most the first 80 (the lowest
# values) are plotted.
(
    pd.DataFrame(
        data=model.coef_[0],
        index=model.feature_names_in_,
        columns=['coefficient'],
    )
    .sort_values(by='coefficient', ascending=True)
    .head(80)
    .plot.barh(figsize=(10, 20))
)

There are several types of categorical features:
  - Pure categories: Country, Marital Status.
  - Boolean: Hypertension, Diabetes.
  - Intensity features: Cholesterol level, Physical activity, Stress level. Features of this kind are ordered, so they can be transformed into numeric values.
