In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [None]:
# Load the raw dataset into `data`, the frame every later cell reads from.
# NOTE(review): read_csv is called without a file path here, so this cell
# cannot run as written - supply the dataset location (and separator, if it
# is not a comma). TODO confirm the source file for this analysis.
data = pd.read_csv()

# Hypothesis

# Problem

# EDA

## Data Frame check
1. using df.info(), look for the following:
    missing data (decide whether it should be filled with the mean, the median, or another value; compare the fillna methods and select the one that introduces the least bias)
    incorrect data type (i.e., a column looks like a number but the data type in df.info() says it's an object) > this means there are data issues (special characters, text where it shouldn't be, etc.)

2. using df.describe(), look for the following:
    do the ranges make sense?
    are there outliers?
    which variables are categorical (even if there is a number indicator, like 1,2,3)
    which variables are continuous

In [None]:
# Per-column dtype and non-null count: used to spot missing values and
# columns whose dtype does not match their content.
data.info()

In [None]:
# Summary statistics per numeric column: used to sanity-check value ranges
# and to look for outliers.
data.describe()

## Data cleaning

1. check how much of the TOTAL data is missing. If less than 10% of the total data points, then drop the missing data

2. if more than 10%:
Compare filling methods: Average/Median of the column OR Average/median of the column based on a category. Select the method that doesn't cause drastic differences in results
Explain your choice. Make sure that you have checked both methods before finalizing your choice. 

## Data exploration

1. for continuous variables: plot the boxplots with the x-axis as the output variable, and y-axis as the continuous variable
You are looking for a difference in the spread of the data. For example, if output category 0 has a spread of 10-50 but output category 1 has a spread of 25-70, the continuous variable is distributed differently depending on the output, which indicates that this variable is meaningful to the analysis.

2. for categorical variables: use groupby() to check proportions
Check whether the categories of the independent variable occur in different proportions across the dependent variable. For example, when comparing Titanic survival by gender, the categorical variable is Gender and we compare it to the categorical output "Survived" - if we see that the proportion of survivors who were women is higher than the proportion of non-survivors who were women, then we can conclude that gender has something to do with survival.

In [None]:
# Exploratory analysis of the raw numerical features:
# inspect each distribution and look for outliers.

# Numerical columns to inspect
num_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# First pass: one histogram (with a KDE overlay) per numerical feature
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()


# Second pass: one boxplot per numerical feature, split by the 'cardio' target
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=data, x='cardio', y=feature, ax=ax)
    ax.set_title(f'{feature} vs cardio')
    plt.show()

In [None]:
# Exploratory analysis of the raw categorical features (target included)

# Columns treated as categorical
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# 3x3 grid of panels: one bar chart of category counts per column;
# panels beyond the number of columns are removed from the figure.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

for position, panel in enumerate(axs.ravel()):
    if position < len(categorical_columns):
        name = categorical_columns[position]
        level_counts = data[name].value_counts().sort_index()
        level_counts.plot.bar(ax=panel, title=name)
    else:
        fig.delaxes(panel)

fig.tight_layout()
plt.show()

In [None]:
# Categorical predictors (the 'cardio' target is used as the hue instead)
cat_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# One count plot per predictor, with bars split by the target class
for feature in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=data, x=feature, hue='cardio', ax=ax)
    ax.set_title(f'{feature} vs cardio')
    plt.show()

## Data cleaning

In [None]:
# Quartiles and inter-quartile range of 'age'
age_quartiles = data['age'].quantile([0.25, 0.75])
Q1_age = age_quartiles.loc[0.25]
Q3_age = age_quartiles.loc[0.75]
IQR_age = Q3_age - Q1_age

# Outlier fences: 1.5 * IQR below the first / above the third quartile
lower_bound = Q1_age - 1.5 * IQR_age
upper_bound = Q3_age + 1.5 * IQR_age

# Rows whose age falls strictly outside the fences
age_too_low = data['age'].lt(lower_bound)
age_too_high = data['age'].gt(upper_bound)
outliers_age = data.loc[age_too_low | age_too_high]

# Display the outlying rows
outliers_age


In [None]:
# Work on a copy so the original frame is left untouched
data_copy = data.copy()

# Accepted [low, high] range for each variable
height_thresholds = [100, 220]
weight_thresholds = [40, 140]
ap_hi_thresholds = [70, 190]
ap_lo_thresholds = [40, 120]
bmi_thresholds = [15, 45]


def _count_outside(column, limits):
    """Return how many rows of data_copy have `column` strictly outside [low, high]."""
    low, high = limits
    values = data_copy[column]
    return int(((values < low) | (values > high)).sum())


# Number of out-of-range rows per variable
height_outliers = _count_outside('height', height_thresholds)
weight_outliers = _count_outside('weight', weight_thresholds)
ap_hi_outliers = _count_outside('ap_hi', ap_hi_thresholds)
ap_lo_outliers = _count_outside('ap_lo', ap_lo_thresholds)
bmi_outliers = _count_outside('bmi', bmi_thresholds)

# Show the counts as a tuple
height_outliers, weight_outliers, ap_hi_outliers, ap_lo_outliers, bmi_outliers

In [None]:
# Remove out-of-range records in a single pass.
#
# The limits are the ones declared in the outlier-counting cell above
# (height/weight/ap_hi/ap_lo/bmi thresholds) plus the IQR fences for age, so
# the rows dropped here are exactly the rows that were counted there.
# NOTE(review): this cell previously hard-coded 80 and 50 as the lower limits
# for ap_hi / ap_lo, which disagreed with the declared thresholds (70 / 40).
# If 80 / 50 are the intended limits, change the threshold lists instead so
# counting and removal stay in sync.


def _within(column, limits):
    """Boolean mask of rows whose `column` lies in [low, high], bounds included."""
    low, high = limits
    return data[column].between(low, high)


keep_rows = (
    _within('age', (lower_bound, upper_bound))
    & _within('height', height_thresholds)
    & _within('weight', weight_thresholds)
    & _within('ap_hi', ap_hi_thresholds)
    & _within('ap_lo', ap_lo_thresholds)
    & _within('bmi', bmi_thresholds)
)

# .copy() makes the cleaned frame independent of the original, so columns can
# be added to it later without a SettingWithCopyWarning.
data = data[keep_rows].copy()

# Show the cleaned data
data

In [None]:
# Pairwise correlation between all columns of the cleaned frame
corr = data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation matrix of all variables')
plt.show()

# Feature Engineering

1. Generate new features from the data you have. This could include:
binning (i.e., translate a continuous variable into groups like 5-10, 15-20, etc.)
dummy variables (use one hot encoding, or pandas get_dummies() function to convert categorical variables to dummies)
define new metrics (e.g., multiply columns together or create custom categories based on multiple variables)

2. Check the relationship of engineered features to the output variable, using the methods outlined above

In [None]:
# Convert a BMI value into a binary overweight indicator
def over_weight(x):
    """Return 1 when BMI `x` is 25 or above, 0 when it is below 25.

    A value that is neither (e.g. NaN, for which both comparisons are
    False) yields None.
    """
    if x < 25:
        return 0
    if x >= 25:
        return 1
    return None

# Add the binary 'over_weight' feature derived from 'bmi'.
# assign() returns a new frame instead of writing a column into `data` in
# place; `data` is a filtered subset at this point, and in-place column
# assignment on such a subset triggers pandas' SettingWithCopyWarning.
data = data.assign(over_weight=data['bmi'].apply(over_weight))

# Preview the first rows rather than dumping the whole frame
data.head()

# Step ? Build baseline Model

Logistic Regression, Naive Bayes, KNN, SVM, Decision Tree. 

Use the cross validation function to run each model 10 times and calculate an average performance. Remember to use F1 score in the cross validation function.

In [None]:
# Build the preprocessing step.
# The column lists must name columns of this dataset; they mirror the
# categorical / numerical lists used in the EDA cells (the previous values
# were Titanic column names, which do not exist here and fail with a KeyError
# when the preprocessor is applied).
# NOTE(review): engineered features such as 'bmi' / 'over_weight' are not
# included - add them to the appropriate list if they should feed the models.
cat_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
num_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# One-hot encode the categorical columns (categories unseen at fit time are
# encoded as all zeros) and standardise the numerical ones.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers=[('cat', cat_transformer, cat_columns),
                                               ('num', num_transformer, num_columns)])

In [None]:
# Feature matrix: encoded categorical columns followed by scaled numeric ones.
# NOTE(review): the preprocessor is fitted on the full dataset before the
# train/test split, so scaling statistics include the test rows; wrapping the
# preprocessor and model in a Pipeline would avoid this - left unchanged here
# because the later cells consume `x` directly.
x = preprocessor.fit_transform(data[cat_columns + num_columns])

# Target: the 'cardio' column (the column name was previously left empty,
# which raises a KeyError).
y = data['cardio']

In [None]:
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1234,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Instantiate one untuned estimator per candidate algorithm
dt = DecisionTreeClassifier(random_state=123)  # seeded so the tree is repeatable
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=5)  # 5 is the library default, spelled out
nb = GaussianNB()
log = LogisticRegression()

In [None]:
# Baseline comparison: mean F1 over 10 cross-validation folds per model


def _mean_cv_f1(estimator):
    """Return the mean F1 of `estimator` over 10 cross-validation folds of (x, y)."""
    fold_scores = cross_val_score(estimator, x, y, cv=10, scoring='f1')
    return fold_scores.mean()


cv_scores_log = _mean_cv_f1(log)
cv_scores_nb = _mean_cv_f1(nb)
cv_scores_knn = _mean_cv_f1(knn)
cv_scores_svc = _mean_cv_f1(svc)
cv_scores_dt = _mean_cv_f1(dt)

# Report every mean F1 so the best baseline can be picked
score_report = [
    ("Cross-validation scores for Logistic Regression: ", cv_scores_log),
    ("Cross-validation scores for Navie Bayes: ", cv_scores_nb),
    ("Cross-validation scores for KNN: ", cv_scores_knn),
    ("Cross-validation scores for SVC: ", cv_scores_svc),
    ("Cross-validation scores for Decision Tree: ", cv_scores_dt),
]
for caption, mean_f1 in score_report:
    print(caption, mean_f1)

Compare the F1 scores, choose the best model, and start tuning.

# Model Tuning

Decide whether you want to optimize based on Precision or Recall. You will need to explain your choice in relation to the business objective.

## Logistic Model Tuning

In [None]:
# Logistic regression: fit on the training split, then choose a decision
# threshold from the ROC curve of the test-set probabilities.
log.fit(x_train, y_train)

# Class probabilities for the test rows; column 1 is the positive class
pred_l_prob = log.predict_proba(x_test)
positive_prob = pred_l_prob[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, positive_prob)
plt.plot(fpr, tpr)

# Geometric mean of sensitivity (tpr) and specificity (1 - fpr) per threshold
gmeans = np.sqrt(tpr * (1 - fpr))

# Index of the largest G-mean, then the matching threshold and G-mean value
best_idx = np.argmax(gmeans)
print(thresholds[best_idx])
print(gmeans[best_idx])

In [None]:
# Apply the tuned decision threshold to the positive-class probabilities.
# The threshold is taken from the ROC / G-mean arrays computed in the previous
# cell rather than pasted in as a literal, so it stays correct when the data,
# the cleaning rules or the split change.
best_threshold = thresholds[np.argmax(gmeans)]

# 1 when the predicted probability reaches the threshold, otherwise 0
y_pred_new = (pred_l_prob[:, 1] >= best_threshold).astype(int).tolist()

In [None]:
# Score the thresholded logistic-regression predictions on the test split.
# The predictions cover only the test rows, so the reference labels must be
# y_test (not the full 'cardio' column), and scikit-learn metrics take
# (y_true, y_pred) in that order - swapping them exchanges precision and recall.
print('accuracy_score:', accuracy_score(y_test, y_pred_new))
print('f1_score', f1_score(y_test, y_pred_new))
print('precision_score:' , precision_score(y_test, y_pred_new))
print('recall_score:' , recall_score(y_test, y_pred_new))
# ROC AUC is a ranking metric: feed it the positive-class probabilities,
# not the already-thresholded 0/1 labels.
print('roc_auc score', roc_auc_score(y_test, pred_l_prob[:, 1]))

## KNN tuning

In [None]:
# KNN tuning

# Search space. The KNeighborsClassifier parameter is spelled 'weights';
# the previous key 'weight' is not a valid parameter and makes the search fail.
knn_params = {'n_neighbors': range(1, 200),
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']}

# Randomized search: 100 sampled candidates, 5-fold CV, scored by F1.
# random_state fixes which candidates are sampled so re-runs are comparable.
rs_knn = RandomizedSearchCV(knn, knn_params, n_iter=100, cv=5, scoring='f1',
                            random_state=1234)

# Fit the search on the training split and show the winning parameters
rs_knn.fit(x_train, y_train)
rs_knn.best_params_

In [None]:
# Build a KNN classifier from the best parameters found by the search and
# train it on the training split (fit returns the fitted estimator itself).
knn_best = KNeighborsClassifier(**rs_knn.best_params_).fit(x_train, y_train)

# Hard class predictions for the test split
pred_knn_best = knn_best.predict(x_test)

In [None]:
# Score the tuned KNN on the test split.
# Reference labels are y_test (the predictions only cover the test rows) and
# metrics are called as (y_true, y_pred).
y_pred_prob = knn_best.predict_proba(x_test)
print('accuracy_score:', accuracy_score(y_test, pred_knn_best))
print('f1_score', f1_score(y_test, pred_knn_best))
print('precision_score:' , precision_score(y_test, pred_knn_best))
print('recall_score:' , recall_score(y_test, pred_knn_best))
# Binary target: roc_auc_score expects a 1-D score for the positive class,
# i.e. column 1 of predict_proba. (The full probability matrix is only used
# in the multiclass case, together with the multi_class argument.)
print('roc_auc score', roc_auc_score(y_test, y_pred_prob[:, 1]))

## SVC model tuning

In [None]:
# SVC tuning
svc_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1],
}

# The grid holds 3 * 4 * 4 = 48 combinations, fewer than the 100 iterations
# previously requested from RandomizedSearchCV, which therefore evaluated the
# whole grid anyway (with a warning). An exhaustive GridSearchCV states that
# directly. 5-fold CV, scored by F1.
rs_svc = GridSearchCV(svc, svc_params, cv=5, scoring='f1')
rs_svc.fit(x_train, y_train)
print("Best parameters for SVC: ", rs_svc.best_params_)

# Refit an SVC with the best hyperparameters and predict the test split
svc_best = SVC(**rs_svc.best_params_)
svc_best.fit(x_train, y_train)
pre_svc_best = svc_best.predict(x_test)


# Check the scores.
# SVC only offers predict_proba when built with probability=True; the signed
# distance to the decision boundary ranks samples just as well for ROC AUC
# and needs no extra calibration fit.
svc_decision_scores = svc_best.decision_function(x_test)
# Reference labels are y_test throughout, passed as (y_true, y_pred).
print('accuracy_score:', accuracy_score(y_test, pre_svc_best))
print('f1_score', f1_score(y_test, pre_svc_best))
print('precision_score:' , precision_score(y_test, pre_svc_best))
print('recall_score:' , recall_score(y_test, pre_svc_best))
print('roc_auc score', roc_auc_score(y_test, svc_decision_scores))

## Naive Bayes Model and Tuning

In [None]:
# Gaussian Naive Bayes tuning
# Candidate values for var_smoothing: 100 points log-spaced from 1 down to 1e-9
params_nb = {'var_smoothing': np.logspace(0, -9, num=100)}
# Define the model
nb = GaussianNB()
# Randomized search (default number of sampled candidates), 5-fold CV,
# scored by recall; random_state fixes which candidates are sampled.
rs_nb = RandomizedSearchCV(nb, params_nb, cv=5, scoring='recall', random_state=1234)
# Fit the search on the training split
rs_nb.fit(x_train, y_train)
# Label corrected: these are the Naive Bayes parameters, not KNN's
print("Best parameters for Naive Bayes: ", rs_nb.best_params_)

In [None]:
# Refit Gaussian Naive Bayes with the best var_smoothing found by the search
nb_best = GaussianNB(**rs_nb.best_params_)

nb_best.fit(x_train, y_train)
pre_nb_best = nb_best.predict(x_test)


# Check the scores on the test split.
# Reference labels are y_test throughout (f1 was previously compared against
# the full 'cardio' column, whose length differs from the test predictions),
# and metrics are called as (y_true, y_pred).
y_pred_prob = nb_best.predict_proba(x_test)
print('accuracy_score:', accuracy_score(y_test, pre_nb_best))
print('f1_score', f1_score(y_test, pre_nb_best))
print('precision_score:' , precision_score(y_test, pre_nb_best))
print('recall_score:' , recall_score(y_test, pre_nb_best))
# Binary target: pass the positive-class probability column only
print('roc_auc score', roc_auc_score(y_test, y_pred_prob[:, 1]))

## Decision Tree

In [None]:
# Decision tree: hyperparameter tuning and hold-out evaluation

# Search space.
# 'auto' was deprecated and later removed as a max_features value for
# DecisionTreeClassifier (it used to be an alias of 'sqrt'), so None
# ("consider all features") takes its place.
param_dt = {'max_depth': list(range(1, 10)),
            'min_samples_leaf': list(range(1, 10)),
            'max_features': [None, 'sqrt', 'log2']}

# Base estimator. `dt` previously tried to call the existing estimator
# instance with a `kernel` argument (an SVC parameter); build a fresh seeded
# DecisionTreeClassifier instead.
dt = DecisionTreeClassifier(random_state=123)

# Exhaustive search scored by recall --> choose the metric based on the
# precision/recall decision made earlier
gridsearch_dt = GridSearchCV(dt, param_dt, cv=10, scoring='recall')
# Cheaper alternative when resources are limited: 15 sampled candidates, scored by F1
randomsearch_dt = RandomizedSearchCV(dt, param_dt, cv=10, n_iter=15, scoring='f1',
                                     random_state=123)

# Fit both searches on the training split produced by train_test_split
gridsearch_dt.fit(x_train, y_train)
randomsearch_dt.fit(x_train, y_train)

# Best parameters from each search
print("Best parameters for dt - gridsearch: ", gridsearch_dt.best_params_)
print("Best parameters for dt - randomsearch: ", randomsearch_dt.best_params_)

# Build the tuned tree from the grid-search winner
dt_best = DecisionTreeClassifier(random_state=123, **gridsearch_dt.best_params_)

# 10-fold cross-validated ROC AUC on the training split
cv_scores_dt = cross_val_score(dt_best, x_train, y_train, cv=10, scoring='roc_auc')
print("Cross-validation scores for Decision Tree: ", cv_scores_dt)

# cross_val_score fits clones only, so the estimator itself must be fitted
# before it can predict on the test split.
dt_best.fit(x_train, y_train)

# Evaluate on the test set
pred_dt = dt_best.predict(x_test)
y_pred_prob = dt_best.predict_proba(x_test)

print(f'ROC AUC Score: {roc_auc_score(y_test, y_pred_prob[:,1])}')
print("optimal threshold F1: ", f1_score(y_test, pred_dt))
print("optimal threshold precision: ", precision_score(y_test, pred_dt))
print("optimal threshold recall: ", recall_score(y_test, pred_dt))
print("optimal threshold accuracy: ", accuracy_score(y_test, pred_dt))