In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [2]:
# Heart-disease dataset, downloaded from Kaggle:
#   https://www.kaggle.com/ronitf/heart-disease-uci
# Label column 'target': 0 = healthy heart, 1 = defective heart.
# The CSV is read from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="heart.csv")

In [3]:
# Separate the label column from the remaining feature columns.
y = data["target"]
x = data.drop(columns=["target"])

# Stratified train and test split

In [4]:
# Hold out 10% of the rows for testing. `stratify=y` keeps the class ratio of
# `target` the same in both parts; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1,
)

In [5]:
# Standardise the features (zero mean, unit variance per column).
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with those same training statistics. Calling fit_transform on the
# test split would re-estimate mean/std from test data, so train and test would
# be scaled differently and test information would leak into preprocessing.
scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(train_x)
scaled_test_x = scaler.transform(test_x)

In [6]:
# Candidate estimators. Exactly one `model = ...` line should be active; the
# accuracies under each candidate were noted by hand from earlier runs and may
# not reproduce exactly (re-run the cells below to refresh them).

# model = LogisticRegression()
# Training accuracy: 0.8419117647058824
# Testing accuracy: 0.9032258064516129

# model = DecisionTreeRegressor()
# NOTE(review): this is a regressor scored with accuracy_score on a
# classification target; a DecisionTreeClassifier is presumably what was
# intended - confirm before re-enabling.
# Training accuracy: 1.0
# Testing accuracy: 0.8387096774193549

# random_state pins the forest's bootstrap/feature sampling so repeated runs
# give the same model (same seed as the train/test split).
model = RandomForestClassifier(max_depth=4, n_estimators=50, random_state=1)
# Training accuracy: 1.0
# Testing accuracy: 0.9032258064516129

# model = KNeighborsClassifier(algorithm="brute", n_jobs=-1)
# NOTE(review): these two figures appear to come from a run with a different
# test_size (the sweep cell further down annotates the same testing value with
# 0.3) - confirm.
# Training accuracy: 0.8584905660377359
# Testing accuracy: 0.8241758241758241

# model = LinearSVC(C=0.0001)
# Training accuracy: 0.8308823529411765
# Testing accuracy: 0.8709677419354839

In [7]:
# Train the selected estimator on the standardised training split. The call is
# the cell's last expression, so the fitted estimator's repr is displayed.
# NOTE(review): the saved outputs of this cell and the two accuracy cells below
# match the LinearSVC(C=0.0001) candidate, not the currently active model -
# re-run after changing the model.
model.fit(X=scaled_train_x, y=train_y)

LinearSVC(C=0.0001)

In [8]:
# Accuracy on the data the model was fitted on (an optimistic figure; compare
# with the test accuracy to spot over-fitting).
# `x_prediction` holds the predicted labels for the training rows.
x_prediction = model.predict(scaled_train_x)
# accuracy_score expects (y_true, y_pred) in that order.
training_data_acc = accuracy_score(train_y, x_prediction)
training_data_acc

0.8308823529411765

In [9]:
# Accuracy on the held-out test split.
# `x_prediction` holds the predicted labels for the test rows.
x_prediction = model.predict(scaled_test_x)
# accuracy_score expects (y_true, y_pred) in that order.
testing_data_acc = accuracy_score(test_y, x_prediction)
testing_data_acc

0.8709677419354839

## Function to estimate best parameters for the model

## Sweep to estimate the best test size for the model

In [None]:
# Candidate estimators for the test-size sweep. Exactly one `model = ...` line
# should be active. The accuracies under each candidate were noted by hand from
# earlier runs and may not reproduce exactly.
model = LogisticRegression()
# Training accuracy: 0.8636363636363636
# Testing accuracy: 0.9032258064516129

# model = DecisionTreeRegressor()
# Training accuracy: 1.0
# Testing accuracy: 0.7741935483870968

# model = RandomForestClassifier(max_depth=4, n_estimators=50)
# Training accuracy: 1.0
# Testing accuracy: 0.9032258064516129

# model = KNeighborsClassifier(algorithm="brute", n_jobs=-1)
# Training accuracy: 0.8636363636363636
# Testing accuracy: 0.8241758241758241 (original note ends with "0.3" -
# presumably the test_size that produced it; TODO confirm)

# model = LinearSVC(C=0.0001)
# Training accuracy: 0.8429752066115702
# Testing accuracy: 0.8709677419354839

# Sweep test_size over 0.1, 0.2, ..., 0.9 and report the fraction that gives
# the highest test accuracy for `model`, together with that accuracy.
#
# NOTE(review): choosing the split by its test accuracy uses the test set for
# selection, so the reported maximum is an optimistic estimate; cross-validation
# would be a sounder way to compare settings.

# Features and labels do not depend on the split, so build them once.
x = data.drop("target", axis=1)
y = data["target"]

scaler = StandardScaler()
max_acc = 0
t_size = 0
for i in range(1, 10):
    test_fraction = i / 10
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=test_fraction, stratify=y, random_state=1
    )
    # Fit the scaler on the training split only, then reuse its statistics for
    # the test split; refitting on test data would scale the two splits
    # differently and leak test information.
    scaled_train_x = scaler.fit_transform(train_x)
    scaled_test_x = scaler.transform(test_x)
    # fit() re-estimates the model from scratch on every call, so reusing the
    # same estimator object across iterations is safe.
    model.fit(scaled_train_x, train_y)
    x_prediction = model.predict(scaled_test_x)
    # accuracy_score expects (y_true, y_pred) in that order.
    testing_data_acc = accuracy_score(test_y, x_prediction)
    # Strict '>' keeps the smallest test fraction when several tie.
    if testing_data_acc > max_acc:
        max_acc = testing_data_acc
        # Store the fraction actually passed as test_size (not the loop index:
        # an integer test_size would mean an absolute number of samples).
        t_size = test_fraction
print(t_size, max_acc)