In [2]:
import pandas as pd

# Load the data from the provided CSV files
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Display the first few rows of each dataset
train_data.head() 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,29,1,2,130,204,0,2,202,0,0.0,1,0.0,3.0,0
1,57,1,3,150,126,1,0,173,0,0.2,1,1.0,7.0,0
2,69,1,1,160,234,1,2,131,0,0.1,2,1.0,3.0,0
3,65,0,3,160,360,0,2,151,0,0.8,1,0.0,3.0,0
4,52,1,4,108,233,1,0,147,0,0.1,1,3.0,7.0,0


In [3]:
validation_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,60,1,4,125,258,0,2,141,1,2.8,2,1.0,7.0,1
1,43,1,4,115,303,0,0,181,0,1.2,2,0.0,3.0,0
2,57,1,4,165,289,1,2,124,0,1.0,2,3.0,7.0,1
3,58,1,3,140,211,1,2,165,0,0.0,1,0.0,3.0,0
4,34,1,1,118,182,0,2,174,0,0.0,1,0.0,3.0,0


In [4]:
test_data.head()


Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,179,53,1,3,130,246,1,2,173,0,0.0,1,3.0,3.0
1,9,53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0
2,281,47,1,3,130,253,0,0,179,0,0.0,1,0.0,3.0
3,284,61,1,4,148,203,0,0,161,0,0.0,1,1.0,7.0
4,175,57,1,4,152,274,0,0,88,1,1.2,2,1.0,7.0


In [5]:
import sklearn

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [8]:
X_train = train_data.drop(columns='label')
y_train = train_data['label']
X_val = validation_data.drop(columns='label')
y_val = validation_data['label']

from sklearn.impute import SimpleImputer

# Define a pipeline with imputation, scaling, and SVM
svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),  # Impute missing values with the mean
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', gamma='scale', C=1.0)) #eventually try to add linear with poly... poly with degree is best so far
])

# Train the SVM model
svm_pipeline.fit(X_train, y_train)

# Predict on the validation dataset
val_predictions = svm_pipeline.predict(X_val)

# Calculate the accuracy on the validation set
validation_accuracy = accuracy_score(y_val, val_predictions)
print("accuracy", validation_accuracy)


TypeError: unsupported operand type(s) for +: 'SVC' and 'SVC'

In [11]:
# Prepare test data by dropping the 'id' column which is not needed for prediction
X_test = test_data.drop(columns='id')

# Predict on the test dataset using the trained SVM pipeline
test_predictions = svm_pipeline.predict(X_test)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'label': test_predictions
})

# Save the submission file
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False)

submission_file_path, submission.head()


('submission.csv',
     id  label
 0  179      0
 1    9      1
 2  281      0
 3  284      1
 4  175      1)

In [17]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for C and gamma
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto']
}

# Setup GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best parameters and best score from grid search
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params, best_score


Fitting 5 folds for each of 24 candidates, totalling 120 fits


({'svm__C': 1, 'svm__gamma': 0.01}, 0.8304421768707483)

In [19]:
# Update the SVM pipeline with the best parameters found from grid search
optimized_svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', gamma=best_params['svm__gamma'], C=best_params['svm__C']))
])

# Retrain the model with the optimized parameters
optimized_svm_pipeline.fit(X_train, y_train)

# Predict on the validation dataset
optimized_val_predictions = optimized_svm_pipeline.predict(X_val)

# Calculate the accuracy on the validation set
optimized_validation_accuracy = accuracy_score(y_val, optimized_val_predictions)

# Predict on the test dataset using the optimized SVM pipeline
optimized_test_predictions = optimized_svm_pipeline.predict(X_test)

# Prepare the optimized submission DataFrame
optimized_submission = pd.DataFrame({
    'id': test_data['id'],
    'label': optimized_test_predictions
})

# Save the optimized submission file
optimized_submission_file_path = 'hyper_submission.csv'
optimized_submission.to_csv(optimized_submission_file_path, index=False)

optimized_validation_accuracy, optimized_submission_file_path, optimized_submission.head()


(0.9333333333333333,
 'hyper_submission.csv',
     id  label
 0  179      0
 1    9      1
 2  281      0
 3  284      1
 4  175      1)

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Imputers and scalers setup
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# Columns for imputation and scaling
columns_to_impute = ['ca', 'thal']
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Impute missing values in train, validation, and test datasets
train_data[columns_to_impute] = imputer.fit_transform(train_data[columns_to_impute])
validation_data[columns_to_impute] = imputer.transform(validation_data[columns_to_impute])
test_data[columns_to_impute] = imputer.transform(test_data[columns_to_impute])

# Scale specified columns in train, validation, and test datasets
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])
validation_data[columns_to_scale] = scaler.transform(validation_data[columns_to_scale])
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Check if the transformations were applied correctly
train_data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,-2.838504,1,2,-0.125982,-0.864142,0,2,2.31447,0,-0.873573,1,0.0,3.0,0
1,0.241352,1,3,0.974653,-2.483637,1,0,1.021242,0,-0.704854,1,1.0,7.0,0
2,1.561291,1,1,1.52497,-0.24126,1,2,-0.85171,0,-0.789214,2,1.0,3.0,0
3,1.121311,0,3,1.52497,2.374848,0,2,0.040172,0,-0.198698,1,0.0,3.0,0
4,-0.308622,1,4,-1.33668,-0.262023,1,0,-0.138205,0,-0.789214,1,3.0,7.0,0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [24]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = rf_classifier.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)

validation_accuracy

0.8666666666666667